
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ALBERT model."""

import math
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    is_torch_greater_or_equal_than_2_2,
    prune_linear_layer,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-13:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model


class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Setting the token_type_ids to the registered buffer (all zeros) helps users when tracing the model
        # without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class AlbertAttention(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def prune_heads(self, heads: List[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in AlbertModel forward)
            attention_scores = attention_scores + attention_mask

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(2, 1).flatten(2)

        projected_context_layer = self.dense(context_layer)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)


class AlbertSdpaAttention(AlbertAttention):
    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob
        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        if self.position_embedding_type != "absolute" or output_attentions:
            logger.warning(
                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "non-absolute `position_embedding_type` or `output_attentions=True` . Falling back to the eager "
                "attention implementation, but specifying the eager implementation will be required from Transformers "
                "version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(hidden_states, attention_mask, head_mask, output_attentions)

        batch_size, seq_len, _ = hidden_states.size()
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
            query_layer = query_layer.contiguous()
            key_layer = key_layer.contiguous()
            value_layer = value_layer.contiguous()

        attention_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_layer,
            key=key_layer,
            value=value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=False,
        )

        attention_output = attention_output.transpose(1, 2)
        attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)

        projected_context_layer = self.dense(attention_output)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer,)


ALBERT_ATTENTION_CLASSES = {
    "eager": AlbertAttention,
    "sdpa": AlbertSdpaAttention,
}


class AlbertLayer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        ffn_output = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output[0],
        )
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])

        return (hidden_states,) + attention_output[1:]  # add attentions if we output them

    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        ffn_output = self.ffn(attention_output)
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)
        return ffn_output


class AlbertLayerGroup(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.albert_layers):
            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
            hidden_states = layer_output[0]

            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)


class AlbertTransformer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, Tuple]:
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask

        for i in range(self.config.num_hidden_layers):
            # Number of layers in a hidden group
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)

            # Index of the hidden group
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states,
                attention_mask,
                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                output_attentions,
                output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
    config_class = AlbertConfig
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, AlbertMLMHead):
            module.bias.data.zero_()


@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`AlbertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    sop_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config_class = AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, Tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
            self.bias = self.decoder.bias


class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits


@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, Tuple]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]
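
# Illustrative sketch (not part of the upstream module): ALBERT's parameter
# sharing means `AlbertTransformer` holds only `num_hidden_groups` physical
# `AlbertLayerGroup` modules and re-applies them across `num_hidden_layers`
# forward passes via the `albert_layer_groups[group_idx]` indexing above. A
# minimal check of that weight sharing, assuming a local `transformers`
# install with the classes defined in this file:
#
#     from transformers import AlbertConfig
#     from transformers.models.albert.modeling_albert import AlbertModel
#
#     config = AlbertConfig(num_hidden_layers=12, num_hidden_groups=1, inner_group_num=1)
#     model = AlbertModel(config)
#     # One physical layer group whose weights are reused for all 12 layers.
#     assert len(model.encoder.albert_layer_groups) == 1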