
    fTh`                        S r SSKrSSKJrJrJr  SSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  \R:                  " \5      r " S S\R@                  5      r! " S S\R@                  5      r" " S S\RF                  5      r$ " S S\R@                  5      r% " S S\R@                  5      r& " S S\R@                  5      r' " S S\R@                  5      r( " S S\R@                  5      r) " S S\R@                  5      r* " S S \R@                  5      r+ " S! S"\R@                  5      r, " S# S$\R@                  5      r-\ " S% S&\5      5       r.\ " S' S(\.5      5       r/\ " S) S*\.5      5       r0\" S+S,9 " S- S.\.5      5       r1\ " S/ S0\.5      5       r2\ " S1 S2\.5      5       r3\ " S3 S4\.5      5       r4/ S5Qr5g)6zPyTorch SqueezeBert model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )SqueezeBertConfigc                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )SqueezeBertEmbeddings-   zGConstruct the embeddings from word, position and token_type embeddings.c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                   5      U l        U R%                  S[&        R(                  " UR                  5      R+                  S5      SS9  g )N)padding_idxepsposition_ids)r   F)
persistent)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormhidden_sizelayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     l/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/squeezebert/modeling_squeezebert.pyr#   SqueezeBertEmbeddings.__init__0   s    !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
    c                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nU R                  U5      nXG-   U-   n	U R                  U	5      n	U R                  U	5      n	U	$ )Nr    r   dtypedevice)sizer   r4   zeroslongrA   r(   r*   r,   r-   r2   )
r8   	input_idstoken_type_idsr   inputs_embedsinput_shape
seq_lengthr*   r,   
embeddingss
             r;   forwardSqueezeBertEmbeddings.forward@   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
r=   )r-   r2   r*   r,   r(   )NNNN	__name__
__module____qualname____firstlineno____doc__r#   rK   __static_attributes____classcell__r:   s   @r;   r   r   -   s    Q
  r=   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MatMulWrapperY   z
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
c                 "   > [         TU ]  5         g N)r"   r#   )r8   r:   s    r;   r#   MatMulWrapper.__init___   s    r=   c                 .    [         R                  " X5      $ )a  

:param inputs: two torch tensors :return: matmul of these tensors

Here are the typical dimensions found in BERT (the B is optional) mat1.shape: [B, <optional extra dims>, M, K]
mat2.shape: [B, <optional extra dims>, K, N] output shape: [B, <optional extra dims>, M, N]
)r4   matmul)r8   mat1mat2s      r;   rK   MatMulWrapper.forwardb   s     ||D''r=    rM   rU   s   @r;   rW   rW   Y   s    
( (r=   rW   c                   (    \ rS rSrSrSS jrS rSrg)SqueezeBertLayerNormm   z
This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.

N = batch C = channels W = sequence length
c                 @    [         R                  R                  XUS9  g )N)normalized_shaper   )r   r-   r#   )r8   r.   r   s      r;   r#   SqueezeBertLayerNorm.__init__t   s    
dcJr=   c                     UR                  SSS5      n[        R                  R                  X5      nUR                  SSS5      $ )Nr      r   )permuter   r-   rK   )r8   xs     r;   rK   SqueezeBertLayerNorm.forwardw   s;    IIaALL  )yyAq!!r=   ra   N)g-q=)rN   rO   rP   rQ   rR   r#   rK   rS   ra   r=   r;   rc   rc   m   s    K"r=   rc   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvDropoutLayerNorm}   z0
ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
c                    > [         TU ]  5         [        R                  " XSUS9U l        [        U5      U l        [        R                  " U5      U l        g Nr   in_channelsout_channelskernel_sizegroups)	r"   r#   r   Conv1dconv1drc   	layernormr0   r2   )r8   cincoutrv   dropout_probr:   s        r;   r#   ConvDropoutLayerNorm.__init__   s@    iiCPQZ`a-d3zz,/r=   c                 t    U R                  U5      nU R                  U5      nX2-   nU R                  U5      nU$ rZ   rx   r2   ry   )r8   hidden_statesinput_tensorrk   s       r;   rK   ConvDropoutLayerNorm.forward   s8    KK&LLONN1r=   r   rM   rU   s   @r;   rn   rn   }   s    0 r=   rn   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvActivation   z"
ConvActivation: Conv, Activation
c                 t   > [         TU ]  5         [        R                  " XSUS9U l        [
        U   U l        g rq   )r"   r#   r   rw   rx   r   act)r8   rz   r{   rv   r   r:   s        r;   r#   ConvActivation.__init__   s/    iiCPQZ`a#;r=   c                 F    U R                  U5      nU R                  U5      $ rZ   )rx   r   )r8   rk   outputs      r;   rK   ConvActivation.forward   s    Qxxr=   )r   rx   rM   rU   s   @r;   r   r      s    
   r=   r   c                   D   ^  \ rS rSrSU 4S jjrS rS rS rS rSr	U =r
$ )	SqueezeBertSelfAttention   c                 n  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X"SUS9U l	        [        R                  " X"SUS9U l
        [        R                  " X"SUS9U l        [        R                  " UR                  5      U l        [        R                  " SS9U l        [#        5       U l        [#        5       U l        g	)
z
config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
groups = number of groups to use in conv1d layers
r   zcin (z6) is not a multiple of the number of attention heads ()r   rr   r    dimN)r"   r#   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rw   querykeyvaluer0   attention_probs_dropout_probr2   SoftmaxsoftmaxrW   	matmul_qk
matmul_qkv)r8   r9   rz   q_groupsk_groupsv_groupsr:   s         r;   r#   !SqueezeBertSelfAttention.__init__   s    
 	+++q0uRSYSmSmRnnop  $*#=#= #&s-G-G'G#H !558P8PPYY3aX`a
99AV^_YY3aX`a
zz&"E"EFzzb)&'/r=   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nUR	                  SSSS5      $ )zg
- input: [N, C, W]
- output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
r   r    r   r
   ri   )rB   r   r   viewrj   r8   rk   new_x_shapes      r;   transpose_for_scores-SqueezeBertSelfAttention.transpose_for_scores   s[    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK yyAq!$$r=   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nU$ )zg
- input: [N, C, W]
- output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
r   r    )rB   r   r   r   r   s      r;   transpose_key_for_scores1SqueezeBertSelfAttention.transpose_key_for_scores   sK    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK r=   c                     UR                  SSSS5      R                  5       nUR                  5       S   U R                  UR                  5       S   4nUR                  " U6 nU$ )z-
- input: [N, C1, W, C2]
- output: [N, C, W]
r   r   r
   ri   )rj   
contiguousrB   r   r   r   s      r;   transpose_output)SqueezeBertSelfAttention.transpose_output   sZ    
 IIaAq!,,.vvx{D$6$6DFFK r=   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      n	U R                  Xx5      n
U
[        R                  " U R                  5      -  n
X-   n
U R                  U
5      nU R                  U5      nU R                  X5      nU R                  U5      nSU0nU(       a  XS'   U$ )z
expects hidden_states in [N, C, W] data layout.

The attention_mask data layout is [N, W], and it does not need to be transposed.
context_layerattention_score)r   r   r   r   r   r   mathsqrtr   r   r2   r   r   )r8   r   attention_maskoutput_attentionsmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerr   attention_probsr   results                 r;   rK    SqueezeBertSelfAttention.forward   s     !JJ}5((=1 JJ}5//0AB11/B	//0AB ..@)DIId6N6N,OO): ,,7 ,,7E--m<!=1(7$%r=   )
r   r   r2   r   r   r   r   r   r   r   )r   r   r   )rN   rO   rP   rQ   r#   r   r   r   rK   rS   rT   rU   s   @r;   r   r      s!    *0%! !r=   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertModule   c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR                  n[	        XUR
                  UR                  UR                  S9U l        [        X#UR                  UR                  S9U l        [        X4UR                  UR                  S9U l        [        XEUR"                  UR                  S9U l        g)aP  
- hidden_size = input chans = output chans for Q, K, V (they are all the same ... for now) = output chans for
  the module
- intermediate_size = output chans for intermediate layer
- groups = number of groups for all layers in the BertModule. (eventually we could change the interface to
  allow different groups for different layers)
)r9   rz   r   r   r   )rz   r{   rv   r|   )rz   r{   rv   r   N)r"   r#   r.   intermediate_sizer   r   r   r   	attentionrn   post_attention_groupsr1   post_attentionr   intermediate_groups
hidden_actintermediateoutput_groupsr   )r8   r9   c0c1c2c3r:   s         r;   r#   SqueezeBertModule.__init__   s     	%%1FOOfoo`f`o`o
 3F$@$@vOiOi
 +r6C]C]cictctu*F$8$8vGaGa
r=   c                     U R                  XU5      nUS   nU R                  XQ5      nU R                  U5      nU R                  Xv5      nSU0n	U(       a  US   U	S'   U	$ )Nr   feature_mapr   )r   r   r   r   )
r8   r   r   r   attattention_outputpost_attention_outputintermediate_outputlayer_outputoutput_dicts
             r;   rK   SqueezeBertModule.forward  su    nn]<MN/ $ 3 34D T"//0EF{{#6N$l3-01B-CK)*r=   )r   r   r   r   rN   rO   rP   rQ   r#   rK   rS   rT   rU   s   @r;   r   r      s    
4 r=   r   c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )SqueezeBertEncoderi$  c                    >^ [         TU ]  5         TR                  TR                  :X  d   S5       e[        R
                  " U4S j[        TR                  5       5       5      U l        g )NzIf you want embedding_size != intermediate hidden_size, please insert a Conv1d layer to adjust the number of channels before the first SqueezeBertModule.c              3   :   >#    U  H  n[        T5      v   M     g 7frZ   )r   ).0_r9   s     r;   	<genexpr>.SqueezeBertEncoder.__init__.<locals>.<genexpr>.  s     #gGf!$5f$=$=Gfs   )	r"   r#   r&   r.   r   
ModuleListrangenum_hidden_layerslayersr7   s    `r;   r#   SqueezeBertEncoder.__init__%  sW    $$(:(:: 	
2	
: mm#guVMeMeGf#ggr=   c                     Uc  SnO#UR                  S 5      [        U5      :X  a  SnOSnUSL d   S5       eUR                  SSS5      nU(       a  SOS nU(       a  SOS n	U R                   H]  n
U(       a+  UR                  SSS5      nX4-  nUR                  SSS5      nU
R	                  XU5      nUS   nU(       d  MU  XS	   4-  n	M_     UR                  SSS5      nU(       a  X4-  nU(       d  [        S
 XU	4 5       5      $ [        XU	S9$ )NTFzAhead_mask is not yet supported in the SqueezeBert implementation.r   ri   r   ra   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frZ   ra   )r   vs     r;   r   -SqueezeBertEncoder.forward.<locals>.<genexpr>[  s     h$Vq$Vs   	)last_hidden_stater   
attentions)countlenrj   r   rK   tupler   )r8   r   r   	head_maskr   output_hidden_statesreturn_dicthead_mask_is_all_noneall_hidden_statesall_attentionslayerr   s               r;   rK   SqueezeBertEncoder.forward0  s6    $(!__T"c)n4$(!$)!$,q.qq, &--aA6"6BD0d[[E# - 5 5aA >!%55! - 5 5aA > ==HYZL(7M  0A#B"DD ! &--aA6!11h]~$Vhhh+Yg
 	
r=   )r   )NNFFTr   rU   s   @r;   r   r   $  s$    	h ".
 .
r=   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertPooleria  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rZ   )r"   r#   r   Linearr.   denseTanh
activationr7   s     r;   r#   SqueezeBertPooler.__init__b  s9    YYv1163E3EF
'')r=   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r8   r   first_token_tensorpooled_outputs       r;   rK   SqueezeBertPooler.forwardg  s6     +1a40

#566r=   )r   r   r   rU   s   @r;   r   r   a  s    $
 r=   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"SqueezeBertPredictionHeadTransformip  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g )Nr   )r"   r#   r   r   r.   r   
isinstancer   strr   transform_act_fnr-   r/   r7   s     r;   r#   +SqueezeBertPredictionHeadTransform.__init__q  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr=   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rZ   )r   r  r-   r8   r   s     r;   rK   *SqueezeBertPredictionHeadTransform.forwardz  s4    

=1--m<}5r=   )r-   r   r  r   rU   s   @r;   r  r  p  s    U r=   r  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )SqueezeBertLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)bias)r"   r#   r  	transformr   r   r.   r%   decoder	Parameterr4   rC   r  r7   s     r;   r#   $SqueezeBertLMPredictionHead.__init__  sm    ;FC yy!3!3V5F5FUSLLV->->!?@	 !IIr=   c                 :    U R                   U R                  l         g rZ   )r  r  r8   s    r;   _tie_weights(SqueezeBertLMPredictionHead._tie_weights  s     IIr=   c                 J    U R                  U5      nU R                  U5      nU$ rZ   )r  r  r  s     r;   rK   #SqueezeBertLMPredictionHead.forward  s$    }5]3r=   )r  r  r  )returnN)	rN   rO   rP   rQ   r#   r  rK   rS   rT   rU   s   @r;   r  r    s    && r=   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rZ   )r"   r#   r  predictionsr7   s     r;   r#   SqueezeBertOnlyMLMHead.__init__  s    6v>r=   c                 (    U R                  U5      nU$ rZ   r"  )r8   sequence_outputprediction_scoress      r;   rK   SqueezeBertOnlyMLMHead.forward  s     ,,_=  r=   r%  r   rU   s   @r;   r   r     s    ?! !r=   r   c                   "    \ rS rSr\rSrS rSrg)SqueezeBertPreTrainedModeli  transformerc                    [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       a%  UR                  R
                  R                  5         gg)zInitialize the weightsg        )meanstdNg      ?)r	  r   r   rw   weightdatanormal_r9   initializer_ranger  zero_r$   r   r-   fill_r  )r8   modules     r;   _init_weights(SqueezeBertPreTrainedModel._init_weights  s<   fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) ;<<KK""$ =r=   ra   N)	rN   rO   rP   rQ   r   config_classbase_model_prefixr6  rS   ra   r=   r;   r*  r*    s    $L%%r=   r*  c                   D  ^  \ rS rSrU 4S jrS rS rS r\         SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )SqueezeBertModeli  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  5         g rZ   )	r"   r#   r   rJ   r   encoderr   pooler	post_initr7   s     r;   r#   SqueezeBertModel.__init__  s@     /7)&1'/ 	r=   c                 .    U R                   R                  $ rZ   rJ   r(   r  s    r;   get_input_embeddings%SqueezeBertModel.get_input_embeddings  s    ...r=   c                 $    XR                   l        g rZ   rB  r8   new_embeddingss     r;   set_input_embeddings%SqueezeBertModel.set_input_embeddings  s    *8'r=   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr=  r   r   prune_heads)r8   heads_to_pruner   headss       r;   _prune_headsSqueezeBertModel._prune_heads  s<    
 +002LELLu%//;;EB 3r=   rE   r   rF   r   r   rG   r   r   r   r  c
           	      H   Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U
[        R                  US9nU R                  X*5      nU R                  XPR                   R                  5      nU R                  XX6S9nU R!                  UUUUUU	S9nUS   nU R#                  U5      nU	(       d
  UU4US	S  -   $ [%        UUUR&                  UR(                  S
9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embeds)rA   r?   )rE   r   rF   rG   )r   r   r   r   r   r   r   r   )r   pooler_outputr   r   )r9   r   r   use_return_dictr   %warn_if_padding_and_no_attention_maskrB   rA   r4   onesrC   rD   get_extended_attention_maskget_head_maskr   rJ   r=  r>  r   r   r   )r8   rE   r   rF   r   r   rG   r   r   r   rH   rA   extended_attention_maskembedding_outputencoder_outputsr&  r  s                    r;   rK   SqueezeBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU%.%:!!@T@T!"ZZCN!"[[EJJvVN"&"B"B>"_ &&y++2O2OP	??> + 
 ,,*2/!5# ' 
 *!,O4#]3oab6III)-')77&11	
 	
r=   )rJ   r=  r>  )	NNNNNNNNN)rN   rO   rP   rQ   r#   rC  rH  rO  r   r   r4   TensorFloatTensorboolr   r   r   rK   rS   rT   rU   s   @r;   r;  r;    s   /9C  -11515/3,059,0/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
   1 12A
 $D>A
 'tnA
 d^A
 
u00	1A
 A
r=   r;  c                   f  ^  \ rS rSrSS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )SqueezeBertForMaskedLMi  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rZ   )r"   r#   r;  r+  r   clsr?  r7   s     r;   r#   SqueezeBertForMaskedLM.__init__  s5     +F3)&1 	r=   c                 B    U R                   R                  R                  $ rZ   )rb  r"  r  r  s    r;   get_output_embeddings,SqueezeBertForMaskedLM.get_output_embeddings&  s    xx##+++r=   c                     XR                   R                  l        UR                  U R                   R                  l        g rZ   )rb  r"  r  r  rF  s     r;   set_output_embeddings,SqueezeBertForMaskedLM.set_output_embeddings)  s*    '5$$2$7$7!r=   rE   r   rF   r   r   rG   labelsr   r   r   r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   rF   r   r   rG   r   r   r   r   r    ri   losslogitsr   r   )
r9   rS  r+  rb  r   r   r%   r   r   r   )r8   rE   r   rF   r   r   rG   rj  r   r   r   outputsr&  r'  masked_lm_lossloss_fctr   s                    r;   rK   SqueezeBertForMaskedLM.forward-  s    ( &1%<k$++B]B]""))%'/!5# # 

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r=   )rb  r+  
NNNNNNNNNN)rN   rO   rP   rQ   _tied_weights_keysr#   re  rh  r   r   r4   r\  r^  r   r   r   rK   rS   rT   rU   s   @r;   r`  r`    s   :<Z[,8  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
r=   r`  z
    SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )$SqueezeBertForSequenceClassificationic  c                 P  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  U R                  R                  5      U l        U R                  5         g rZ   )r"   r#   
num_labelsr9   r;  r+  r   r0   r1   r2   r   r.   
classifierr?  r7   s     r;   r#   -SqueezeBertForSequenceClassification.__init__j  ss      +++F3zz&"<"<=))F$6$68N8NO 	r=   rE   r   rF   r   r   rG   rj  r   r   r   r  c                 R   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrl  r   
regressionsingle_label_classificationmulti_label_classificationr    ri   rm  )r9   rS  r+  r2   r{  problem_typerz  r@   r4   rD   r   r	   squeezer   r   r   r   r   r   )r8   rE   r   rF   r   r   rG   rj  r   r   r   rp  r  ro  rn  rr  r   s                    r;   rK   ,SqueezeBertForSequenceClassification.forwardv  s   ( &1%<k$++B]B]""))%'/!5# # 

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r=   )r{  r9   r2   rz  r+  rt  )rN   rO   rP   rQ   r#   r   r   r4   r\  r^  r   r   r   rK   rS   rT   rU   s   @r;   rx  rx  c  s   
  -11515/3,004)-,0/3&*F
ELL)F
 !.F
 !.	F

 u||,F
 ELL)F
  -F
 &F
 $D>F
 'tnF
 d^F
 
u..	/F
 F
r=   rx  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )SqueezeBertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r"   r#   r;  r+  r   r0   r1   r2   r   r.   r{  r?  r7   s     r;   r#   %SqueezeBertForMultipleChoice.__init__  sW     +F3zz&"<"<=))F$6$6: 	r=   rE   r   rF   r   r   rG   rj  r   r   r   r  c                 Z   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
    *input_ids* above)
Nr   r    rl  ri   rm  )r9   rS  shaper   rB   r+  r2   r{  r   r   r   r   )r8   rE   r   rF   r   r   rG   rj  r   r   r   num_choicesrp  r  ro  reshaped_logitsrn  rr  r   s                      r;   rK   $SqueezeBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ""))%'/!5# # 

  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r=   )r{  r2   r+  rt  )rN   rO   rP   rQ   r#   r   r   r4   r\  r^  r   r   r   rK   rS   rT   rU   s   @r;   r  r    s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
r=   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )!SqueezeBertForTokenClassificationi(  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rZ   )r"   r#   rz  r;  r+  r   r0   r1   r2   r   r.   r{  r?  r7   s     r;   r#   *SqueezeBertForTokenClassification.__init__*  sj      +++F3zz&"<"<=))F$6$68I8IJ 	r=   rE   r   rF   r   r   rG   rj  r   r   r   r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nrl  r   r    ri   rm  )r9   rS  r+  r2   r{  r   r   rz  r   r   r   )r8   rE   r   rF   r   r   rG   rj  r   r   r   rp  r&  ro  rn  rr  r   s                    r;   rK   )SqueezeBertForTokenClassification.forward5  s    $ &1%<k$++B]B]""))%'/!5# # 

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r=   )r{  r2   rz  r+  rt  )rN   rO   rP   rQ   r#   r   r   r4   r\  r^  r   r   r   rK   rS   rT   rU   s   @r;   r  r  (  s    	  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
r=   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )SqueezeBertForQuestionAnsweringik  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rZ   )
r"   r#   rz  r;  r+  r   r   r.   
qa_outputsr?  r7   s     r;   r#   (SqueezeBertForQuestionAnswering.__init__m  sT      +++F3))F$6$68I8IJ 	r=   rE   r   rF   r   r   rG   start_positionsend_positionsr   r   r   r  c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nrl  r   r   r    r   )ignore_indexri   )rn  start_logits
end_logitsr   r   )r9   rS  r+  r  splitr  r   r   rB   clampr   r   r   r   )r8   rE   r   rF   r   r   rG   r  r  r   r   r   rp  r&  ro  r  r  
total_lossignored_indexrr  
start_lossend_lossr   s                          r;   rK   'SqueezeBertForQuestionAnswering.forwardw  s    &1%<k$++B]B]""))%'/!5# # 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r=   )rz  r  r+  )NNNNNNNNNNN)rN   rO   rP   rQ   r#   r   r   r4   r\  r^  r   r   r   rK   rS   rT   rU   s   @r;   r  r  k  s     -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r=   r  )r`  r  r  rx  r  r;  r   r*  )6rR   r   typingr   r   r   r4   r   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_squeezebertr   
get_loggerrN   loggerModuler   rW   r-   rc   rn   r   r   r   r   r   r  r  r   r*  r;  r`  rx  r  r  r  __all__ra   r=   r;   <module>r     s   !  ) )   A A !   . 9 
		H	%)BII )X(BII (("2<< " 299 ( RYY  Wryy Wt'		 'T:
 :
z		  "")) .!RYY ! % % %. [
1 [
 [
| F
7 F
 F
R T
+E T
T
n d
#= d
 d
N ?
(B ?
 ?
D J
&@ J
 J
Z	r=   