
    fThF                     8   S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  \R>                  " \ 5      r! " S S\RD                  5      r# " S S\RD                  5      r$ " S S\RD                  5      r%S\$0r& " S S\RD                  5      r' " S S\RD                  5      r( " S S\RD                  5      r) " S S\RD                  5      r* " S S\RD                  5      r+\ " S  S!\5      5       r,\ " S" S#\,5      5       r- " S$ S%\RD                  5      r. " S& S'\RD                  5      r/\ " S( S)\,5      5       r0\ " S* S+\5      5       r1\" S,S-9 " S. S/\,5      5       r2/ S0Qr3g)1zPyTorch Splinter model.    N)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsModelOutputQuestionAnsweringModelOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )SplinterConfigc                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\\
   S	\4S
 jjrSrU =r$ )SplinterEmbeddings(   zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  [+        USS5      U l        g )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr!   selfconfig	__class__s     f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/splinter/modeling_splinter.pyr$   SplinterEmbeddings.__init__+   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$    	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthreturnc                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nXH-   n	U R                  S:X  a  U R                  U5      n
X-  n	U R                  U	5      n	U R                  U	5      n	U	$ )Nr   r   dtypedevicer"   )sizer   r4   zeroslongrG   r)   r-   r!   r+   r.   r2   )r9   r?   r@   r   rA   rB   input_shape
seq_lengthr-   
embeddingsr+   s              r<   forwardSplinterEmbeddings.forward<   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r>   )r.   r2   r!   r+   r-   r)   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r$   r   r4   
LongTensorFloatTensorintr   rN   __static_attributes____classcell__r;   s   @r<   r   r   (   s    Q^& 1559375901E,,- !!1!12 u//0	
   1 12 !) 
 r>   r   c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )SplinterSelfAttention_   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r!   r"   relative_keyrelative_key_query   r   )r#   r$   r'   num_attention_headshasattr
ValueErrorrW   attention_head_sizeall_head_sizer   Linearquerykeyvaluer0   attention_probs_dropout_probr2   r7   r!   r*   r%   distance_embedding
is_decoderr9   r:   r!   r;   s      r<   r$   SplinterSelfAttention.__init__`   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r>   xrC   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr   r   rc   r   r
   )rH   rd   rg   viewpermute)r9   rr   new_x_shapes      r<   transpose_for_scores*SplinterSelfAttention.transpose_for_scoresz   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r>   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r   rc   dimr   ra   rb   rE   )rF   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) rj   rw   rk   rl   r4   catro   matmul	transposer!   shapetensorrJ   rG   rt   r5   rn   r*   torF   einsummathsqrtrg   r   
functionalsoftmaxr2   ru   
contiguousrH   rh   )r9   ry   rz   r{   r|   r}   r~   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r<   rN   SplinterSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr>   )rh   rg   rn   r2   ro   rk   r*   rd   r!   rj   rl   NNNNNNF)rP   rQ   rR   rS   r$   r4   Tensorrw   r   rV   r   boolrN   rX   rY   rZ   s   @r<   r\   r\   _   s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c cr>   r\   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )SplinterSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r#   r$   r   ri   r'   denser.   r/   r0   r1   r2   r8   s     r<   r$   SplinterSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r>   ry   input_tensorrC   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r2   r.   r9   ry   r   s      r<   rN   SplinterSelfOutput.forward   5    

=1]3}'CDr>   r.   r   r2   
rP   rQ   rR   rS   r$   r4   r   rN   rX   rY   rZ   s   @r<   r   r      6    >U\\  RWR^R^  r>   r   eagerc                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )SplinterAttention   c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nr!   )	r#   r$   SPLINTER_SELF_ATTENTION_CLASSES_attn_implementationr9   r   outputsetpruned_headsrp   s      r<   r$   SplinterAttention.__init__   s@    3F4O4OP
	 )0Er>   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r9   rd   rg   r   r   rj   rk   rl   r   r   rh   union)r9   headsindexs      r<   prune_headsSplinterAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r>   ry   rz   r{   r|   r}   r~   r   rC   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   )r9   r   )r9   ry   rz   r{   r|   r}   r~   r   self_outputsattention_outputr   s              r<   rN   SplinterAttention.forward  sW     yy!"
  ;;|AF#%QR(88r>   )r   r   r9   r   r   )rP   rQ   rR   rS   r$   r   r4   r   r   rV   r   r   rN   rX   rY   rZ   s   @r<   r   r      s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r>   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SplinterIntermediatei.  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r#   r$   r   ri   r'   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr8   s     r<   r$   SplinterIntermediate.__init__/  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r>   ry   rC   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )r9   ry   s     r<   rN   SplinterIntermediate.forward7  s&    

=100?r>   r   r   rZ   s   @r<   r   r   .  s(    9U\\ ell  r>   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )SplinterOutputi>  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r#   r$   r   ri   r   r'   r   r.   r/   r0   r1   r2   r8   s     r<   r$   SplinterOutput.__init__?  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r>   ry   r   rC   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r<   rN   SplinterOutput.forwardE  r   r>   r   r   rZ   s   @r<   r   r   >  r   r>   r   c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )SplinterLayeriM  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is addedr"   r   )r#   r$   chunk_size_feed_forwardseq_len_dimr   	attentionro   add_cross_attentionrf   crossattentionr   intermediater   r   r8   s     r<   r$   SplinterLayer.__init__N  s    '-'E'E$*62 ++#)#=#= ##?? D6)g!hii"3FT^"_D08$V,r>   ry   rz   r{   r|   r}   r~   r   rC   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nrc   )r   r~   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   ro   re   rf   r   r   feed_forward_chunkr   r   )r9   ry   rz   r{   r|   r}   r~   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r<   rN   SplinterLayer.forward\  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr>   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )r9   r   intermediate_outputr   s       r<   r    SplinterLayer.feed_forward_chunk  s)    "//0@A{{#6Ir>   )r   r   r   r   r   ro   r   r   r   )rP   rQ   rR   rS   r$   r4   r   r   rV   r   r   rN   r   rX   rY   rZ   s   @r<   r   r   M  s    -" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r>   r   c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )SplinterEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r#   r$   r:   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r9   r:   _r;   s      r<   r$   SplinterEncoder.__init__  sR    ]]5IaIaCb#cCbaM&$9Cb#cd
&+# $ds   A&ry   rz   r{   r|   r}   past_key_valuesr   r   output_hidden_statesreturn_dictrC   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   rc   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  ).0vs     r<   	<genexpr>*SplinterEncoder.forward.<locals>.<genexpr>  s"      
A  s   	last_hidden_stater  ry   
attentionscross_attentions)r:   r   r   trainingloggerwarning_once	enumerater   _gradient_checkpointing_func__call__tupler   )r9   ry   rz   r{   r|   r}   r  r   r   r  r  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr~   layer_outputss                       r<   rN   SplinterEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r>   )r:   r   r   )	NNNNNNFFT)rP   rQ   rR   rS   r$   r4   r   r   rV   r   r   r   r   rN   rX   rY   rZ   s   @r<   r   r     s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
r>   r   c                   &    \ rS rSr\rSrSrS rSr	g)SplinterPreTrainedModeli  splinterTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)r   r   ri   weightdatanormal_r:   initializer_rangebiaszero_r%   r   r.   fill_)r9   modules     r<   _init_weights%SplinterPreTrainedModel._init_weights  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r>   r  N)
rP   rQ   rR   rS   r   config_classbase_model_prefixsupports_gradient_checkpointingr/  rX   r  r>   r<   r"  r"    s    !L"&*#*r>   r"  c                      ^  \ rS rSrSrU 4S jrS rS rS r\	             SS\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\\R                        S\
\   S\
\   S\
\   S\
\   S\\\4   4S jj5       rSrU =r$ )SplinterModeli  a  
The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r#   r$   r:   r   rM   r   encoder	post_initr8   s     r<   r$   SplinterModel.__init__   s9     ,V4&v. 	r>   c                 .    U R                   R                  $ r   rM   r)   )r9   s    r<   get_input_embeddings"SplinterModel.get_input_embeddings*  s    ...r>   c                 $    XR                   l        g r   r;  )r9   rl   s     r<   set_input_embeddings"SplinterModel.set_input_embeddings-  s    */'r>   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr7  r   r   r   )r9   heads_to_pruner   r   s       r<   _prune_headsSplinterModel._prune_heads0  s<    
 +002LELLu%//;;EB 3r>   r?   rz   r@   r   r{   rA   r|   r}   r  r   r   r  r  rC   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUc  [        R                  " UUU-   4US9nUc$  [        R                  " U[        R                  US	9nU R                  X.5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R!                  U5      nOSnU R#                  XPR                   R$                  5      nU R'                  UUUUUS
9nU R)                  UUUUUU	U
UUUS9
nUS   nU(       d	  U4USS -   $ [+        UUR,                  UR.                  UR0                  UR2                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   rc   )rG   rE   )r?   r   r@   rA   rB   )	rz   r{   r|   r}   r  r   r   r  r  r   r  )r:   r   r  use_return_dictro   r   rf   %warn_if_padding_and_no_attention_maskrH   rG   r   r4   onesrI   rJ   get_extended_attention_maskinvert_attention_maskget_head_maskr   rM   r7  r   r  ry   r  r  )r9   r?   rz   r@   r   r{   rA   r|   r}   r  r   r   r  r  rK   
batch_sizerL   rG   rB   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputs                               r<   rN   SplinterModel.forward8  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,#%(;;;8-+;;)77&11,==
 	
r>   )r:   rM   r7  )NNNNNNNNNNNNN)rP   rQ   rR   rS   rT   r$   r<  r?  rD  r   r   r4   r   r   rV   r   r   r   r   rN   rX   rY   rZ   s   @r<   r5  r5    sg   /0C  -11515/3,0048<9==A$(,0/3&*s
ELL)s
 !.s
 !.	s

 u||,s
 ELL)s
  -s
  (5s
 !) 6s
 "$u'8'8"9:s
 D>s
 $D>s
 'tns
 d^s
 
u??	@s
 s
r>   r5  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )SplinterFullyConnectedLayeri  c                    > [         TU ]  5         Xl        X l        [        R
                  " U R                  U R                  5      U l        [        U   U l        [        R                  " U R                  5      U l	        g r   )
r#   r$   	input_dim
output_dimr   ri   r   r   act_fnr.   )r9   rZ  r[  r   r;   s       r<   r$   $SplinterFullyConnectedLayer.__init__  sR    "$YYt~~t?
Z(doo6r>   inputsrC   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r\  r.   )r9   r^  ry   s      r<   rN   #SplinterFullyConnectedLayer.forward  s2    

6*M2}5r>   )r.   r\  r   rZ  r[  )gelur   rZ   s   @r<   rX  rX    s(    7ell u||  r>   rX  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )QuestionAwareSpanSelectionHeadi  z^
Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

c                   > [         TU ]  5         [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        [        R                  " UR                  UR                  SS9U l
        [        R                  " UR                  UR                  SS9U l        g )NF)r+  )r#   r$   rX  r'   query_start_transformquery_end_transformstart_transformend_transformr   ri   start_classifierend_classifierr8   s     r<   r$   'QuestionAwareSpanSelectionHead.__init__  s    %@ASASU[UgUg%h"#>v?Q?QSYSeSe#f :6;M;MvOaOab89K9KVM_M_` "		&*<*<f>P>PW\ ] ii(:(:F<N<NUZ[r>   c                    UR                  5       u    p4UR                  S5      R                  SSU5      n[        R                  " USUS9nU R                  U5      nU R                  U5      nU R                  U5      n	U R                  U5      n
U R                  U5      nU	R                  SSS5      n	[        R                  " X5      nU R                  U5      nU
R                  SSS5      n
[        R                  " X5      nX4$ )Nr   r   )r   r   r   rc   )rH   	unsqueezerepeatr4   gatherre  rf  rg  rh  ri  ru   r   rj  )r9   r^  	positionsr  r   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsry   start_logits
end_logitss                 r<   rN   &QuestionAwareSpanSelectionHead.forward  s    KKM	1##B'..q!S9V%@55mD11-@))&1
%%f---.>?''1a0
||M>++N;##Aq!,\\-:
''r>   )rj  rh  rf  re  ri  rg  )	rP   rQ   rR   rS   rT   r$   rN   rX   rY   rZ   s   @r<   rc  rc    s    
	\( (r>   rc  c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\R                     S\\\4   4S jj5       rSrU =r$ )SplinterForQuestionAnsweringi  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  U l        U R                  5         g r   r#   r$   r5  r#  rc  splinter_qassquestion_token_idr8  r8   s     r<   r$   %SplinterForQuestionAnswering.__init__  C     %f-;FC!'!9!9 	r>   r?   rz   r@   r   r{   rA   start_positionsend_positionsr   r  r  question_positionsrC   c                    Ub  UOU R                   R                  nSnUc  UbB  [        R                  " [        R                  " XR
                  5      R                  5       SS9nOH[        R                  " UR                  S5      [        R                  UR                  UR                  S9nUR                  S5      nSnU R                  UUUUUUU	U
US9	nUS   nU R                  UU5      u  nnU(       a"  UR                  S	5      UR                  S	5      nnUbf  US	U-
  [        R                   " UR"                  5      R$                  -  -   nUS	U-
  [        R                   " UR"                  5      R$                  -  -   nSnUb  Ub  ['        UR                  5       5      S	:  a  UR                  S5      n['        UR                  5       5      S	:  a  UR                  S5      nUR                  S	5      nUR)                  SU5        UR)                  SU5        [+        US
9nU" UU5      nU" UU5      nUU-   S-  nU(       d  UU4US	S -   nUb  U4U-   $ U$ [-        UUUUR.                  UR0                  S9$ )aI  
token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
    The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
    num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
    the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
    sequence_length)`.
NFr   r   r   )rF   layoutrG   Trz   r@   r   r{   rA   r   r  r  r   ignore_indexrc   lossrv  rw  ry   r  )r:   rG  r4   argmaxeqr~  rW   rI   rH   rJ   r  rG   rm  r#  r}  squeezefinforF   minr   clamp_r	   r   ry   r  )r9   r?   rz   r@   r   r{   rA   r  r  r   r  r  r  question_positions_were_none"question_position_for_each_exampler   rU  rv  rw  
total_lossignored_indexloss_fct
start_lossend_lossr   s                            r<   rN   $SplinterForQuestionAnswering.forward  s   H &1%<k$++B]B]',$%$5:\\XXi)?)?@EEGR62 6;[[!&&q)MDXDXanauau62 "D!M!Mb!Q+/(--))%'/!5#   

 "!*#'#5#5oGY#Z j''3';';A'>
@R@RST@U*L%'1~+=\M_M_A`AdAd*ddL#q>'9U[[IYIY=Z=^=^&^^J
&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r>   r~  r#  r}  NNNNNNNNNNNN)rP   rQ   rR   rS   r$   r   r   r4   r   rU   r   r   r   r   rN   rX   rY   rZ   s   @r<   rz  rz    s?     -11515/3,0046:48,0/3&*9=c
ELL)c
 !.c
 !.	c

 u||,c
 ELL)c
  -c
 "%"2"23c
   0 01c
 $D>c
 'tnc
 d^c
 %U%5%56c
 
u22	3c
 c
r>   rz  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
SplinterForPreTrainingOutputiZ  a  
Class for outputs of Splinter as a span selection model.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-end scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nr  rv  rw  ry   r  r  )rP   rQ   rR   rS   rT   r  r   r4   rV   __annotations__rv  rw  ry   r   r  rX   r  r>   r<   r  r  Z  s|    . )-D(5$$
%,04L(5,,-4.2J**+28<M8E%"3"345<59Ju00129r>   r  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    )custom_introc                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\R                     S\\\4   4S jj5       rS\R                  S\R                  4S jrSrU =r$ )SplinterForPreTrainingiz  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  U l        U R                  5         g r   r|  r8   s     r<   r$   SplinterForPreTraining.__init__  r  r>   r?   rz   r@   r   r{   rA   r  r  r   r  r  r  rC   c                 r   Ub  UOU R                   R                  nUc  Ub  Ub  [        S5      eUc  Uc  [        S5      eUc  U R                  U5      nU R	                  UUUUUUU	U
US9	nUS   nUR                  5       u  nnnU R                  X5      u  nnUR                  S5      nUb  UR                  S5      R                  UUU5      nUSU-
  [        R                  " UR                  5      R                  -  -   nUSU-
  [        R                  " UR                  5      R                  -  -   nSnUb  Ub  UR                  S[        SUS-
  5      5        UR                  S[        SUS-
  5      5        [        U R                   R                   S9nU" UR#                  UU-  U5      UR#                  UU-  5      5      nU" UR#                  UU-  U5      UR#                  UU-  5      5      nUU-   S-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [%        UUUUR&                  UR(                  S	9$ )
a
  
input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
    Labels for position (index) of the start of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
    Labels for position (index) of the end of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
    The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
    num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
    the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
    sequence_length)`.
NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedr  r   r   r  rc   r  )r:   rG  	TypeError_prepare_question_positionsr#  rH   r}  rm  r6   r4   r  rF   r  r  maxr	   r(   rt   r  ry   r  )r9   r?   rz   r@   r   r{   rA   r  r  r   r  r  r  r   rU  rM  sequence_lengthr   rv  rw  num_questions attention_mask_for_each_questionr  r  r  r  r   s                              r<   rN   SplinterForPreTraining.forward  s   n &1%<k$++B]B]%/*E-Jcabb'I,=\]]'!%!A!A)!L--))%'/!5#   

 "!*+:+?+?+A(
OS#'#5#5o#Z j*//2%/=/G/G/J/Q/QM?0, (1/O+OSXS^S^_k_q_qSrSvSv*vvL#q+K'Ku{{[e[k[kOlOpOp&ppJ
&=+D""1c!_q-@&AB  C?Q+>$?@ (T[[5M5MNH!!!*}"<oN$$Z-%?@J  
] :OL"":#=>H %x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r>   c                    [         R                  " XR                  R                  :H  5      u  p#[         R                  " U5      n[         R
                  " UR                  S5      UR                  5       4U R                  R                  [         R                  UR                  S9n[         R                  " U Vs/ s H  n[         R                  " U5      PM     sn5      nX5X'4'   U$ s  snf )Nr   rE   )r4   wherer:   r~  bincountfullrH   r  r(   rJ   rG   r   r5   )r9   r?   rowsflat_positionsr  rp  ncolss           r<   r  2SplinterForPreTraining._prepare_question_positions	  s    ${{98U8U+UVt,JJ^^A 1 1 34KK$$**##	
	 yy=A=a%,,q/=AB .$* Bs   ; C)r  r  )rP   rQ   rR   rS   r$   r   r   r4   r   rU   r   r   r   r  rN   r  rX   rY   rZ   s   @r<   r  r  z  s`     -11515/3,0046:48,0/3&*9=z
ELL)z
 !.z
 !.	z

 u||,z
 ELL)z
  -z
 "%"2"23z
   0 01z
 $D>z
 'tnz
 d^z
 %U%5%56z
 
u22	3z
 z
xU\\ ell  r>   r  )rz  r  r   r5  r"  )4rT   r   dataclassesr   typingr   r   r   r   r4   torch.utils.checkpointr   torch.nnr	   activationsr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_splinterr   
get_loggerrP   r  Moduler   r\   r   r   r   r   r   r   r   r"  r5  rX  rc  rz  r  r  __all__r  r>   r<   <module>r     s     ! / /    % ! t t - l l 3 
		H	%3 3nCBII CN  "# 0		 0h299  RYY SBII SnZ
bii Z
z *o * *, S
+ S
 S
l")) $#(RYY #(L o
#: o
 o
d :; : :> S4 SSlr>   