
    fTh                    .   S r SSKrSSKJrJrJrJr  SSKrSSKrSSK	J
r
  SSKJr  SSKJrJrJr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJrJrJrJrJrJrJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(J)r)J*r*  SSK+J,r,  \*RZ                  " \.5      r/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S S\25      r3 " S S\R`                  5      r4\2\3S.r5 " S S\R`                  5      r6 " S S\R`                  5      r7 " S S\R`                  5      r8 " S  S!\R`                  5      r9 " S" S#\R`                  5      r: " S$ S%\R`                  5      r;\( " S& S'\"5      5       r< " S( S)\R`                  5      r= " S* S+\R`                  5      r>\( " S, S-\<5      5       r?\( " S. S/\<5      5       r@\(" S0S19 " S2 S3\<5      5       rA\( " S4 S5\<5      5       rB\( " S6 S7\<5      5       rC\( " S8 S9\<5      5       rD\(" S:S19 " S; S<\<\5      5       rES?S= jrF/ S>QrGg)@zPyTorch CamemBERT model.    N)ListOptionalTupleUnion)version)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringget_torch_versionlogging   )CamembertConfigc                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )CamembertEmbeddings2   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r!   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr*   register_buffertorcharangeexpandzerosr,   sizelongr'   selfconfig	__class__s     h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/camembert/modeling_camembert.pyr3   CamembertEmbeddings.__init__8   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr-   r!   r/   r   r1   devicer+   )"create_position_ids_from_input_idsr'   &create_position_ids_from_inputs_embedsrH   hasattrr/   rF   rD   rG   rI   r,   rS   r8   r<   r*   r:   r=   rA   )rK   	input_idsr/   r,   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr<   
embeddingsr:   s                rN   forwardCamembertEmbeddings.forwardQ   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
rP   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr-   r!   rR   r   )rH   rD   rE   r'   rI   rS   	unsqueezerF   )rK   rX   rZ   sequence_lengthr,   s        rN   rU   :CamembertEmbeddings.create_position_ids_from_inputs_embedsy   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rP   )r=   rA   r'   r*   r:   r<   r8   )NNNNr   )
__name__
__module____qualname____firstlineno____doc__r3   r_   rU   __static_attributes____classcell__rM   s   @rN   r$   r$   2   s$    

4 rs&P= =rP   r$   c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )CamembertSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r+   relative_keyrelative_key_query   r!   )r2   r3   r6   num_attention_headsrV   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer?   attention_probs_dropout_probrA   rB   r*   r9   r4   distance_embedding
is_decoderrK   rL   r*   rM   s      rN   r3   CamembertSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++rP   xreturnc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr-   r   ru   r!   r   )rH   rv   ry   viewpermute)rK   r   new_x_shapes      rN   transpose_for_scores+CamembertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$rP   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r!   ru   dimr-   rs   rt   rR   r0   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r|   r   r}   r~   rD   catr   matmul	transposer*   shapetensorrI   rS   r   rE   r   r9   tor1   einsummathsqrtry   r   
functionalsoftmaxrA   r   
contiguousrH   rz   )rK   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               rN   r_   CamembertSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11GrP   )rz   ry   r   rA   r   r}   r9   rv   r*   r|   r~   NNNNNNF)re   rf   rg   rh   r3   rD   Tensorr   r   FloatTensorr   boolr_   rj   rk   rl   s   @rN   rn   rn      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c crP   rn   c                   .  ^  \ rS rSrSU 4S jjr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4U 4S jjjrSrU =r$ )CamembertSdpaSelfAttentioni  c                    > [         TU ]  XS9  UR                  U l        [        R
                  " [        5       5      [        R
                  " S5      :  U l        g )Nr*   z2.2.0)r2   r3   r   dropout_probr   parser   require_contiguous_qkvr   s      rN   r3   #CamembertSdpaSelfAttention.__init__  sE    Q"??&-mm4E4G&H7==Y`Ka&a#rP   r   r   r   r   r   r   r   r   c           	        > U R                   S:w  d
  U(       d  Ub*  [        R                  S5        [        TU ]  UUUUUUU5      $ UR                  5       u  pn
U R                  U R                  U5      5      nUS LnU(       a  UOUnU(       a  UOUnU(       a/  U(       a(  US   R                  S   UR                  S   :X  a  Uu  pO~U R                  U R                  U5      5      nU R                  U R                  U5      5      nUb;  U(       d4  [        R                  " US   U/SS9n[        R                  " US   U/SS9nU R                  (       a  X4nU R                  (       aM  UR                  R                   S:X  a3  Ub0  UR#                  5       nUR#                  5       nUR#                  5       nU R                  (       a  U(       d  Uc  U	S:  a  SOS	n[        R$                  R&                  R)                  UUUUU R*                  (       a  U R,                  OS
US9nUR/                  SS5      nUR1                  XU R2                  5      nU4nU R                  (       a  UU4-   nU$ )Nr+   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   ru   r!   r   cudaTF        )	attn_mask	dropout_p	is_causal)r*   loggerwarning_oncer2   r_   rH   r   r|   r   r}   r~   rD   r   r   r   rS   typer   r   r   scaled_dot_product_attentiontrainingr   r   reshaperz   )rK   r   r   r   r   r   r   r   bsztgt_len_r   r   current_statesr   r   r   attn_outputr   rM   s                      rN   r_   "CamembertSdpaSelfAttention.forward  sZ    '':59JiNcH 7?%&!  (,,.a//

=0IJ 3$>2D.-3E/> .^A5F5L5LQ5OSaSgSghiSj5j%3"I{11$((>2JKI33DJJ~4NOK)2D!II~a'8)&D!L	#ii):K(HaP?? (5N
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OO,>>CY^ehi^iDot 	 hh))FF$+/==d''c G 
 "++Aq1!))#8J8JK.?? 11GrP   )r   r   r   r   )re   rf   rg   rh   r3   rD   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r   r     s    b 2615=A>BDH,1[||[ !.[ E--.	[
  ((9(9:[ !)):): ;[ !uU->->'?!@A[ $D>[ 
u||	[ [rP   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )CamembertSelfOutputiy  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr(   )r2   r3   r   r{   r6   denser=   r>   r?   r@   rA   rJ   s     rN   r3   CamembertSelfOutput.__init__z  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rP   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rA   r=   rK   r   r   s      rN   r_   CamembertSelfOutput.forward  5    

=1]3}'CDrP   r=   r   rA   
re   rf   rg   rh   r3   rD   r   r_   rj   rk   rl   s   @rN   r   r   y  6    >U\\  RWR^R^  rP   r   )eagersdpac                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )CamembertAttentioni  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nr   )	r2   r3    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrK   r   outputsetpruned_headsr   s      rN   r3   CamembertAttention.__init__  s@    4V5P5PQ
	 *&1ErP   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r!   r   )lenr   rK   rv   ry   r   r   r|   r}   r~   r   r   rz   union)rK   headsindexs      rN   prune_headsCamembertAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rP   r   r   r   r   r   r   r   r   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r!   )rK   r   )rK   r   r   r   r   r   r   r   self_outputsattention_outputr   s              rN   r_   CamembertAttention.forward  sW     yy!"
  ;;|AF#%QR(88rP   )r   r   rK   r   r   )re   rf   rg   rh   r3   r   rD   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r   r     s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 rP   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CamembertIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r2   r3   r   r{   r6   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrJ   s     rN   r3   CamembertIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rP   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r  )rK   r   s     rN   r_   CamembertIntermediate.forward  s&    

=100?rP   r  r   rl   s   @rN   r   r     s(    9U\\ ell  rP   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )CamembertOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r2   r3   r   r{   r   r6   r   r=   r>   r?   r@   rA   rJ   s     rN   r3   CamembertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rP   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      rN   r_   CamembertOutput.forward  r   rP   r   r   rl   s   @rN   r	  r	    r   rP   r	  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )CamembertLayeri  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr!   z> should be used as a decoder model if cross attention is addedr+   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionrw   crossattentionr   intermediater	  r   rJ   s     rN   r3   CamembertLayer.__init__  s    '-'E'E$+F3 ++#)#=#= ##?? D6)g!hii"4VU_"`D1&9%f-rP   r   r   r   r   r   r   r   r   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nru   )r   r   r   r!   r-   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r   rV   rw   r  r   feed_forward_chunkr  r  )rK   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    rN   r_   CamembertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44GrP   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )rK   r   intermediate_outputr   s       rN   r  !CamembertLayer.feed_forward_chunk1  s)    "//0@A{{#6IrP   )r  r  r  r  r  r   r   r  r   )re   rf   rg   rh   r3   rD   r   r   r   r   r   r_   r  rj   rk   rl   s   @rN   r  r    s    ." 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B rP   r  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )CamembertEncoderi8  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r2   r3   rL   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)rK   rL   r   rM   s      rN   r3   CamembertEncoder.__init__9  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r-   r!   ru   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r2  ).0vs     rN   	<genexpr>+CamembertEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater.  r   
attentionscross_attentions)rL   r  r,  r   r   r   	enumerater+  _gradient_checkpointing_func__call__tupler   )rK   r   r   r   r   r   r.  r   r   r/  r0  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       rN   r_   CamembertEncoder.forward?  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
rP   )rL   r,  r+  )	NNNNNNFFT)re   rf   rg   rh   r3   rD   r   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r&  r&  8  s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
rP   r&  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CamembertPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r2   r3   r   r{   r6   r   Tanh
activationrJ   s     rN   r3   CamembertPooler.__init__  s9    YYv1163E3EF
'')rP   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   rL  )rK   r   first_token_tensorpooled_outputs       rN   r_   CamembertPooler.forward  s6     +1a40

#566rP   )rL  r   r   rl   s   @rN   rI  rI    s(    $
U\\ ell  rP   rI  c                   *    \ rS rSr\rSrSrSrS r	Sr
g)CamembertPreTrainedModeli  robertaTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsr   )meanstdNg      ?)r   r   r{   weightdatanormal_rL   initializer_rangebiaszero_r4   r'   r=   fill_CamembertLMHead)rK   modules     rN   _init_weights&CamembertPreTrainedModel._init_weights  s2   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)00KK""$ 1rP   r2  N)re   rf   rg   rh   r"   config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdparb  rj   r2  rP   rN   rT  rT    s    "L!&*#N%rP   rT  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )CamembertClassificationHeadi  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r2   r3   r   r{   r6   r   classifier_dropoutr@   r?   rA   
num_labelsout_projrK   rL   rk  rM   s      rN   r3   $CamembertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrP   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rO  )rA   r   rD   tanhrm  rK   featureskwargsr   s       rN   r_   #CamembertClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rP   )r   rA   rm  )	re   rf   rg   rh   ri   r3   r_   rj   rk   rl   s   @rN   ri  ri    s    7I rP   ri  c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )r`  i  z,Camembert Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g r   )r2   r3   r   r{   r6   r   r=   r>   
layer_normr5   decoder	ParameterrD   rG   r]  rJ   s     rN   r3   CamembertLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrP   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   rx  ry  rr  s       rN   r_   CamembertLMHead.forward  s;    JJx GOOA LLOrP   c                     U R                   R                  R                  R                  S:X  a  U R                  U R                   l        g U R                   R                  U l        g )Nmeta)ry  r]  rS   r   rK   s    rN   _tie_weightsCamembertLMHead._tie_weights  sC     <<##((F2 $		DLL))DIrP   )r]  ry  r   rx  )
re   rf   rg   rh   ri   r3   r_   r  rj   rk   rl   s   @rN   r`  r`    s    6&* *rP   r`  c                      ^  \ rS rSrSr/ rSU 4S jjrS rS rS r	\
             SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )CamembertModeli  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.

To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
`True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        UR                  U l
        UR                  U l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r2   r3   rL   r$   r^   r&  encoderrI  poolerr   attn_implementationr*   	post_init)rK   rL   add_pooling_layerrM   s      rN   r3   CamembertModel.__init__  sg    
 	 -f5'/1Bof-#)#>#> '-'E'E$ 	rP   c                 .    U R                   R                  $ r   r^   r8   r  s    rN   get_input_embeddings#CamembertModel.get_input_embeddings  s    ...rP   c                 $    XR                   l        g r   r  )rK   r~   s     rN   set_input_embeddings#CamembertModel.set_input_embeddings!  s    */'rP   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r+  r  r   )rK   heads_to_pruner+  r   s       rN   _prune_headsCamembertModel._prune_heads$  s<    
 +002LELLu%//;;EB 3rP   rW   r   r/   r,   r   rX   r   r   r.  r   r   r/  r0  r   c                 R   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUcs  [        U R                  S5      (       a4  U R                  R                  S S 2S U24   nUR                  UU5      nUnO$[        R                   " U[        R"                  US9nU R                  UUUUUS	9nUc  [        R$                  " UUU-   4US
9nU R&                  S:H  =(       a(    U R(                  S:H  =(       a    US L =(       a    U(       + nU(       aT  UR+                  5       S:X  a@  U R                   R                  (       a  [-        UUUU5      nO'[/        UUR0                  US9nOU R3                  X.5      nU R                   R                  (       av  Ubs  UR                  5       u  nnnUU4nUc  [        R$                  " UUS
9nU(       a*  UR+                  5       S:X  a  [/        UUR0                  US9nOU R5                  U5      nOS nU R7                  XPR                   R8                  5      nU R;                  UUUUUU	U
UUUS9
nUS   nU R<                  b  U R=                  U5      OS nU(       d
  UU4USS  -   $ [?        UUUR@                  URB                  URD                  URF                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsr   ru   r/   rR   )rW   r,   r/   rX   rY   )rS   r   r+   )r   )	r   r   r   r   r.  r   r   r/  r0  r!   )r8  pooler_outputr.  r   r9  r:  )$rL   r   r/  use_return_dictr   r   rw   %warn_if_padding_and_no_attention_maskrH   rS   r   rV   r^   r/   rF   rD   rG   rI   onesr  r*   r   r   r   r1   get_extended_attention_maskinvert_attention_maskget_head_maskr*  r  r  r   r.  r   r9  r:  ) rK   rW   r   r/   r,   r   rX   r   r   r.  r   r   r/  r0  rZ   
batch_sizer[   rS   rY   r\   r]   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputrQ  s                                    rN   r_   CamembertModel.forward,  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'#9 + 
 !"ZZZBX5X(YbhiN $$. &,,
:&T!& &%	 	! $(:(:(<(A {{%%*T"$*	+' +N"$4$:$:J+' '+&F&F~&c# ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&',B,F,F,HA,M 3V*,<,B,BJ3/ 372L2LMc2d/.2+ &&y++2O2OP	,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rP   )r  rL   r^   r  r  r*   )T)NNNNNNNNNNNNN)re   rf   rg   rh   ri   _no_split_modulesr3   r  r  r  r   r   rD   r   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    sx    &/0C  -11515/3,0048<9==A$(,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
  (5L
 !) 6L
 "$u'8'8"9:L
 D>L
 $D>L
 'tnL
 d^L
 
uU\\"$PP	QL
 L
rP   r  c                     ^  \ rS rSrSS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSrU =r$ )CamembertForMaskedLMi  lm_head.decoder.weightlm_head.decoder.biasc                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzpIf you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  
r2   r3   r   r   warningr  rU  r`  lm_headr  rJ   s     rN   r3   CamembertForMaskedLM.__init__  sR     NN1
 &fF&v. 	rP   c                 .    U R                   R                  $ r   r  ry  r  s    rN   get_output_embeddings*CamembertForMaskedLM.get_output_embeddings      ||###rP   c                 $    XR                   l        g r   r  rK   new_embeddingss     rN   set_output_embeddings*CamembertForMaskedLM.set_output_embeddings      -rP   rW   r   r/   r,   r   rX   r   r   labelsr   r/  r0  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUS9nUS   nU R                  U5      nSnU	ba  U	R	                  UR
                  5      n	[        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
N)
r   r/   r,   r   rX   r   r   r   r/  r0  r   r-   ru   losslogitsr   r9  )rL   r  rU  r  r   rS   r
   r   r5   r   r   r9  )rK   rW   r   r/   r,   r   rX   r   r   r  r   r/  r0  r   r  prediction_scoresmasked_lm_lossloss_fctr   s                      rN   r_   CamembertForMaskedLM.forward  s   > &1%<k$++B]B],,))%'"7#9/!5#  
 "!* LL9YY0778F')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rP   r  rU  )NNNNNNNNNNNN)re   rf   rg   rh   _tied_weights_keysr3   r  r  r   r   rD   
LongTensorr   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    sk    34JK$.  156:59371559=A>B-1,0/3&*@
E,,-@
 !!2!23@
 !!1!12	@

 u//0@
 E--.@
   1 12@
  ((9(9:@
 !)):): ;@
 ))*@
 $D>@
 'tn@
 d^@
 
uU\\"N2	3@
 @
rP   r  z
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )"CamembertForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFr  )	r2   r3   rl  rL   r  rU  ri  
classifierr  rJ   s     rN   r3   +CamembertForSequenceClassification.__init__#  sH      ++%fF5f= 	rP   rW   r   r/   r,   r   rX   r  r   r/  r0  r   c                 f   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  UR	                  UR
                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [!        5       nU" X5      nU
(       d  U4US	S -   nUb  U4U-   $ U$ [#        UUUR$                  UR&                  S
9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r/   r,   r   rX   r   r/  r0  r   r!   
regressionsingle_label_classificationmulti_label_classificationr-   ru   r  )rL   r  rU  r  r   rS   problem_typerl  r1   rD   rI   rx   r   squeezer
   r   r	   r   r   r9  rK   rW   r   r/   r,   r   rX   r  r   r/  r0  r   r  r  r  r  r   s                    rN   r_   *CamembertForSequenceClassification.forward.  s   : &1%<k$++B]B],,))%'/!5#  

 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rP   )r  rL   rl  rU  
NNNNNNNNNN)re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s"   	  156:59371559-1,0/3&*N
E,,-N
 !!2!23N
 !!1!12	N

 u//0N
 E--.N
   1 12N
 ))*N
 $D>N
 'tnN
 d^N
 
uU\\"$<<	=N
 N
rP   r  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )CamembertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr!   )r2   r3   r  rU  r   r?   r@   rA   r{   r6   r  r  rJ   s     rN   r3   #CamembertForMultipleChoice.__init__  sV     %f-zz&"<"<=))F$6$6: 	rP   rW   r/   r   r  r,   r   rX   r   r/  r0  r   c                    U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb.  UR                  UR                  5      n[        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )aO  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr!   r-   r   )r,   r/   r   r   rX   r   r/  r0  ru   r  )rL   r  r   r   rH   rU  rA   r  r   rS   r
   r   r   r9  )rK   rW   r/   r   r  r,   r   rX   r   r/  r0  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   rQ  r  reshaped_logitsr  r  r   s                           rN   r_   "CamembertForMultipleChoice.forward  s   Z &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,*..,/!5#  

  
]3/ ++b+6YY556F')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rP   )r  rA   rU  r  )re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s"     15596:-1371559,0/3&*Z
E,,-Z
 !!1!12Z
 !!2!23	Z

 ))*Z
 u//0Z
 E--.Z
   1 12Z
 $D>Z
 'tnZ
 d^Z
 
uU\\"$==	>Z
 Z
rP   r  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )CamembertForTokenClassificationi  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r2   r3   rl  r  rU  rk  r@   r   r?   rA   r{   r6   r  r  rn  s      rN   r3   (CamembertForTokenClassification.__init__  s      ++%fF)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rP   rW   r   r/   r,   r   rX   r  r   r/  r0  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUbW  UR                  UR                  5      n[        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r-   ru   r  )rL   r  rU  rA   r  r   rS   r
   r   rl  r   r   r9  r  s                    rN   r_   'CamembertForTokenClassification.forward  s   6 &1%<k$++B]B],,))%'/!5#  

 "!*,,71YYv}}-F')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rP   )r  rA   rl  rU  r  )re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s     156:59371559-1,0/3&*=
E,,-=
 !!2!23=
 !!1!12	=

 u//0=
 E--.=
   1 12=
 ))*=
 $D>=
 'tn=
 d^=
 
uU\\"$99	:=
 =
rP   r  c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )CamembertForQuestionAnsweringi=  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r2   r3   rl  r  rU  r   r{   r6   
qa_outputsr  rJ   s     rN   r3   &CamembertForQuestionAnswering.__init__@  sU      ++%fF))F$6$68I8IJ 	rP   rW   r   r/   r,   r   rX   start_positionsend_positionsr   r/  r0  r   c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S	9$ )
a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
Nr  r   r!   r-   r   )ignore_indexru   )r  start_logits
end_logitsr   r9  )rL   r  rU  r  splitr  r   r   rH   clampr
   r   r   r9  )rK   rW   r   r/   r,   r   rX   r  r  r   r/  r0  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r_   %CamembertForQuestionAnswering.forwardJ  s   4 &1%<k$++B]B],,))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rP   )rl  r  rU  )NNNNNNNNNNN)re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r  =  s;     156:593715596:48,0/3&*I
E,,-I
 !!2!23I
 !!1!12	I

 u//0I
 E--.I
   1 12I
 "%"2"23I
   0 01I
 $D>I
 'tnI
 d^I
 
uU\\"$@@	AI
 I
rP   r  zU
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            "         ^  \ rS rSrSS/rU 4S jrS rS r\              SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\\
R                           S\	\   S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rS rSrU =r$ )CamembertForCausalLMi  r  r  c                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzQIf you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`Fr  r  rJ   s     rN   r3   CamembertForCausalLM.__init__  sL       NNno%fF&v. 	rP   c                 .    U R                   R                  $ r   r  r  s    rN   r  *CamembertForCausalLM.get_output_embeddings  r  rP   c                 $    XR                   l        g r   r  r  s     rN   r  *CamembertForCausalLM.set_output_embeddings  r  rP   rW   r   r/   r,   r   rX   r   r   r  r.  r   r   r/  r0  r   c                    Ub  UOU R                   R                  nU	b  SnU R                  UUUUUUUUU
UUUUS9nUS   nU R                  U5      nSnU	bE  U	R	                  UR
                  5      n	U R                  " UU	4SU R                   R                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
>>> config = AutoConfig.from_pretrained("almanach/camembert-base")
>>> config.is_decoder = True
>>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NF)r   r/   r,   r   rX   r   r   r.  r   r   r/  r0  r   r5   ru   )r  r  r.  r   r9  r:  )rL   r  rU  r  r   rS   loss_functionr5   r   r.  r   r9  r:  )rK   rW   r   r/   r,   r   rX   r   r   r  r.  r   r   r/  r0  rt  r   r  r  lm_lossr   s                        rN   r_   CamembertForCausalLM.forward  s4   d &1%<k$++B]B]I,,))%'"7#9+/!5#  
  "!* LL9YY0778F((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rP   c                 P   ^ SnU H  nU[        U4S jU 5       5      4-  nM     U$ )Nr2  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr   rS   )r4  
past_statebeam_idxs     rN   r6  6CamembertForCausalLM._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)r>  )rK   r.  r  reordered_past
layer_pasts     `  rN   _reorder_cache#CamembertForCausalLM._reorder_cache  s8    )Jncmnn N * rP   r  )NNNNNNNNNNNNNN)re   rf   rg   rh   r  r3   r  r  r   r   rD   r  r   r   r   r   r   r   r_   r  rj   rk   rl   s   @rN   r  r    s    34JK
$.  156:59371559=A>B-1EI$(,0/3&*^
E,,-^
 !!2!23^
 !!1!12	^

 u//0^
 E--.^
   1 12^
  ((9(9:^
 !)):): ;^
 ))*^
 "%e.?.?(@"AB^
 D>^
 $D>^
 'tn^
 d^^
" 
uU\\"$EE	F#^
 ^
@ rP   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r!   r   )nerx   rD   cumsumtype_asrI   )rW   r'   rY   maskincremental_indicess        rN   rT   rT     sW     <<$((*D <<!4<<TBE[[_cc##%33rP   )r  r  r  r  r  r  r  rT  )r   )Hri   r   typingr   r   r   r   rD   torch.utils.checkpoint	packagingr   r   torch.nnr	   r
   r   activationsr   r   
generationr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r    configuration_camembertr"   
get_loggerre   r   Moduler$   rn   r   r   r   r   r   r	  r  r&  rI  rT  ri  r`  r  r  r  r  r  r  r  rT   __all__r2  rP   rN   <module>r.     s]      / /     A A ' ) w	 	 	 . l l ? ? 4 
		H	%V=")) V=tCRYY CNb!7 bL"))  $&$  0 0hBII  bii SRYY SnZ
ryy Z
|bii  % % %6")) .*bii *> B
- B
 B
J Y
3 Y
 Y
x [
)A [
[
| f
!9 f
 f
R M
&> M
 M
` U
$< U
 U
p |3_ ||@4 	rP   