
    fTh             
          S r SSKrSSKJrJrJrJr  SSKrSSKrSSKJ	r	  SSK
JrJrJrJr  SSK
Jr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+  SSK,J-r-  \(       a  SSK.J/r/  \" 5       (       a  SSKJ0r0  \+Rb                  " \25      r3 " S S\	Rh                  5      r5S r6S>S jr7 " S S\	Rp                  5      r9S\Rt                  S\;S\Rx                  S\Rt                  4S jr=S \Rt                  S!\Rt                  S"\>S#\?S\Rt                  4
S$ jr@ " S% S&\	Rp                  5      rA " S' S(\A5      rB " S) S*\	Rp                  5      rC\A\A\BS+.rD " S, S-\	Rp                  5      rE\* " S. S/\(5      5       rF\* " S0 S1\F5      5       rG\*" S2S39 " S4 S5\F\5      5       rH\*" S6S39 " S7 S8\F5      5       rI\* " S9 S:\F5      5       rJ\* " S; S<\F5      5       rK/ S=QrLg)?zPyTorch Falcon model.    N)TYPE_CHECKINGOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )get_activation)CacheDynamicCacheStaticCache)GenerationMixin)AttentionMaskConverter)!flash_attn_supports_top_left_maskis_flash_attn_available))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringlogging   )FalconConfig)PretrainedConfig)_flash_attention_forwardc                   N    \ rS rSrS\R
                  S\R
                  4S jrSrg)FalconLinear<   inputreturnc                 l    XR                   R                  -  nU R                  c  U$ X R                  -   $ N)weightTbias)selfr'   hidden_statess      b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/falcon/modeling_falcon.pyforwardFalconLinear.forward=   s/    -99  yy((     N)__name__
__module____qualname____firstlineno__torchTensorr1   __static_attributes__r4   r3   r0   r%   r%   <   s    )U\\ )ell )r3   r%   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shaper9   cat)xx1x2s      r0   rotate_halfrF   E   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerF   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embrQ   M   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr3   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )FalconRotaryEmbeddingi   configc                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)super__init__hasattrrW   getrX   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrU   r   rope_init_fnattention_scalingregister_bufferr[   original_inv_freq)r.   rU   devicer[   	__class__s       r0   r^   FalconRotaryEmbedding.__init__j   s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r3   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r=   r    mpscpuF)device_typeenabledr>   r?   dtype)r[   floatexpandrA   torh   
isinstancerY   strr9   autocast	transposerB   rK   re   rL   rq   )
r.   rC   rM   inv_freq_expandedposition_ids_expandedrn   freqsembrK   rL   s
             r0   r1   FalconRotaryEmbedding.forward{   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)re   rU   rb   rg   rc   rd   rX   r*   )r5   r6   r7   r8   r!   r^   r9   no_gradr   r1   r;   __classcell__ri   s   @r0   rS   rS   i   s6    /| / /" ]]_<  <r3   rS   attention_mask	num_headsrq   r(   c                    U R                   u  p4S[        R                  " [        R                  " U5      5      -  n[        R
                  " SS[        R                  " U5      S-
  * -  * -  U R                  [        R                  S9n[        R                  " SSU-   U R                  [        R                  S9n[        R                  " Xg5      nXQ:w  a  [        R
                  " SS[        R                  " SU-  5      S-
  * -  * -  U R                  [        R                  S9n	[        XQU-
  5      n
[        R                  " SSSU
-  -   SU R                  [        R                  S9n[        R                  " U[        R                  " X5      /SS9nU R                  SS9S-
  U -  S S 2S S S 24   nUS   R                  5       U-  nUR                  X1-  SU5      R!                  U5      $ )	Nr>   r   rh   rq   r    r   r?   r=   ).N)rA   mathfloorlog2r9   tensorrh   float32arangeint32powminrB   cumsumbfloat16reshapert   )r   r   rq   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 r0   build_alibi_tensorr      s   +11Jdjj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYt$F&\\A499Q);%;<q@AABCNLaLainiviv

 ""4BT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj$GHaP %+++3a7>I1dTU:VM9&&(=8E==/J?BB5IIr3   rC   residualprobtrainingc                 8    [         R                  " XUS9nX-   nU$ )z
Dropout add function

Args:
    x (`torch.tensor`):
        input tensor
    residual (`torch.tensor`):
        residual tensor
    prob (`float`):
        dropout probability
    training (`bool`):
        training mode
)pr   )Fdropout)rC   r   r   r   outs        r0   dropout_addr      s      ))A
1C
.CJr3   c                     ^  \ rS rSrSS\4U 4S jjjrS\R                  S\\R                  \R                  \R                  4   4S jr	S\R                  S\R                  4S jr
       SS	\R                  S
\\R                     S\R                  S\\R                     S\\   S\\R                     S\S\S\\R                     S\\\R                  \R                  4      4S jjrSrU =r$ )FalconAttention   rU   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        U R                  U R
                  -  U l        U R                  U l        UR                  U l        UR                  U l	        UR                  U l
        SU l        UR                  S:H  U l        X l        Uc-  [        R!                  SU R"                  R$                   S35        U R                  U R
                  -  U R                  :w  a&  ['        SU R                   SU R
                   S35      eS[(        R*                  " U R                  5      -  U l        U R,                  U l        UR0                  (       a*  UR2                  S	-  UR                  -   U R                  -  nO=UR4                  (       a  U R                  S	U R                  -  -   nOS
U R                  -  n[7        U R                  X1R8                  S9U l        UR0                  U l        UR4                  U l        [7        U R                  U R                  UR8                  S9U l        [>        R@                  " URB                  5      U l!        U R0                  (       d  U R4                  (       d  UR2                  OSU l        URD                  (       a  [G        U R                  S9U l$        g g )NTsdpazInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).      ?r>   r   r-   r    rU   )%r]   r^   rU   hidden_sizenum_attention_headsr   head_dim
split_sizehidden_dropoutra   
rope_theta	is_causal_attn_implementation	_use_sdpa	layer_idxloggerwarning_onceri   r5   
ValueErrorr   sqrtinv_norm_factorbetanew_decoder_architecturenum_kv_headsmulti_queryr%   r-   query_key_valuedenser   Dropoutattention_dropoutrotaryrS   
rotary_emb)r.   rU   r   qkv_out_dimri   s       r0   r^   FalconAttention.__init__   sQ   !--33((DNN:**$33'-'E'E$ ++44>" !8!8 9 :, , ==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==((	**!..2V5O5OOSWS`S``K**Q->>Kd...K+D,<,<kP[P[\(.(G(G%!--!$"2"2D4D4D6;;W
!#F,D,D!E484Q4QY]YiYiF//pq ==34;;GDO r3   	fused_qkvr(   c                 r   U R                   (       a  UR                  u  p#nUR                  X#SU R                  U R                  -  S-   U R
                  5      nUSS2SS2SS2SS24   nUSS2SS2SS2S/4   nUSS2SS2SS2S/4   n[        R                  " XvR                  5      n[        R                  " XR                  5      nXgU4 V	s/ s H  oR                  SS5      PM     sn	u  pgnXgU4$ U R                  (       dT  UR                  u  pnUR                  XU R                  SU R
                  5      nUSSSS24   USSSS24   USSSS24   4$ UR                  u  pnUR                  XU R                  S-   U R
                  5      nUSSS2SS24   USS/SS24   USS/SS24   4$ s  sn	f )	ap  
Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

Args:
    fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

Returns:
    query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
    value: [batch_size, seq_length, num_heads, head_dim]
r=   r>   Nr   .r   r    )
r   rA   viewr   r   r   r9   broadcast_toflattenr   )r.   r   batchseq_len_qkvquerykeyvaluerC   r   r   three_times_hidden_sizes                r0   _split_headsFalconAttention._split_heads   s    (( )EA..T^^tGXGX5X[\5\^b^k^klC1a"%EaAtm$C1a"&E$$S++6C&&ukk:E;@u:M N:MQ1a:M NEu$$!!>Goo;J$;!zt~~qRVR_R_`IS!QY'319)=yaQR?SSS>Goo;J$;!zt~~PQ?QSWS`S`aIS#2#q[)9S2$\+BIcTVSWYZlD[[[ !Os   F4rC   c                    UR                   u  p#nX R                  -  nUR                  XPR                  X0R                  5      nUR	                  SSSS5      nUR                  XSU R                  U R                  -  5      $ )z
Merge heads together over the last dimension

Args:
    x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

Returns:
    torch.tensor: [batch_size, seq_length, num_heads * head_dim]
r   r>   r    r   )rA   r   r   r   permuter   )r.   rC   batch_size_and_num_headsr   r   r   s         r0   _merge_headsFalconAttention._merge_heads  so     34''/ a-?
 FF:~~z==I IIaAq! yy$--1OPPr3   r/   r   r   rM   
layer_past	head_mask	use_cacheoutput_attentionscache_positionposition_embeddingsc           	      
   U R                  U5      nU R                  (       a  U R                  OU R                  nU R	                  U5      u  pnUR
                  u  nn  nUR                  SS5      R                  UU R                  UU R                  5      nUR                  SS5      R                  UUUU R                  5      nUR                  SS5      R                  UUUU R                  5      nUc  U
u  nn[        XUU5      u  pUb:  SU	0nUc  UR                  WWS.5        UR                  XU R                  U5      u  pUR
                  S   nU R                  (       aM  UR                  R                  S:X  a3  Ub0  UR                  5       nUR                  5       nUR                  5       nUb  US S 2S S 2S S 2S UR
                  S   24   nUGcK  U R                  (       aT  U(       dM  U R                   (       a  Uc  US:  a  SOSn["        R$                  R&                  R)                  UUUUS	US
9nS nO_XR                  SS5      -  nU[*        R,                  " U R                  5      -  n[.        R0                  " UU-   SUR2                  S9nUU-  nUR5                  UU R                  UU R                  5      nUR7                  SSSS5      nUR                  UUU R                  U R                  -  5      nU R9                  U5      nU(       a  UUU4$ UU4$ U R                  (       a  U(       d  Uc  U R                   (       a  Uc  US:  a  SOSn["        R$                  R&                  R)                  UUUUU R:                  (       a  U R<                  R>                  OS	US
9nUR                  SS5      nUR                  UUU R                  U R                  -  5      nU R9                  U5      nGOFXR                  SS5      -  nUR5                  UU R                  UU5      nUR2                  nU["        R@                  :X  d  U["        RB                  :X  a  URE                  ["        RF                  5      nUUR5                  UU R                  SS5      -   nUU RH                  -  n[.        R0                  " UU-   SUR2                  S9nU R=                  U5      nUb  UU-  nUR5                  UU R                  UU5      nUU-  RK                  SS5      nU RM                  U5      nU R9                  U5      nU(       a  UUW4$ UU4$ )Nr    r>   r   rL   rK   r   cudaTF        )	attn_mask	dropout_pr   r=   )r@   rq   r   r   )'r   r   r   r   r   rA   rx   r   r   rQ   updater   r   rh   rY   
contiguousr   r9   r   r   scaled_dot_product_attentionr   r   r   softmaxrq   r   r   r   r   r   r   float16r   rt   r   r   r   r   )r.   r/   r   r   rM   r   r   r   r   r   r   r   r   query_layer	key_layervalue_layerr   query_lengthr   rK   rL   cache_kwargs	kv_lengthr   attn_outputattention_scoresmatmul_resultinput_dtypeattention_logitsattention_probsattention_probs_reshapeds                                  r0   r1   FalconAttention.forward$  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+RUWZ%["K!,n=L}##C$<=%/%6%6yt~~_k%l"IOOB'	>>k0055?ND^ &002K!,,.I%002K%+Aq!5Jyr7J5J,JKN=~~&7
 %)NN~7MR^abRbDhm	#hh11NN,!' O  $( #.1D1DR1L#L  DIIdmm$<< #$99-=-NTV^k^q^q#r .<%**:t~~|UYUbUbcK%--aAq9K%--j,Y]YfYfHfgK**[1K "J0@@@"J.. ~~&7I<M %)NN~7MR^abRbDhm	#hh11NN,:>--d4466S' O  *33Aq9)11*lDNN]a]j]jLjk"jj5 +.A.A"b.I I $1#5#5j$..R^`i#j  /44%--/;%..3P'7':':5=='I$#3ejjT^^]^`b6c#c  D$8$88 "#)),<~,MSU]j]p]p"q"&"8"8"I(&5	&AO ,;+?+?
DNN\hjs+t(  8+ENNqRST #//<"jj5 "J??"J..r3   )r   r   r   rU   r   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r*   NNNFFNN)r5   r6   r7   r8   r!   r^   r9   r:   r   r   r   r   
LongTensorr   boolr1   r;   r   r   s   @r0   r   r      sF   -H| -H -H^\ell \uU\\5<<Y^YeYe=e7f \@Qell Qu|| Q< 48&*,0"'59KOD/||D/ %D/ 	D/
 u//0D/ UOD/ ELL)D/ D/  D/ !!1!12D/ &eELL%,,,F&GHD/ D/r3   r   c                   >  ^  \ rS rSrSrU 4S jr       SS\R                  S\\R                     S\R                  S\\R                     S\\
   S	\\R                     S
\S\S\\R                     S\\\R                  \R                  4      4S jjrSrU =r$ )FalconFlashAttention2i  a8  
Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r*   )r]   r^   r   _flash_attn_uses_top_left_mask)r.   argskwargsri   s      r0   r^   FalconFlashAttention2.__init__  s#    $)&)
 /P.Q+r3   r/   r   r   rM   r   r   r   r   r   r   c                 "   U R                  U5      nU R                  (       a  U R                  OU R                  nU R	                  U5      u  pnUR
                  u  nn  nUR                  SS5      R                  UU R                  UU R                  5      nUR                  SS5      R                  UUUU R                  5      nUR                  SS5      R                  UUUU R                  5      nUc  U
u  nn[        XUU5      u  pUb:  SU	0nUc  UR                  WWS.5        UR                  XU R                  U5      u  pUR                  SS5      nUR                  SS5      nUR                  SS5      nUb  [        S5      eU R                  (       a  U R                  R                  OSnUR                   nU["        R$                  :X  a  ["        R&                  " 5       (       a  ["        R(                  " 5       nOR[+        U R                  S5      (       a  U R                  R,                  nO U R                   R.                  R                   n[0        R3                  SU S	35        UR5                  U5      nUR5                  U5      nUR5                  U5      n[7        UUUUUUUU R8                  U R:                  S
9	nUR                  UUU R                  U R                  -  5      nU R=                  U5      nU(       d  S nUUU4$ )Nr    r>   r   r   z6`alibi` is not supported when `use_flash_attn` is Truer   _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)rM   r   r   use_top_left_mask)r   r   r   r   r   rA   rx   r   r   rQ   r   r   r   r   rU   r   rq   r9   r   is_autocast_enabledget_autocast_gpu_dtyper_   r  r+   r   r   rt   r#   r   r  r   )r.   r/   r   r   rM   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   rL   r   attn_dropoutr   target_dtyper   attn_weightss                              r0   r1   FalconFlashAttention2.forward  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+RUWZ%["K!,n=L}##C$<=%/%6%6yt~~_k%l"I "++Aq1''1-	!++Aq1UVV8<t{{443
 "''%--'((**$;;=&?@@#{{BB#33::@@ >$ &..6K!\2I%..6K.% nn"AA

 #**:|T^^VZVcVcEcdjj. LJ44r3   )r  r   )r5   r6   r7   r8   __doc__r^   r9   r:   r   r   r   r   r   r1   r;   r   r   s   @r0   r  r    s    R 48&*,0"'59KOV5||V5 %V5 	V5
 u//0V5 UOV5 ELL)V5 V5  V5 !!1!12V5 &eELL%,,,F&GHV5 V5r3   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	FalconMLPi  rU   c                   > [         TU ]  5         UR                  n[        X!R                  UR
                  S9U l        [        UR                  5      U l	        [        UR                  X!R
                  S9U l
        UR                  U l        g )Nr   )r]   r^   r   r%   ffn_hidden_sizer-   dense_h_to_4hr   
activationactdense_4h_to_hr   )r.   rU   r   ri   s      r0   r^   FalconMLP.__init__  sp    (()+7M7MTZT_T_`!&"3"34)&*@*@+T_T_`$33r3   rC   r(   c                 h    U R                  U R                  U5      5      nU R                  U5      nU$ r*   )r  r  r  )r.   rC   s     r0   r1   FalconMLP.forward  s0    HHT''*+q!r3   )r  r  r  r   )r5   r6   r7   r8   r!   r^   r9   r:   r1   r;   r   r   s   @r0   r  r    s/    4| 4 %,,  r3   r  )eagerr   flash_attention_2c                     ^  \ rS rSrSS\4U 4S jjjr       SS\R                  S\\R                     S\R                  S\\R                     S\\
\\\R                  \R                  4   4      S	\\R                     S
\S\S\\R                     S\\\R                  \R                  4      4S jjrSrU =r$ )FalconDecoderLayeri*  rU   c                 f  > [         TU ]  5         UR                  nUR                  U l        [
        UR                     " X5      U l        [        U5      U l	        UR                  U l
        Xl        UR                  c  UR                  (       a  SUl        UR                  (       d1  [        X1R                   S9U l        [        X1R                   S9U l        g UR                  S:X  a1  [        X1R                   S9U l        [        X1R                   S9U l        g [        X1R                   S9U l        g )Nr>   eps)r]   r^   r   r   r   FALCON_ATTENTION_CLASSESr   self_attentionr  mlpr   rU   num_ln_in_parallel_attnr   parallel_attnr
   layer_norm_epsilonpost_attention_layernorminput_layernormln_attnln_mlp)r.   rU   r   r   ri   s       r0   r^   FalconDecoderLayer.__init__+  s    ((336v7R7RSTZfV$$33))1f6U6U-.F*##,5kG`G`,aD)#,[>W>W#XD --2(:S:ST'9R9RS'0B[B['\$r3   r/   r   r   rM   r   r   r   r   r   r   c                    UnU R                   R                  (       a=  U R                   R                  S:X  a#  U R                  U5      nU R	                  U5      nOU R                  U5      nU R                  UUUUUUUUU	U
S9
nUS   nU R                   R                  (       dX  U R                   R                  (       a  UnO:[        UXR                   R                  U R                  S9nU R                  U5      nU R                   R                  (       a7  U R                   R                  (       a  U R                   R                  S:X  a  UnUSS  nU R                  W5      nU R                   R                  (       d  U R                   R                  (       a  UU-  n[        UXR                   R                  U R                  S9nU(       a  U4U-   nU$ U4USS  -   nU$ )Nr>   )	r   r   rM   r   r   r   r   r   r   r   )r   r    )rU   r   r'  r,  r-  r+  r%  r(  r   r   r   r*  r&  r   )r.   r/   r   r   rM   r   r   r   r   r   r   r  r   attention_layernorm_outmlp_layernorm_outattn_outputsattention_outputoutputs
mlp_outputoutputs                       r0   r1   FalconDecoderLayer.forwardD  s    !;;//DKK4W4W[\4\&*ll=&A# $M :&*&:&:=&I# **#!)%/) 3 + 
 (?{{33{{(($;!&$h0M0MX\XeXe %)$A$A($K! KK00))33q8 7qr" XX/0
;;//4;;3L3L**JZ;;3M3MX\XeXefi')G  i'!"+-Gr3   )	rU   r   r+  r,  r-  r&  r   r*  r%  r*   r   )r5   r6   r7   r8   r!   r^   r9   r:   r   r   r   r   r   r   r1   r;   r   r   s   @r0   r   r   *  s   ]| ] ]< 48PT,0"'59KOE||E %E 	E
 u//0E U5%ell0J*K#KLME ELL)E E  E !!1!12E &eELL%,,,F&GHE Er3   r   c                      ^  \ rS rSr\rSrSrS/rSr	Sr
SrSrSrU 4S jrS\R                   4S jr\SS\S	S
4S jj5       rSrU =r$ )FalconPreTrainedModeli  transformerTr   c                 &   > [         TU ]  " U0 UD6  g r*   )r]   r^   )r.   inputsr  ri   s      r0   r^   FalconPreTrainedModel.__init__  s    &+F+r3   modulec                 .   [        U[        R                  5      (       d  [        U[        5      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weights.r   )meanstdNr   )ru   r   Linearr%   r+   datanormal_rU   initializer_ranger-   zero_	Embeddingpadding_idxr
   fill_)r.   r>  s     r0   _init_weights#FalconPreTrainedModel._init_weights  s   fbii((Jv|,L,L MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .	**KK""$MM$$S) +r3   hard_check_onlyr(   r"   c                 N    [        U SS5      nU(       a  U$ U(       d  SUl        U$ )Nuse_bettertransformerFr   )getattrr   )clsrU   rL  _is_bettertransformers       r0   _check_and_enable_sdpa,FalconPreTrainedModel._check_and_enable_sdpa  s*     '-De L M*0F'r3   r4   )F)r5   r6   r7   r8   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_quantized_cache_supports_static_cacher^   r   ModulerJ  classmethodr   rR  r;   r   r   s   @r0   r9  r9    sr    L%&*#-.!N  $!,*BII *" T N`  r3   r9  c                     ^  \ rS rSrS\4U 4S jjrS rS\R                  4S jr	\
           SS\\R                     S\\\\\\R                  \R                  4   S	4   4      S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                  S	4   \4   4S jj5       rS
\R                  S\R                  S\R                  S\S\S\R                  S\R                  4S jr\S
\R                  S\S\S\R,                  S\R                  S\4S j5       rSrU =r$ )FalconModeli  rU   c           
      `  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        [        R                  " UR                  U R                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        UR"                  S:H  U l        UR"                  S:H  U l        [)        U R                  UR*                  S9U l        [/        US9U l        SU l        U R5                  5         g s  snf )N)r   r  r   r"  r   F)r]   r^   r   	embed_dimr   r   r   	use_alibir   rG  
vocab_sizeword_embeddings
ModuleListrangenum_hidden_layersr   hr   _use_flash_attention_2r   r
   r)  ln_frS   r   gradient_checkpointing	post_init)r.   rU   iri   s      r0   r^   FalconModel.__init__  s     ++33  "||F,=,=t~~N QVW]WoWoQpqQpA 26 GQpqr&,&A&AEX&X#44> dnn&2K2KL	/v>&+# 	  rs   D+c                     U R                   $ r*   re  r.   s    r0   get_input_embeddings FalconModel.get_input_embeddings  s    ###r3   new_embeddingsc                     Xl         g r*   rq  r.   ru  s     r0   set_input_embeddings FalconModel.set_input_embeddings  s    -r3   	input_idspast_key_values.r   rM   r   inputs_embedsr   r   output_hidden_statesreturn_dictr   r(   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
USL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nSnU(       aP  [        U[        5      (       d;  SnUc  [        5       nO+[        R                  " U5      n[        R                  S5        SnUb  UR                  5       OSnUR                   u  nnnU R"                  (       aW  Uc3  [$        R&                  " UUU-   4UR(                  [$        R*                  S9OUn[-        UU R.                  UR0                  S	9nUc"  [$        R2                  " XU-   UR(                  S
9nUc  UR5                  S5      nU R7                  X6XXU5      nU R9                  XPR                   R:                  5      nUnU R=                  UU5      nSnU(       a  SOSnU	(       a  SOSn[?        U R@                  5       H  u  nnU	(       a  UU4-   nU R                  (       a:  U R                  (       a)  U RC                  URD                  UUUUUU   UUUUU5      nOU" UUUUUU   UUUUUS9
nUS   nUSL a  US   nU(       d  M  UUU(       a  SOS   4-   nM     U RG                  U5      nU	(       a  UU4-   nU(       a  UOSnU(       a  URI                  5       nU
(       d  [K        S UUUU4 5       5      $ [M        UUUUS9$ )h  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   rp   rh   r4   )	r   r   rM   r   r   r   r   r   r   r    r>   c              3   .   #    U  H  oc  M  Uv   M     g 7fr*   r4   ).0vs     r0   	<genexpr>&FalconModel.forward.<locals>.<genexpr>p  s      ^a^s   	)last_hidden_stater{  r/   
attentions)'rU   r   r}  r   use_return_dictr   rl  r   r   r   re  ru   r   r   from_legacy_cacheget_seq_lengthrA   rc  r9   onesrh   longr   r   rq   r   rH   _update_causal_maskget_head_maskrh  r   	enumerateri  _gradient_checkpointing_func__call__rk  to_legacy_cachetupler   )r.   rz  r{  r   rM   r   r|  r   r   r}  r~  r   return_legacy_cacher   past_key_values_lengthr   r   r   maskcausal_maskr/   r   next_decoder_cacheall_self_attentionsall_hidden_statesrn  blockr4  
next_caches                                r0   r1   FalconModel.forward  s   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  00;M $Z??"&&".."."@"@"Q##^ ETE`!?!?!Afg$1$7$7!
J>>
 ") 

.D!DEmNbNbjojtjt $  'tT^^=CVCVWE!"\\&(KTaThThN )33A6L..>L]jo
 &&y++2O2OP	% #oom\J!$5b4"6BD!$&&)HAu#$58H$H!**t}};;NN! aL#%"'  !.#.!-'l'&7#1(; $AJMD %,QZ"  &9W)QYZ=[<]&]#K *P 		-0 1]4D D+4'$
#335J ):7HJ]^   9+&+*	
 	
r3   input_tensorr   c           
      H   U R                   R                  S:X  a  Ub  SU;   a  U$ g Ub  UR                  5       OSn[        U[        5      n	U R                   R                  S:X  a;  U	(       d4  U(       d-  Uc*  Uc'  [
        R                  " UUUU R                  S9(       a  g UR                  UR                  p[        R                  " U
5      R                  nUR                  u  pnU	(       a  UR                  5       nO2[        U[        R                  5      (       a  UR                  S   OX-   nU R!                  UUUU
UUUR                  S   S9nUcw  Ubt  UR"                  " US/UR                  SS  Q76 n[        R$                  " U[&        R(                  " U R                   R*                  U R,                  -  5      -  US:  U5      nU R                   R                  S:X  a;  Ub8  UR                  R.                  S	;   a  U(       d  [
        R0                  " UU5      nU$ )
Nr  r   r   r   )r|  r  is_trainingr=   )sequence_lengthtarget_lengthrq   rh   r   r   r    )r   xpunpu)rU   r   r  ru   r   r   _ignore_causal_mask_sdpar   rq   rh   r9   finfor   rA   get_max_cache_shaper:   5_prepare_4d_causal_attention_mask_with_cache_positionr   masked_fillr   r   r   r   rY   _unmask_unattended)r.   r   r  r   r{  r   r   r   past_seen_tokensusing_static_cacherq   rh   	min_dtyper   r  r   r  r  s                     r0   r  FalconModel._update_causal_mask{  s    ;;++/BB)c^.C%%
 @O?Z?99;`a'E KK,,6&%!%>>*'7 MM	 $**L,?,?vKK&**	)5););&
Q+??AM nell;; $$R(%7  PP+')#))!, Q 
 !2MM*bC5;;qr?CE++		$++"9"9T^^"KLLb K KK,,6*%%**.DD%
 1CCKQZ[Kr3   r  r  rq   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuerq   rh   r    )diagonalr  r=   r   )r@   r9   r  r   fullrh   triur   r   rs   clonerA   rt   r  )r   r  r  rq   r   r   r  r  r  mask_lengthpadding_masks              r0   r  AFalconModel._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r3   )
rj  r   rb  rl  ri  rk  r   r   rc  re  )NNNNNNNNNNN)r5   r6   r7   r8   r!   r^   rs  r9   r:   rx  r   r   r   r   r   r   r   r   r1   r  staticmethodintrq   r  r;   r   r   s   @r0   r`  r`    s3   | 2$.5<< .  15ae15370448$(,0/3&*59`
E,,-`
 "%uU5<<;U5VX[5[/\(\"]^`
 !.	`

 u//0`
 E,,-`
   0 01`
 D>`
 $D>`
 'tn`
 d^`
 !!1!12`
 
uU\\3&')RR	S`
 `
DUU llU 	U
 U  U <<U ||Un 444 4 {{	4
 4 4 4r3   r`  z
    The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                      ^  \ rS rSrS/rS\4U 4S jjrS rS\R                  4S jr
\             SS\\R                     S	\\\\\\R                  \R                  4   S
4   4      S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                  4   S\\\R                     \4   4S jj5       rS\\\R                  \R                  4   S
4   S\R                  S\\\R                  \R                  4   S
4   4S jrSrU =r$ )FalconForCausalLMi  zlm_head.weightrU   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr   )
r]   r^   r`  r:  r   rB  r   rd  lm_headrm  r.   rU   ri   s     r0   r^   FalconForCausalLM.__init__  sI     &v.yy!3!3V5F5FUS 	r3   c                     U R                   $ r*   r  rr  s    r0   get_output_embeddings'FalconForCausalLM.get_output_embeddings  s    ||r3   ru  c                     Xl         g r*   r  rw  s     r0   set_output_embeddings'FalconForCausalLM.set_output_embeddings  s    %r3   rz  r{  .r   rM   r   r|  labelsr   r   r}  r~  r   logits_to_keepr(   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb*  U R                  " UU4SU R                   R                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)
r{  r   rM   r   r|  r   r   r}  r~  r   r   rd  r    losslogitsr{  r/   r  )rU   r  r:  ru   r  slicer  loss_functionrd  r   r{  r/   r  )r.   rz  r{  r   rM   r   r|  r  r   r   r}  r~  r   r  r  transformer_outputsr/   slice_indices	lm_logitsr  r6  s                        r0   r1   FalconForCausalLM.forward!  s3   H &1%<k$++B]B]"..+)%'/!5#) / 
 ,A.8B>SV8W8W~ot4]kLLq-/B!CD	%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r3   pastbeam_idxc           	         ^ U VVs0 s H1  o3  H(  oDR                   UR                  UR                   5      _M*     M3     snnm[        U4S jU 5       5      nU$ s  snnf )a$  
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.

Output shares the same memory storage as `past`.
c              3      >#    U  HO  nUS    R                  S TUS    R                     5      US   R                  S TUS    R                     5      4v   MQ     g7f)r   r    N)index_selectrh   )r  r   device_to_beam_idxs     r0   r  3FalconForCausalLM._reorder_cache.<locals>.<genexpr>}  sf      

 #
 1**1.@AAUAU.VW1**1.@AAUAU.VW #s   AA)rh   rt   r  )r.   r  r  r   
past_statereordered_pastr  s         @r0   _reorder_cache FalconForCausalLM._reorder_cachen  sn     QU
PT*gqYcx{{:+<+<==gqPT
  

 #
 
 
s   8A)r  r:  )NNNNNNNNNNNNr   )r5   r6   r7   r8   _tied_weights_keysr!   r^   r  r9   r:   r  r   r   r   r   r   r   r   r  r   r1   r  r;   r   r   s   @r0   r  r    s
    ++| &ELL &  15ae1537,004)-$(,0/3&*5934J
E,,-J
 "%uU5<<;U5VX[5[/\(\"]^J
 !.	J

 u//0J
 ELL)J
  -J
 &J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 c5<</0J
  
uU\\"$EE	F!J
 J
X%ell :;S@AMRM]M]	uU\\5<</0#5	6 r3   r  a  
    The Falcon Model transformer with a sequence classification head on top (linear layer).

    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                     ^  \ rS rSrS\4U 4S jjr\          SS\\R                     S\\
\
\R                  \R                  4   S4      S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\
\R                     \4   4S jj5       rSrU =r$ )FalconForSequenceClassificationi  rU   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
r]   r^   
num_labelsr`  r:  r   rB  r   scorerm  r  s     r0   r^   (FalconForSequenceClassification.__init__  sV      ++&v.YYv1163D3D5Q
 	r3   rz  r{  .r   r   r|  r  r   r   r}  r~  r(   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGbg  U R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOeU" UU5      nO[U R                   R"                  S:X  a  [1        5       nU" UU5      nO-U R                   R"                  S:X  a  [3        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [5        UUUR6                  UR8                  UR:                  S9$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr{  r   r   r|  r   r   r}  r~  r   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r=   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  
regressionsingle_label_classificationmulti_label_classificationr  )rU   r  r:  r  rA   pad_token_idr   rt   rh   r9   r   r   argmaxr   r   ri   r5   problem_typer  rq   r  r  r   squeezer	   r   r   r{  r/   r  )r.   rz  r{  r   r   r|  r  r   r   r}  r~  r  r/   r  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr6  s                         r0   r1   'FalconForSequenceClassification.forward  s   @ &1%<k$++B]B]"..+)'/!5# / 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r3   )r  r  r:  
NNNNNNNNNN)r5   r6   r7   r8   r!   r^   r   r   r9   r   r   r:   r   r   r   r1   r;   r   r   s   @r0   r  r    s0   |   15SW15,004)-$(,0/3&*g
E,,-g
 "%ellELL.H(I3(N"OPg
 !.	g

 ELL)g
  -g
 &g
 D>g
 $D>g
 'tng
 d^g
 
uU\\"$DD	Eg
 g
r3   r  c                     ^  \ rS rSrS\4U 4S jjr\          SS\\R                     S\\
\
\R                  \R                  4   S4      S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\
\R                     \4   4S jj5       rSrU =r$ )FalconForTokenClassificationi
  rU   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        USS 5      b  UR                  nO[        USS 5      b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropoutr   g?)r]   r^   r  r`  r:  rO  r  r   r   r   r   rB  r   
classifierrm  )r.   rU   r  ri   s      r0   r^   %FalconForTokenClassification.__init__  s      ++&v.6/6B!'!:!:V-t4@!'!6!6!$zz"45))F$6$68I8IJ 	r3   rz  r{  .r   r   r|  r  r   r   r}  r~  r(   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUbQ  UR
                  u  nn[        5       nU" UR                  UU-  U R                  5      UR                  UU-  5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )r  Nr  r   r>   )r  r  r/   r  )rU   r  r:  r   r  rA   r	   r   r  r   r/   r  )r.   rz  r{  r   r   r|  r  r   r   r}  r~  r  r/   r  r  r   r   r  r6  s                      r0   r1   $FalconForTokenClassification.forward  s   @ &1%<k$++B]B]"..+)'/!5# / 

 ,A.]3/%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r3   )r  r   r  r:  r  )r5   r6   r7   r8   r!   r^   r   r   r9   r   r   r:   r   r   r   r1   r;   r   r   s   @r0   r  r  
  s0   | "  15SW15,004)-$(,0/3&*B
E,,-B
 "%ellELL.H(I3(N"OPB
 !.	B

 ELL)B
  -B
 &B
 D>B
 $D>B
 'tnB
 d^B
 
uU\\"$99	:B
 B
r3   r  c                   2  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\
   S
\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )FalconForQuestionAnsweringic  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )Nr>   )	r]   r^   r`  r:  r   rB  r   
qa_outputsrm  r  s     r0   r^   #FalconForQuestionAnswering.__init__e  sA     &v.))F$6$6: 	r3   rz  r   r   r|  start_positionsend_positionsr   r}  r~  r(   c
           
         U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9n
U
S   nU R                  U5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU	(       d  X4U
SS -   nUb  U4U-   $ U$ [        UUUU
R                  U
R                  S	9$ )
r  N)r   r   r|  r   r}  r~  r   r    r=   r?   )ignore_indexr>   )r  start_logits
end_logitsr/   r  )rU   r  r:  r  splitr  r   lensizeclampr	   r   r/   r  )r.   rz  r   r   r|  r  r  r   r}  r~  r4  sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr6  s                        r0   r1   "FalconForQuestionAnswering.forwardm  s   4 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r3   )r  r:  )	NNNNNNNNN)r5   r6   r7   r8   r^   r   r   r9   r   FloatTensorr   r   r   r   r1   r;   r   r   s   @r0   r  r  c  s      156:15596:48,0/3&*G
E,,-G
 !!2!23G
 E--.	G

   1 12G
 "%"2"23G
   0 01G
 $D>G
 'tnG
 d^G
 
u22	3G
 G
r3   r  )r  r`  r9  r  r  r  )Nr    )Mr  r   typingr   r   r   r   r9   torch.utils.checkpointr   torch.nnr   r	   r
   r   r   r   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   configuration_falconr!   configuration_utilsr"   r#   
get_loggerr5   r   rB  r%   rF   rQ   r]  rS   r:   r  rq   r   rr   r   r   r   r  r  r$  r   r9  r`  r  r  r  r  __all__r4   r3   r0   <module>r(     sF     8 8    L L $ ) ; ; ) i  L - / 7J			H	%
)299 )(8<BII <DJu|| J JEKK J\a\h\h J:5<< 5<< u PT Y^YeYe &m/bii m/`e5O e5P		 " . _ _D 'O ' 'T P' P Pf
 
t- t
tn r
&; r
r
j U
#8 U
 U
p Q
!6 Q
 Q
hr3   