
    fTh|                        S r SSKJrJrJr  SSKrSSKrSSKJr  SSKJ	r	  SSK
JrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJr  SSKJr  \" 5       (       a  SSKJr  SSKJr  \R@                  " \!5      r"S\#S\#S\RH                  4S jr%S\RH                  S\RH                  4S jr&S\RH                  S\RH                  S\RH                  S\RH                  4S jr' " S S\RP                  5      r) " S S\RP                  5      r* " S S \RP                  5      r+\ " S! S"\5      5       r,\ " S# S$\,5      5       r-\" S%S&9 " S' S(\,\5      5       r./ S)Qr/g)*zPyTorch CodeGen model.    )OptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringis_torch_flex_attn_availablelogging   )CodeGenConfig)	BlockMask)make_flex_block_causal_masknum_posdimreturnc           	         SS[         R                  " SUS[         R                  S9U-  -  -  n[         R                  " S[         R                  " U [         R                  S9R	                  5       U5      R	                  5       n[         R
                  " [         R                  " U5      [         R                  " U5      4SS9$ )	N      ?i'  r      dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positionsr+   /   s    eQQekk JS PQRH<<WEKK0X0^0^0`bjkqqsL99eii-uyy/FGQOO    xc                     U S S 2S S 2S S 2S S S24   nU S S 2S S 2S S 2SS S24   n[         R                  " U* U4SS9n U R                  S5      $ )Nr   r   r   )r    stackflatten)r-   x1x2s      r*   rotate_every_twor5   6   sS    	
1aCaC<B	
1aADqD=	BbS"I2&A99R=r,   tensorr&   r'   c                     [         R                  " US S 2S S 2S S S 24   SS5      n[         R                  " US S 2S S 2S S S 24   SS5      nX-  [        U 5      U-  -   $ )Nr   r   )r    repeat_interleaver5   )r6   r&   r'   s      r*   apply_rotary_pos_embr9   >   s\    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CL-f5;<<r,   c                     ^  \ rS rSrSU 4S jjrS rS r  SS jr       SS\\	R                     S\\   S\\	R                     S	\\	R                     S
\\	R                     S\\   S\\   S\\	R                     S\\\	R                   \\	R                      4   \\\	R                   \\	R                      \\	R                   S4   4      4   4S jjrSrU =r$ )CodeGenAttentionD   c                 f  > [         TU ]  5         UR                  n[        R                  " UR
                  5      U l        [        R                  " UR                  5      U l        X l	        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                   U l        U R                  U R                   -  U l        U R"                  U R                   -  U R                  :w  a&  [%        SU R                   SU R                    S35      e[&        R(                  " [&        R*                  " U R"                  [&        R,                  S95      R/                  [&        R0                  " 5       5      U l        [        R4                  " U R                  U R                  S-  SS	9U l        [        R4                  " U R                  U R                  SS	9U l        UR:                  U l        U R:                  =(       d    U R                  n[=        X45      U l        g )
NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   r   F)bias) super__init__max_position_embeddingsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrorr    sqrtr6   float32toget_default_dtype
scale_attnLinearqkv_projout_proj
rotary_dimr+   embed_positions)selfconfigrG   max_positionspos_embd_dimrJ   s        r*   r@   CodeGenAttention.__init__E   s   66JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH   **U\\$--u}}%UVYYZ_ZqZqZst		$..$..12D5Q		$..$..uM ++8$..:=Wr,   c                     UR                  UR                  S S X$-  U4-   5      nUR                  UR                  S S S-   UR                  SS  -   5      nU$ )Nr/   r0   )r/   )reshapeshape)r[   r-   n_headdim_headmp_numreshapeds         r*   _split_headsCodeGenAttention._split_headsc   s[    99QWWSb\V-=x,HHI##AGGCRL5$88>>"#;N$NOr,   c                    [        UR                  5      S:X  a$  UR                  SSSSS5      R                  5       nO][        UR                  5      S:X  a#  UR                  SSSS5      R                  5       nO![	        S[        UR                  5       35      eUR                  5       SS	 X#-  4-   nUR                  U5      $ )
z=
Merges attn_head_size dim and num_attn_heads dim into n_ctx
   r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr0   )lenrb   permute
contiguousrP   sizeview)r[   r6   rN   attn_head_size	new_shapes        r*   _merge_headsCodeGenAttention._merge_headsh   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r,   c                    UR                  [        R                  5      nUR                  [        R                  5      n[        R                  " XR	                  SS5      5      nUb"  US S 2S S 2S S 2S UR
                  S   24   nXg-  nX`R                  -  n[        R                  " SS9" U5      nUR                  UR                  5      nU R                  U5      nUb  Xe-  n[        R                  " Xc5      nX4$ )Nr/   r0   r   )rS   r    rR   matmul	transposerb   rU   r   Softmaxr   rD   )	r[   querykeyvalueattention_mask	head_maskattn_weightscausal_maskattn_outputs	            r*   _attnCodeGenAttention._attnu   s     'ffU]]#||E==R+@A%(Aq/CIIbM/)ABK'L#oo5zzb),7#u{{3((6  '3Lll<7((r,   hidden_states
layer_pastr|   position_idsr}   	use_cacheoutput_attentionscache_positionr   .c	                 L   U R                  U5      n	Sn
U	R                  U	R                  S S U
S4-   5      nU R                  U R                  -  U
-  n[
        R                  " XSS9u  pnU R                  XR                  U R                  U
S9nU R                  XR                  U R                  U
S9nU R                  XR                  U R                  U
S9nUR                  SSSS5      nU R                  nUR                  UR                  :w  a"  UR                  UR                  5      nUU l	        UU   n[
        R                  " UUR                  S   S-  SS9u  nnU R                  b  US S 2S S 2S S 2S U R                  24   nUS S 2S S 2S S 2U R                  S 24   nUS S 2S S 2S S 2S U R                  24   nUS S 2S S 2S S 2U R                  S 24   n[        UUU5      n[        UUU5      n[
        R                  " UU/SS9n[
        R                  " UU/SS9nO[        UUU5      n[        UUU5      nUR                  SSSS5      nUR                  SSSS5      nUbI  UUU R                  US	.nUR                  UR                  UR                   5      XR"                  U5      u  pU R%                  XXU5      u  nnU R'                  UU R                  U R                  5      nU R)                  U5      nU R+                  U5      nUU4nU(       a  UU4-  nU$ )
Nrk   r/   r   )re   r   r   r   r   )r&   r'   partial_rotation_sizer   )rW   ra   rb   rO   rN   r    splitrg   rm   rZ   devicerS   rY   r9   r%   updater   rG   r   rs   rX   rF   )r[   r   r   r|   r   r}   r   r   r   qkvre   	qkv_split	local_dimry   r{   rz   rZ   sincosr&   r'   k_rotk_passq_rotq_passcache_kwargsr   r~   outputss                               r*   forwardCodeGenAttention.forward   s
    mmM*KK		#2&" =>	MMD$<$<<F	!KK	"Ec!!%)A)A4==Y_!`%=%=t}}U[\!!%)A)A4==Y_!`aAq)..!!\%8%88-001D1DEO#2D  .;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9Ekk!Q1%aAq) !)-"0	L $**366-2E2E+F~~_klJC %)JJu5R[$\!\''T5M5Mt}}]mmK0((5
+&Gr,   )rD   rM   rZ   rO   rG   rN   rX   rW   rF   rY   rU   N)NNNNNNFFN)rK   
__module____qualname____firstlineno__r@   rg   rs   r   r   r    FloatTensorr	   
LongTensorboolr   r   Tensorr   __static_attributes____classcell__rJ   s   @r*   r;   r;   D   s<   X<
&$ )D '+6:3715$),159L 1 12L UOL !!2!23	L
 u//0L E--.L D>L $D>L !!1!12L 
ellE%,,//0u||U5<<%8%c@Q:RRST	V
L Lr,   r;   c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )
CodeGenMLP   c                    > [         TU ]  5         UR                  n[        R                  " X15      U l        [        R                  " X5      U l        [        UR                     U l	        [        R                  " UR                  5      U l        g r   )r?   r@   n_embdr   rV   fc_infc_outr   activation_functionactrB   rE   dropout)r[   intermediate_sizer\   rM   rJ   s       r*   r@   CodeGenMLP.__init__   s`    MM	YYy<
ii 1=&445zz&"4"45r,   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   )r[   r   s     r*   r   CodeGenMLP.forward   s@    

=1/M2]3r,   )r   r   r   r   )rK   r   r   r   r@   r   r    r   r   r   r   r   s   @r*   r   r      s1    6Xe.?.?%@ UEVEV  r,   r   c                   f  ^  \ rS rSrSU 4S jjr       SS\\R                     S\\   S\\R                     S\\R                     S\\R                     S\\
   S	\\
   S
\\R                     S\\\R                     \\\R                  \\R                  S4   4      4   4S jjrSrU =r$ )CodeGenBlock   c                   > [         TU ]  5         UR                  b  UR                  OSUR                  -  n[        R
                  " UR                  UR                  S9U l        [        X5      U l	        [        X15      U l        g )Nrk   eps)r?   r@   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r;   attnr   mlp)r[   r\   rG   	inner_dimrJ   s       r*   r@   CodeGenBlock.__init__   s_    &,nn&@FNNa&--FW	LLF4M4MN	$V7	i0r,   r   r   r|   r   r}   r   r   r   r   .c	                     Un	U R                  U5      nU R                  UUUUUUUUS9n
U
S   nU
SS  nU R                  U5      nX-   U	-   nU(       a  U4U-   nU$ U4USS  -   nU$ )Nr   r   r|   r   r}   r   r   r   r   r   )r   r   r   )r[   r   r   r|   r   r}   r   r   r   residualattn_outputsr   r   feed_forward_hidden_statess                 r*   r   CodeGenBlock.forward  s     !		-0yy'!)%/) ! 	
 #1oqr"%)XXm%<"#@8K$&0G  %&4Gr,   )r   r   r   r   r   )rK   r   r   r   r@   r   r    r   r	   r   r   r   r   r   r   r   r   r   s   @r*   r   r      s    1 '+6:3715$),159" 1 12" UO" !!2!23	"
 u//0" E--." D>" $D>" !!1!12" 
uU\\"HU5<<uGXGXZ]G]A^3^-_$``	a" "r,   r   c                   P   ^  \ rS rSr\rSrSrS/rSr	Sr
SrSrU 4S jrS rSrU =r$ )	CodeGenPreTrainedModeli&  transformerTr   past_key_valuesc                 &   > [         TU ]  " U0 UD6  g r   )r?   r@   )r[   inputskwargsrJ   s      r*   r@   CodeGenPreTrainedModel.__init__1  s    &+F+r,   c                    [        U[        R                  45      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weights.        )meanstdNr   )
isinstancer   rV   weightdatanormal_r\   initializer_ranger>   zero_	Embeddingpadding_idxr   fill_)r[   modules     r*   _init_weights$CodeGenPreTrainedModel._init_weights4  s   fryyl++ MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r,    )rK   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_quantized_cache_supports_static_cacher@   r   r   r   r   s   @r*   r   r   &  sD     L%&*#'("3  $!,* *r,   r   c                   r  ^  \ rS rSrU 4S jrS rS r\            SS\\	R                     S\\\\\\	R                        4      S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       r SS\\	R                  S4   S\	R                  S\	R                  S\S\4
S jjr\S\	R                  S\S\S\	R,                  S\	R                  S\4S j5       rSrU =r$ )CodeGenModeliE  c           
        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " UR                  5      U l
        [
        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [
        R                   " U R                  UR"                  S9U l        ['        UR(                  UR*                  UR,                  -  5      U l        SU l        U R1                  5         g s  snf )N)rG   r   F)r?   r@   r   rM   
vocab_sizer   r   wterB   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fminrY   n_ctxrN   gradient_checkpointing	post_init)r[   r\   irJ   s      r*   r@   CodeGenModel.__init__G  s      ++<< 1 14>>BJJv001	5QWQ_Q_K`aK`aV AK`abLLV5N5NO	f//A[A[1[\&+# 	  bs   /Ec                     U R                   $ r   r   r[   s    r*   get_input_embeddings!CodeGenModel.get_input_embeddingsW  s    xxr,   c                     Xl         g r   r   r[   new_embeddingss     r*   set_input_embeddings!CodeGenModel.set_input_embeddingsZ  s    !r,   	input_idsr   r|   token_type_idsr   r}   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nSnU(       aP  [        U[        5      (       d;  SnUc  [        5       nO+[        R                  " U5      n[        R                  S5        UR                  S   nUc8  Ub  UR!                  5       OSn["        R$                  " UUU-   UR&                  S	9nUc  UR)                  S5      nU R+                  X7XU	5      nU R-                  X`R                   R.                  5      nUnUb(  UR1                  S
U5      nU R                  U5      nUU-   nU R3                  U5      nS
UUR5                  S
5      4nSnU	(       a  SOSnU
(       a  SOSn[7        U R8                  5       H  u  nnU
(       a  UU4-   nU R                  (       a8  U R                  (       a'  U R;                  UR<                  USUUUU   UU	U5	      nOU" UUUUUU   UU	US9nUS   nUSL a  US   nU	(       d  M  UUU(       a  SOS   4-   nM     U R?                  U5      nUR1                  U5      nU
(       a  UU4-   nU(       a  UOSnU(       a  URA                  5       nU(       d  [C        S UUUU4 5       5      $ [E        UUUUS9$ )au  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   r   r/   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   ).0vs     r*   	<genexpr>'CodeGenModel.forward.<locals>.<genexpr>  s      ^a^s   	)last_hidden_stater   r   
attentions)#r\   r   r  r   use_return_dictrP   r   trainingrH   rI   r   r   r	   r
   from_legacy_cacherb   get_seq_lengthr    r!   r   	unsqueeze_update_causal_maskget_head_maskr   rp   r   ro   	enumerater   _gradient_checkpointing_func__call__r   to_legacy_cachetupler   )r[   r  r   r|   r  r   r}   r  r   r   r  r	  r   r   return_legacy_cache
seq_lengthpast_seen_tokensr   r   token_type_embedsoutput_shapenext_decoder_cacheall_self_attentionsall_hidden_statesr   blockr   
next_caches                               r*   r   CodeGenModel.forward]  s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M $Z??"&&".."."@"@"Q##^ #((+
!CRC^==?de"\\*:<Lz<YbobvbvwN)33A6L..>L]
 &&y++2E2EF	%%+00Z@N $ 8),==M		-0J(:(:2(>?!$5b4"6BD!$&&)HAu#$58H$H!**t}};;NN! aL%"
  "/.#.!-'l'&7#1	 $AJMD %,QZ"  &9W)QYZ=[<]&]#C *F 		-0%**<8 1]4D D+4'$
#335J ):7HJ]^   '+&+*	
 	
r,   r   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fsdpa)r  past_key_values_lengthis_trainingr   r/   )sequence_lengthtarget_lengthr   r   
batch_size)cudaxpunpu)r\   _attn_implementationanyr   r    r   r   r  is_compileabler   _ignore_causal_mask_sdpar  r   rb   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typefinfor   _unmask_unattended)r[   r|   r*  r   r   r   r!  using_compilable_cacher   r1  r2  r   	min_dtypes                r*   r   CodeGenModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr,   r1  r2  r   r3  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nrk   )
fill_valuer   r   r   )diagonalr  r/   r   )r   r    r>  r   fullr   triur!   ra   expandclonerb   rS   masked_fill)r|   r1  r2  r   r   r3  r   r   rA  mask_lengthpadding_masks              r*   r<  BCodeGenModel._prepare_4d_causal_attention_mask_with_cache_position3  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r,   )r   rM   r   r   r   rY   r   r   )NNNNNNNNNNNN)F)rK   r   r   r   r@   r   r  r   r   r    r   r   r	   r   r   r   r   r   r   r  staticmethodintr   r<  r   r   r   s   @r*   r   r   E  s    "  15NR6:59371559$(,0/3&*59N
E,,-N
 "%uU5<<5H/I(I"JKN
 !!2!23	N

 !!1!12N
 u//0N
 E--.N
   1 12N
 D>N
 $D>N
 'tnN
 d^N
 !!1!12N
 
u--	.N
 N
n #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r,   r   zM
    The CodeGen Model transformer with a language modeling head on top.
    )custom_introc                    :  ^  \ rS rSrS/rU 4S jrS rS r\             SS\	\
R                     S\	\\\\\
R                        4      S\	\
R                      S	\	\
R                     S
\	\
R                     S\	\
R                      S\	\
R                      S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jj5       r\S\\\
R                        S\
R                  S\\\
R                        4S j5       rSrU =r$ )CodeGenForCausalLMil  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        U R                  5         g r   )
r?   r@   r   r   r   rV   r   r   lm_headr   )r[   r\   rJ   s     r*   r@   CodeGenForCausalLM.__init__t  sE     '/yy0A0AB 	r,   c                     U R                   $ r   rT  r   s    r*   get_output_embeddings(CodeGenForCausalLM.get_output_embeddings|  s    ||r,   c                     Xl         g r   rW  r  s     r*   set_output_embeddings(CodeGenForCausalLM.set_output_embeddings  s    %r,   r  r   r|   r  r   r}   r  labelsr   r   r  r	  r   r   c                 (   Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUUS9nUS   nU R                  U5      R	                  [
        R                  5      nSnUb`  UR	                  UR                  5      nU R                  " UU4SU R                   R                  0UD6nUR	                  UR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)r   r|   r  r   r}   r  r   r   r  r	  r   r   r   r   )losslogitsr   r   r  )r\   r  r   rT  rS   r    rR   r   loss_functionr   r   r   r   r   r  )r[   r  r   r|   r  r   r}   r  r]  r   r   r  r	  r   r   transformer_outputsr   	lm_logitsr_  outputs                       r*   r   CodeGenForCausalLM.forward  sD   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.
 LL/225==A	YYy//0F%%  ;;11 	D 77=../D\$7$;;F)-)9TGf$EvE%/??-;;*55
 	
r,   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
[`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectrS   r   )r  
past_staterf  s     r*   r  >CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nr  )r  r   rf  s     r*   r  4CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>  s'      
-
 j_ijjj-s   "%rm  )r   rf  s    `r*   _reorder_cache!CodeGenForCausalLM._reorder_cache  s      
-
 
 	
r,   )rT  r   )NNNNNNNNNNNNN)rK   r   r   r   _tied_weights_keysr@   rX  r[  r   r   r    r   r   r	   r   r   r   r   r   r   rN  ro  r   r   r   s   @r*   rR  rR  l  s    ++&  15NR6:59371559-1$(,0/3&*59J
E,,-J
 "%uU5<<5H/I(I"JKJ
 !!2!23	J

 !!1!12J
 u//0J
 E--.J
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
  
u,,	-!J
 J
X 
uU\\23
?D||
	uU\\"	#
 
r,   rR  )rR  r   r   )0__doc__typingr   r   r   r    torch.utils.checkpointr   activationsr   cache_utilsr	   r
   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   configuration_codegenr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrK   rH   rO  r   r+   r5   r9   Moduler;   r   r   r   r   rR  __all__r   r,   r*   <module>r     sq    ) )    ! . ) > O - 
 1  !!;J 
		H	%P P3 P5<< P  = =ELL =u|| =X]XdXd =\ryy \@ (+299 +\ *_ * *< c) c cL	 
j
/ j

j
Z Kr,   