"""PyTorch GPT-J model."""

import warnings
from typing import Optional, Tuple, Union

import torch
import torch.fx
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    logging,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_gptj import GPTJConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask

if is_flash_attn_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


@torch.fx.wrap
def get_embed_positions(embed_positions, position_ids):
    return embed_positions.to(position_ids.device).repeat(position_ids.shape[0], 1, 1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')


def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)
1aADqD=	BbS"I2&A99R=r8   tensorr2   r3   c                     [         R                  " US S 2S S 2S S S 24   SS5      n[         R                  " US S 2S S 2S S S 24   SS5      nX-  [        U 5      U-  -   $ )Nr(   r
   )r,   repeat_interleaverJ   )rK   r2   r3   s      r6   apply_rotary_pos_embrN   P   s\    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CL-f5;<<r8   c                     ^  \ rS rSrSU 4S jjrS rS r  SS jrS r       SS\	R                  S\\   S	\\	R                     S
\\	R                     S\\	R                     S\\   S\\   S\\	R                     S\\\	R"                  \\	R"                     4   \\\	R"                  \\	R"                     \\	R"                  S4   4      4   4S jjrSrU =r$ )GPTJAttentionV   c                 6  > [         TU ]  5         Xl        UR                  n[        R
                  " UR                  5      U l        [        R
                  " UR                  5      U l	        SU l
        X l        Uc-  [        R                  SU R                  R                   S35        UR                   U l        UR$                  U l        U R"                  U R$                  -  U l        U R&                  U R$                  -  U R"                  :w  a&  [)        SU R"                   SU R$                   S35      e[*        R,                  " [*        R.                  " U R&                  [*        R0                  S95      R3                  [*        R4                  " 5       5      U l        [        R8                  " U R"                  U R"                  SS	9U l        [        R8                  " U R"                  U R"                  SS	9U l        [        R8                  " U R"                  U R"                  SS	9U l        [        R8                  " U R"                  U R"                  SS	9U l         URB                  U l!        U RB                  =(       d    U R"                  n[E        X45      U l#        g )
NTzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r)   Fbias)$super__init__configmax_position_embeddingsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	is_causal	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrorr,   sqrtrK   float32r;   get_default_dtype
scale_attnLineark_projv_projq_projout_proj
rotary_dimr7   r?   )selfrW   r_   max_positionspos_embd_dimrb   s        r6   rV   GPTJAttention.__init__W   s   66JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH   **U\\$--u}}%UVYYZ_ZqZqZstiiUKiiUKiiUK		$..$..uM ++8$..:=Wr8   c                 \   UR                  5       SS X#4-   nUR                  U5      nU(       a  U$ [        UR                  5      S:X  a  UR	                  SSSSS5      $ [        UR                  5      S:X  a  UR	                  SSSS5      $ [        S	[        UR                  5       35      e)
z?
Splits hidden dim into attn_head_size and num_attention_heads
NrD      r   r   r
   r(      3Input tensor rank should be one of [4, 5], but is: )sizeviewlenr>   permuterh   )rs   rK   rf   attn_head_sizerotary	new_shapes         r6   _split_headsGPTJAttention._split_headsz   s     KKM#2&*=)NN	Y'Mv||!>>!Q1a00!#>>!Q1--RSVW]WcWcSdRefggr8   c                    [        UR                  5      S:X  a$  UR                  SSSSS5      R                  5       nO][        UR                  5      S:X  a#  UR                  SSSS5      R                  5       nO![	        S[        UR                  5       35      eUR                  5       SS	 X#-  4-   nUR                  U5      $ )
zB
Merges attn_head_size dim and num_attn_heads dim into hidden dim
rx   r   r   r
   r(   ry   rz   NrE   )r}   r>   r~   
contiguousrh   r{   r|   )rs   rK   rf   r   r   s        r6   _merge_headsGPTJAttention._merge_heads   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r8   c                    UR                  [        R                  5      nUR                  [        R                  5      n[        R                  " XR	                  SS5      5      nX`R
                  -  nUb"  US S 2S S 2S S 2S UR                  S   24   nXg-   n[        R                  R                  USS9nUR                  UR                  5      nU R                  U5      nUb  Xe-  n[        R                  " Xc5      nX4$ )NrD   rE   r+   )r;   r,   rj   matmul	transposerl   r>   r   
functionalsoftmaxr*   r[   )	rs   querykeyvalueattention_mask	head_maskattn_weightscausal_maskattn_outputs	            r6   _attnGPTJAttention._attn   s     'ffU]]#||E==R+@A#oo5%(Aq/CIIbM/)ABK'5L}},,\r,B#u{{3((6  '3Lll<7((r8   c                     U R                   nUR                  UR                  :w  a!  UR                  UR                  5      nX l         UR                  UR                  S   SS5      $ r:   )r?   r<   r;   r=   r>   )rs   r@   r?   s      r6   _get_embed_positions"GPTJAttention._get_embed_positions   s]    ..!!\%8%88-001D1DEO#2 %%l&8&8&;QBBr8   hidden_states
layer_pastr   r@   r   	use_cacheoutput_attentionscache_positionr%   .c	                 4   U R                  U5      n	U R                  U5      n
U R                  U5      nU R                  XR                  U R
                  S5      n	U R                  XR                  U R
                  S5      n
U R                  XR                  U R
                  S5      n[        U5      (       d#  [        R                  R                  5       (       a  [        U R                  U5      nOU R                  U5      nUR                  S5      R                  SSUR                  S   5      n[        R                   " USU5      n[        R"                  " XR                  S   S-  SS9u  nnU R$                  b  U
S S 2S S 2S S 2S U R$                  24   nU
S S 2S S 2S S 2U R$                  S 24   nU	S S 2S S 2S S 2S U R$                  24   nU	S S 2S S 2S S 2U R$                  S 24   n['        UUU5      n['        UUU5      n[        R(                  " UU/SS9n
[        R(                  " UU/SS9n	O['        XU5      n
['        XU5      n	U
R+                  SSSS5      n
U	R+                  SSSS5      n	Ub0  UUU R$                  US	.nUR-                  XU R.                  U5      u  pU R1                  XXU5      u  nnU R3                  UU R                  U R
                  5      nU R5                  U5      nU R7                  U5      nUU4nU(       a  UU4-  nU$ )
NTFrD   r   r(   r+   r   r
   r2   r3   partial_rotation_sizer   )rp   rn   ro   r   rf   rg   r   r,   jit
is_tracingrA   r?   r   	unsqueezer=   r>   gathersplitrr   rN   r1   r~   updater_   r   r   rq   r]   )rs   r   r   r   r@   r   r   r   r   r   r   r   r?   repeated_position_idssincosr2   r3   k_rotk_passq_rotq_passcache_kwargsr   r   outputss                            r6   forwardGPTJAttention.forward   s    M*kk-(M*!!%)A)A4==RVW%=%=t}}dS!!%)A)A4==RWX\**eii.B.B.D.D 2$2F2FUO"77EO , 6 6r : A A!QH]H]^`Ha boq2GH;;v||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&s5C(S9Ekk!Q1%aAq)!)-"0	L $**3t~~|TJC %)JJu5R[$\!\''T5M5Mt}}]mmK0((5
+&Gr8   )r[   rW   re   r?   rg   r^   rn   r_   rf   rq   rp   r]   rr   rl   ro   N)NNNNNNFFN)rc   
__module____qualname____firstlineno__rV   r   r   r   r   r,   FloatTensorr   r   
LongTensorboolr   r   Tensorr   __static_attributes____classcell__rb   s   @r6   rP   rP   V   s>   !XFh&$ )>C '+6:3715$),159H((H UOH !!2!23	H
 u//0H E--.H D>H $D>H !!1!12H 
ellE%,,//0u||U5<<%8%c@Q:RRST	V
H Hr8   rP   c                     ^  \ rS rSrSrU 4S jr       SS\R                  S\\	   S\\R                     S\\R                     S\\R                     S	\\   S
\\   S\\R                     S\\\R                  \\R                     4   \\\R                  \\R                     \\R                  S4   4      4   4S jjrSrU =r$ )GPTJFlashAttention2i  a4  
    GPTJ flash attention module. This module inherits from `GPTJAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_attention_heads, self.head_dim, True)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, True)
        value = self._split_heads(value, self.num_attention_heads, self.head_dim, False)

        if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing():
            embed_positions = get_embed_positions(self.embed_positions, position_ids)
        else:
            embed_positions = self._get_embed_positions(position_ids)

        repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1])
        sincos = torch.gather(embed_positions, 1, repeated_position_ids)
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)

        # The Flash attention kernel expects the layout (batch, seq_length, num_heads, head_dim),
        # so permute back from the cache layout
        query = query.permute(0, 2, 1, 3).contiguous()
        key = key.permute(0, 2, 1, 3).contiguous()
        value = value.permute(0, 2, 1, 3).contiguous()

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
        # therefore the input hidden states get silently casted in float32. Hence, we need
        # to cast them back to the correct dtype just to be sure everything works as expected.
        input_dtype = query.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query = query.to(target_dtype)
            key = key.to(target_dtype)
            value = value.to(target_dtype)

        attention_dropout = self.config.attn_pdrop if self.training else 0.0  # attn_pdrop in gptj

        query_length = query.shape[1]

        # Compute attention
        attn_weights = _flash_attention_forward(
            query,
            key,
            value,
            attention_mask,
            query_length,
            dropout=attention_dropout,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        # Reshape outputs
        attn_output = attn_weights.reshape(
            attn_weights.shape[0], attn_weights.shape[1], attn_weights.shape[2] * attn_weights.shape[3]
        )
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


GPTJ_ATTENTION_CLASSES = {
    "eager": GPTJAttention,
    "flash_attention_2": GPTJFlashAttention2,
}


class GPTJMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GPTJBlock(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = GPTJ_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        self.mlp = GPTJMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)


@auto_docstring
class GPTJPreTrainedModel(PreTrainedModel):
    config_class = GPTJConfig
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _no_split_modules = ["GPTJBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = False
    _supports_param_buffer_assignment = False

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is subject to change at a moment's notice. Uses a device map to distribute
    attention modules of the model across several devices. If no device map is given, it will evenly distribute blocks
    across all devices.

    Args:
        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the
            following number of attention modules:

                - gpt-j-6B: 28

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6],
        1: [7, 8, 9, 10, 11, 12, 13],
        2: [14, 15, 16, 17, 18, 19, 20],
        3: [21, 22, 23, 24, 25, 26, 27],
    }
    model.parallelize(device_map)
    ```
"""

DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to CPU from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with gpt-j-6B:
    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6],
        1: [7, 8, 9, 10, 11, 12, 13],
        2: [14, 15, 16, 17, 18, 19, 20],
        3: [21, 22, 23, 24, 25, 26, 27],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and clean memory by calling torch.cuda.empty_cache()
    ```
c                     ^  \ rS rSrU 4S jr\" \5      SS j5       r\" \5      S 5       r	S r
S r\            SS\\R                     S\\\\\\R&                        4      S	\\R(                     S
\\R                     S\\R                     S\\R(                     S\\R(                     S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jj5       r S S	\\R&                  S4   S\R&                  S\R&                  S\S\4
S jjr\S	\R&                  S\S\S\R6                  S\R&                  S\4S j5       rSrU =r$ )!	GPTJModeli0  c           
      v  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " UR                  5      U l
        [
        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [
        R                   " U R                  UR"                  S9U l        SU l        S U l        SU l        U R-                  5         UR.                  S:H  U l        g s  snf )N)r_   r   Fr   )rU   rV   r   re   
vocab_sizer   r	  wterY   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fmodel_parallel
device_mapgradient_checkpointing	post_initr   _use_flash_attention_2)rs   rW   irb   s      r6   rV   GPTJModel.__init__2  s      ++<< 1 14>>BJJv001	fnnH]^H]1	& >H]^_LLV5N5NO	 $&+# 	&,&A&AEX&X#  _s   /D6c                    [         R                  " S[        5        UcD  [        [	        U R
                  5      [        [        R                  R                  5       5      5      OUU l
        [        U R                  [	        U R
                  5      5        SU l        SU R                  R                  5       ;   a  SO.S[        [        U R                  R                  5       5      5      -   U l        S[        [#        U R                  R                  5       5      5      -   U l        U R&                  R)                  U R                   5      U l        U R                  R+                  5        HG  u  p#U H<  nS[        U5      -   nU R
                  U   R)                  U5      U R
                  U'   M>     MI     U R,                  R)                  U R$                  5      U l        g )Na6  `GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1, ...}Tcpucuda:)warningswarnFutureWarningr   r}   r%  r#  r,   cudadevice_countr(  r   r'  keysstrminfirst_devicemaxlast_devicer  r;   itemsr&  )rs   r(  kvblockcuda_devices         r6   parallelizeGPTJModel.parallelizeF  sY    	
 NXM_N3tvv;ejj.E.E.G(HIeo 	 	$//3tvv;7"%*doo.B.B.D%DE'TWX[\`\k\k\p\p\rXsTtJt"ST__-A-A-C)D%EE88;;t001OO))+DA%A. $u 0 0 =u  ,
 IILL!1!12	r8   c                    [         R                  " S[        5        SU l        S U l        SU l        SU l        U R                  R                  S5      U l        [        [        U R                  5      5       H.  nU R                  U   R                  S5      U R                  U'   M0     U R                  R                  S5      U l        [        R                  R                  5         g )N\Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.Fr/  )r1  r2  r3  r'  r(  r9  r;  r  r;   r#  r}   r%  r&  r,   r4  empty_cache)rs   indexs     r6   deparallelizeGPTJModel.deparallelize`  s    j	
 $! 88;;u%3tvv;'E FF5M,,U3DFF5M (IILL'	

 r8   c                     U R                   $ r   r  rs   s    r6   get_input_embeddingsGPTJModel.get_input_embeddingsp  s    xxr8   c                     Xl         g r   rJ  rs   new_embeddingss     r6   set_input_embeddingsGPTJModel.set_input_embeddingss  s    !r8   	input_idsr   r   token_type_idsr@   r   inputs_embedsr   r   output_hidden_statesreturn_dictr   r%   c                 	   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nSnU(       aP  [        U[        5      (       d;  SnUc  [        5       nO+[        R                  " U5      n[        R                  S5        UR                  S   nUc7  Ub  UR!                  5       OSn["        R$                  " XU-   UR&                  S	9nUc  UR)                  S5      nU R+                  X7XU	5      nU R-                  X`R                   R.                  5      nUnUb(  UR1                  S
U5      nU R                  U5      nUU-   nU R3                  U5      nS
UUR5                  S
5      4nSnU	(       a  SOSnU
(       a  SOSn[7        U R8                  5       GH  u  nnU R:                  (       a  ["        R<                  R?                  UR&                  5        UbT  UR@                  RC                  UR&                  5      Ul         URD                  RC                  UR&                  5      Ul"        Ub  URC                  UR&                  5      n[        U["        RF                  5      (       a  URC                  UR&                  5      nU
(       a  UU4-   nU R                  (       a8  U R                  (       a'  U RI                  URJ                  USUUUU   UU	U5	      nOU" UUUUUU   UU	US9nUS   nUSL a  US   nU	(       a  UUU(       a  SOS   4-   nU R:                  (       d  GM  U RL                  RO                  5        HO  u  nnUUS
   :X  d  M  S[Q        U5      -   U RR                  :w  d  M/  URC                  S[Q        US-   5      -   5      nMQ     GM     U RU                  U5      nUR1                  U5      nU
(       a  UU4-   nU(       a  UOSnU(       a  URW                  5       nU(       d  [Y        S UUUU4 5       5      $ [[        UUUUS9$ )u  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   r<   rD   r  r   r(   r0  c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r  ).0r>  s     r6   	<genexpr>$GPTJModel.forward.<locals>.<genexpr>  s      ^a^s   	)last_hidden_stater   r   
attentions).rW   r   rV  r   use_return_dictrh   r)  r   r`   ra   r  r  r   r   from_legacy_cacher>   get_seq_lengthr,   r-   r<   r   _update_causal_maskget_head_maskr$  r|   r!  r{   	enumerater%  r'  r4  
set_device	key_cacher;   value_cacher   _gradient_checkpointing_func__call__r(  r<  r7  r;  r&  to_legacy_cachetupler   )rs   rS  r   r   rT  r@   r   rU  r   r   rV  rW  r   return_legacy_cache
seq_lengthpast_key_values_lengthr   r   token_type_embedsoutput_shapenext_decoder_cacheall_self_attentionsall_hidden_statesr,  r?  r   r=  r>  
next_caches                                r6   r   GPTJModel.forwardv  s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M $Z??"&&".."."@"@"Q##^ #((+
!IXId_%C%C%Ejk""\\&(KTaThThN )33A6L..>L]
 &&y++2E2EF	%%+00Z@N $ 8),==M		-0J(:(:2(>?!$5b4"6BD!$&&)HAu""

%%m&:&:; #.0?0I0I0L0L]MaMa0bO-2A2M2M2P2PQ^QeQe2fO/ *"-..1E1E"FKi66 )]-A-A BI#$58H$H!**t}};;NN! aL%"
  "/.#.!-'l'&7#1	 $AJMD %,QZ" &9W)QYZ=[<]&]# """ OO113DAqAbEzgA&6$:J:J&J(5(8(83q1u:9M(N 4g *n 		-0%**<8 1]4D D+4'$
#335J ):7HJ]^   '+&+*	
 	
r8   r    input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nr   r   flex_attentionr   Fsdpa)rU  rp  is_trainingr   rD   )sequence_lengthtarget_lengthr*   r   
batch_size)r4  xpunpu)rW   r   anyr  r,   r   r!   rc  is_compileabler   _ignore_causal_mask_sdpar   r*   r>   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr<   typefinfor8  _unmask_unattended)rs   r   rx  r   r   r   past_seen_tokensusing_compilable_cacher*   r}  r~  r   	min_dtypes                r6   rd  GPTJModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr8   r}  r~  r*   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nry   )
fill_valuer*   r<   r   )diagonalrZ  rD   r   )r$   r,   r  r8  fullr<   triur-   r   expandcloner>   r;   masked_fill)r   r}  r~  r*   r   r  r   r   r  mask_lengthpadding_masks              r6   r  ?GPTJModel._prepare_4d_causal_attention_mask_with_cache_positiona  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r8   )r+  r(  r!  re   r9  r)  r%  r;  r&  r'  r  r  r   NNNNNNNNNNNN)F)rc   r   r   r   rV   r   PARALLELIZE_DOCSTRINGrA  DEPARALLELIZE_DOCSTRINGrG  rL  rQ  r   r   r,   r   r   r   r   r   r   r   r   r   rd  staticmethodintr*   r  r   r   r   s   @r6   r  r  0  sE   Y( /03 132 12! 3!"  15NR6:59371559$(,0/3&*59c
E,,-c
 "%uU5<<5H/I(I"JKc
 !!2!23	c

 !!1!12c
 u//0c
 E--.c
   1 12c
 D>c
 $D>c
 'tnc
 d^c
 !!1!12c
 
u--	.c
 c
X #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r8   r  zK
    The GPT-J Model transformer with a language modeling head on top.
    )custom_introc                    v  ^  \ rS rSrS/rU 4S jr\" \5      SS j5       r\" \	5      S 5       r
S rS r\             SS\\R                      S	\\\\\\R(                        4      S
\\R*                     S\\R                      S\\R                      S\\R*                     S\\R*                     S\\R                      S\\   S\\   S\\   S\\   S\\R                      S\\\4   4S jj5       r\S	\\\R(                        S\R(                  S\\\R(                        4S j5       rSrU =r$ )GPTJForCausalLMi  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        SU l	        S U l
        U R                  5         g NF)rU   rV   r  r   r   rm   r   r  lm_headr'  r(  r*  rs   rW   rb   s     r6   rV   GPTJForCausalLM.__init__  sV     $V,yy0A0AB $ 	r8   c                    [         R                  " S[        5        UcN  [        [	        U R
                  R                  5      [        [        R                  R                  5       5      5      OUU l        [        U R                  [	        U R
                  R                  5      5        U R
                  R                  U R                  5        U R                  R                  U R
                  R                   5      U l        SU l        g )NaT  `GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0': 0, 'transformer.h.1': 1, ...}T)r1  r2  r3  r   r}   r   r%  r#  r,   r4  r5  r(  r   rA  r  r;   r9  r'  )rs   r(  s     r6   rA  GPTJForCausalLM.parallelize  s    - 	
 ! 3t//112E%**:Q:Q:S4TU 	
 	$//3t/?/?/A/A+BC$$T__5||t'7'7'D'DE"r8   c                 8   [         R                  " S[        5        U R                  R	                  5         U R                  R                  S5      U l        U R                  R                  S5      U l        SU l        [        R                  R                  5         g )NrD  r/  F)r1  r2  r3  r   rG  r;   r  r'  r,   r4  rE  rK  s    r6   rG  GPTJForCausalLM.deparallelize  sm    j	
 	&&(++..u5||u-#

 r8   c                     U R                   $ r   r  rK  s    r6   get_output_embeddings%GPTJForCausalLM.get_output_embeddings  s    ||r8   c                     Xl         g r   r  rO  s     r6   set_output_embeddings%GPTJForCausalLM.set_output_embeddings  s    %r8   rS  r   r   rT  r@   r   rU  labelsr   r   rV  rW  r   r%   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUUS9nUS   nU R                  (       ab  [        R
                  R                  U R                  R                  5        UR                  U R                  R                  R                  5      nU R                  U5      R                  [        R                  5      nSnUb`  UR                  UR                  5      nU R                  " UU4SU R                   R                  0UD6nUR                  UR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  UR&                  S9$ )a  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)r   r   rT  r@   r   rU  r   r   rV  rW  r   r   r  r   losslogitsr   r   r`  )rW   ra  r   r'  r,   r4  rg  r9  r;   r  r   r<   rj   loss_functionr  r*   r   r   r   r`  )rs   rS  r   r   rT  r@   r   rU  r  r   r   rV  rW  r   r   transformer_outputsr   	lm_logitsr  outputs                       r6   r   GPTJForCausalLM.forward  s   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A. JJ!!$"2"2"?"?@),,T\\-@-@-G-GHM
 LL/225==A	YYy//0F%%  ;;11 	D 77=../D\$7$;;F)-)9TGf$EvE%/??-;;*55
 	
r8   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
[`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr;   r<   )r\  
past_stater  s     r6   r]  ;GPTJForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>/  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nrm  )r\  r   r  s     r6   r]  1GPTJForCausalLM._reorder_cache.<locals>.<genexpr>.  s'      
-
 j_ijjj-s   "%r  )r   r  s    `r6   _reorder_cacheGPTJForCausalLM._reorder_cache%  s      
-
 
 	
r8   )r(  r  r'  r   r   )NNNNNNNNNNNNN)rc   r   r   r   _tied_weights_keysrV   r   r  rA  r  rG  r  r  r   r   r,   r   r   r   r   r   r   r   r   r   r  r  r   r   r   s   @r6   r  r    s    ++
 /0# 1#$ 12	! 3	!&  15NR6:59371559-1$(,0/3&*59O
E,,-O
 "%uU5<<5H/I(I"JKO
 !!2!23	O

 !!1!12O
 u//0O
 E--.O
   1 12O
 ))*O
 D>O
 $D>O
 'tnO
 d^O
 !!1!12O
  
u,,	-!O
 O
b 
uU\\23
?D||
	uU\\"	#
 
r8   r  a  
    The GPT-J Model transformer with a sequence classification head on top (linear layer).

    [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT, GPT-2, GPT-Neo) do.

    Since it does classification on the last token, it needs to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\	\	\R                           S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\	\4   4S jj5       rSrU =r$ )GPTJForSequenceClassificationi4  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        SU l	        S U l
        U R                  5         g )NFrS   )rU   rV   
num_labelsr  r   r   rm   r   scorer'  r(  r*  r  s     r6   rV   &GPTJForSequenceClassification.__init__C  sc      ++$V,YYv}}dooEJ
 $ 	r8   rS  r   r   rT  r@   r   rU  r  r   r   rV  rW  r%   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " UUR                  S	9U4   nSnUGb  UR                  UR                  5      nU R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  S9$ )a  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)
r   r   rT  r@   r   rU  r   r   rV  rW  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rD   )r<   r*   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rZ  
regressionsingle_label_classificationmulti_label_classificationr  )rW   ra  r   r  r>   pad_token_idrh   r;   r<   r,   int32r-   argmaxr`   ra   rb   rc   problem_typer  r*   longr  r	   squeezer   r|   r   r   r   r   r`  )rs   rS  r   r   rT  r@   r   rU  r  r   r   rV  rW  r  r   r  r  last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                           r6   r   %GPTJForSequenceClassification.forwardP  s"   4 &1%<k$++B]B]"..+))%'/!5# / 
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYY}334F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r8   )r(  r'  r  r  r   r  )rc   r   r   r   rV   r   r   r,   r   r   r   r   r   r   r   r   r   r   r   s   @r6   r  r  4  sL     15@D6:59371559-1$(,0/3&*c
E,,-c
 "%ell(;"<=c
 !!2!23	c

 !!1!12c
 u//0c
 E--.c
   1 12c
 ))*c
 D>c
 $D>c
 'tnc
 d^c
 
u66	7c
 c
r8   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )GPTJForQuestionAnsweringi  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        SU l	        S U l
        U R                  5         g r  )rU   rV   r  r  r   r   rm   rd   
qa_outputsr'  r(  r*  r  s     r6   rV   !GPTJForQuestionAnswering.__init__  se      ++$V,))F$6$68I8IJ $ 	r8   rS  r   rT  r@   r   rU  start_positionsend_positionsr   rV  rW  r%   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a*  UR                  S5      R                  UR                  5      n[        UR                  5       5      S:  a*  UR                  S5      R                  UR                  5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S	9$ )
rY  N)r   rT  r@   r   rU  r   rV  rW  r   r   rD   r+   )ignore_indexr(   )r  start_logits
end_logitsr   r`  )rW   ra  r   r  r   r  r   r}   r{   r;   r<   clampr   r   r   r`  )rs   rS  r   rT  r@   r   rU  r  r  r   rV  rW  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r6   r    GPTJForQuestionAnswering.forward  s   * &1%<k$++B]B]""))%'/!5# # 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r8   )r(  r'  r  r  r   )NNNNNNNNNNN)rc   r   r   r   rV   r   r   r,   r   r   r   r   r   r   r   r   r   r   s   @r6   r  r    s2     156:593715596:48,0/3&*D
E,,-D
 !!2!23D
 !!1!12	D

 u//0D
 E--.D
   1 12D
 "%"2"23D
   0 01D
 $D>D
 'tnD
 d^D
 
u22	3D
 D
r8   r  )r  r  r  r  r   )Jr   r1  typingr   r   r   r,   torch.fxtorch.utils.checkpointr   torch.nnr   r   r	   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   utils.model_parallel_utilsr   r   configuration_gptjr   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   r"   
get_loggerrc   r`   r  r   r7   fxwraprA   rJ   rN   ModulerP   r   r   r   r   r   r  r  r  r  r  r  __all__r  r8   r6   <module>r     s2     ) )     A A ! . ) > h  .  L *  !!;J J 
		H	%P P3 P5<< P W W  = =ELL =u|| =X]XdXd =nBII nbM- Mb , bii &*		 *Z */ * *B : ( f# f fR 
R
)? R

R
j r
$7 r
r
j S
2 S
 S
lr8   