
    fTh*                     L   S r SSKrSSKrSSKrSSKJrJrJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJrJr  SSK J!r!J"r"J#r#  SSK$J%r%  \"RL                  " \'5      r(S r) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\RT                  5      r- " S S\RT                  5      r.\! " S S\5      5       r/\! " S S\/5      5       r0\!" SS9 " S S \/\5      5       r1\!" S!S9 " S" S#\/5      5       r2/ S$Qr3g)%zPyTorch OpenAI ImageGPT model.    N)AnyOptionalTupleUnion)nn)autocast)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_conv1d_layer)auto_docstringloggingtorch_float   )ImageGPTConfigc                 f    SSK nSSKn[
        R                  R                  U5      n[        R                  SR                  U5      5        UR                  R                  U5      n/ n/ nU Ht  u  p[        R                  SR                  X5      5        UR                  R                  XY5      nUR                  U	5        UR                  UR                  5       5        Mv     [        Xx5       GH  u  pU	SS n	U	R!                  S5      n	[#        S U	 5       5      (       d	  U	S	   S
;   a5  [        R                  SR                  SR%                  U	5      5      5        Mq  U nU	S	   S;  a  ['        US5      nU	 GHg  nUR)                  SU5      (       a  UR!                  SU5      nOU/nUS   S:X  d	  US   S:X  a  ['        US5      nOUS   S:X  a  ['        US5      nOUS   S:X  d	  US   S:X  a  ['        XS   5      n['        US5      nOUS   S;   a  ['        US5      n['        US5      nO[+        U	5      S:X  a-  U	S   S:X  a$  US   S:X  a  ['        XS   5      n['        US5      nORUS   S:X  a  ['        US5      n['        US5      nO0US   S:X  a  ['        US5      n['        US5      nO['        XS   5      n[+        U5      S :  d  GMU  [-        US   5      nX   nGMj     [+        U	5      S:  a	  U	S   S:X  d  U	S	   S:X  d  U	S	   S:X  d	  U	S	   S:X  a  O UR.                  UR.                  :X  d   e [        R                  S!R                  U	5      5        U	S	   S":X  ad  [4        R6                  " UR9                  UR:                  UR:                  5      5      R<                  UR>                  SS2SUR:                  24'   GM  U	S	   S#:X  aq  [4        R6                  " UR9                  UR:                  UR:                  5      5      R<                  UR>                  SS2UR:                  S UR:                  -  24'   GMS  U	S	   S$:X  ag  [4        R6                  " UR9                  UR:                  UR:                  5      5      R<                  UR>                  SS2S UR:                  -  S24'   GM  [+        U	5      S:X  aT  U	S   S:X  aK  U	S    S:X  aB  [4        R6                  " UR9                  UR:                  UR:                  5      5      Ul        GM&  U	S	   S:X  a  [4        R6                  " U5      Ul        GMM  U	S	   S:X  a9  [4        R6                  " U5      UR>                  SUR@                  S-
  2SS24'   GM  U	S	   S:X  a&  [4        R6                  " U5      UR>                  S	'   GM  [4        R6                  " U5      Ul        GM     U $ ! [         a    [        R	                  S5        e f = f! [0         a1  nU=R2                  UR.                  UR.                  4-  sl        e SnAff = f)%z(
Load tf checkpoints in a pytorch model
r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z(Converting TensorFlow checkpoint from {}z"Loading TF weight {} with shape {}   /c              3   ,   #    U  H
  nUS ;   v   M     g7f))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/imagegpt/modeling_imagegpt.py	<genexpr>.load_tf_weights_in_imagegpt.<locals>.<genexpr>Q   s      
 nns   )_stepzSkipping {})wtettransformerz[A-Za-z]+\d+z(\d+)wgweightbbiaswpewte)q_projk_projv_projc_attnr   r   attnc_projr,   lm_headsos   zInitialize PyTorch weight {}r5   r6   r7   )!re
tensorflowImportErrorloggererrorospathabspathinfoformattrainlist_variablesload_variableappendsqueezezipsplitanyjoingetattr	fullmatchlenintshapeAssertionErrorargstorch
from_numpyreshapen_embdTdata
vocab_size)modelconfigimagegpt_checkpoint_pathr>   tftf_path	init_varsnamesarraysnamerU   arraypointerm_namescope_namesnumes                    r'   load_tf_weights_in_imagegptrn   0   sf   	 ggoo67G
KK:AA'JK''0IEF 8??LM&&w5Temmo&	 ! 5)ABxzz#  

 
 
 "X"KK,,SXXd^<=88#g}5GF||OV44 hhx8%h1~$A#(=!'84Q3&!'62Q5(KNe,C!'q>:!'84Q#AA!'84!'84TaDGv$5+a.H:T!'q>:!'84Q6)!'95!'84Q5(!'51!'84!'q>:;1$+a.)!,; > t9q=T!W.$r(f2DRTYHY]abd]ein]n}}333
 	299$?@8x/4/?/?fmm]c]j]j@k/l/n/nGLLOfmmO+,"X!AFAQAQfmmV]];Ba LLFMMA,===> "X!383C3CEMMRXR_R_agananDo3p3r3rGLLA-//0Y!^Q6 1d1g6I ++EMM&--,WXGL"X ++E2GL"X7<7G7G7NGLL06,,q00!34"X$//6GLL ++E2GLY *\ LC  Q	
 	P " 7==%++66s#   W W5!W25
X0?,X++X0c                   x   ^  \ rS rSrS	S\\   S\4U 4S jjjrS\R                  S\R                  4S jr
SrU =r$ )
ImageGPTLayerNorm   hidden_sizeepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__rs   r   	ParameterrX   Tensorr0   )selfrr   rs   	__class__s      r'   rw   ImageGPTLayerNorm.__init__   s,    ll5<<#<=    tensorreturnc           	          U[         R                  " [         R                  " [         R                  " U5      SSS9U R                  -   5      -  nXR
                  -  nU$ )Nr*   T)axiskeepdim)rX   sqrtmeansquarers   r0   )rz   r~   s     r'   forwardImageGPTLayerNorm.forward   sI    %**UZZV0D2W[%\_c_g_g%ghh++%r}   )rs   r0   )gh㈵>)__name__
__module____qualname____firstlineno__r   rT   floatrw   rX   ry   r   __static_attributes____classcell__r{   s   @r'   rp   rp      s?    >E#J >U > >
ell u||  r}   rp   c                   8  ^  \ rS rSrSS\\   S\\   4U 4S jjjrS rSS jr	SS jr
S rS	 r       SS
\R                  S\\   S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\4S jjrSrU =r$ )ImageGPTAttention   is_cross_attention	layer_idxc           
        > [         TU ]  5         UR                  nU R                  S[        R
                  " [        R                  " XD4[        R                  S95      R                  SSXD5      SS9  U R                  S[        R                  " S5      SS9  UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   S	U R                   S
35      eUR"                  U l        X l        UR&                  U l        X0l        UR*                  U l        U R$                  (       aN  [-        SU R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        O([-        SU R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        [4        R6                  " UR8                  5      U l        [4        R6                  " UR<                  5      U l        [A        5       U l!        g )Nr2   dtyper   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r=   r   )"rv   rw   max_position_embeddingsregister_bufferrX   trilonesboolviewr~   rr   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr   scale_attn_by_inverse_layer_idxr   reorder_and_upcast_attnr   r8   q_attnr:   r   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)rz   r`   r   r   max_positionsr{   s        r'   rw   ImageGPTAttention.__init__   s   66JJuzz="@

STYY1m  	 	
 	]ELL,>5Q++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;Er}   c                 8   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[
        R                  " X"U R                  -   USU R                  -  -   /5      n[        U R                  USS9U l	        [        U R                  USS9U l
        U R                  U R                  -  U R                  [        U5      -
  -  U l        U R                  [        U5      -
  U l        U R                  R                  U5      U l        g )Nr   r=   r   dim)rS   r   r   r   r   rX   catr   r   r8   r:   union)rz   headsindex
index_attns       r'   prune_headsImageGPTAttention.prune_heads   s    u:?7~~t}}^b^o^opYYt'>T__I\@]^_
 )jaH(eC  ??dnn<RUV[R\A\]#e*4 --33E:r}   c                 T   [         R                  " XR                  SS5      5      nU R                  (       a   U[	        UR                  S5      S-  5      -  nU R                  (       a  U[        U R                  S-   5      -  nU R                  (       d  UR                  S5      UR                  S5      pU R                  S S 2S S 2X-
  U2S U24   n	[         R                  " UR                  5      R                  n
[         R                  " XR                  UR                  S9n
[         R                   " XU
5      nUb  Xd-   n["        R$                  " SS9" U5      nUR'                  UR                  5      nU R)                  U5      nUb  Xe-  n[         R                  " Xc5      nX4$ )Nr*         ?r   r   devicer   )rX   matmul	transposer   r   sizer   r   r   r   r2   finfor   minr~   r   wherer   Softmaxtyper   )rz   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs               r'   _attnImageGPTAttention._attn   s`   ||E==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*))Aq**Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{*ML%'8Lzzb),7 $((5((6  '3Lll<7((r}   c           	      P   UR                  5       u  pgpUR                  5       u    pn
[        R                  " Xg-  X[        R                  UR                  S9nSnU R
                  (       a   U[        UR                  S5      5      S-  -  nU R                  (       a  U[        U R                  S-   5      -  n[        SS9   UR                  SX5      UR                  SS5      R                  SX5      p[        R                  " XR                  5       UR                  5       S	US
9nUR                  XgX5      nS S S 5        U R                  (       d  UR                  S5      UR                  S5      nnU R                  S S 2S S 2UU-
  U2S U24   n[        R                  " UR                   5      R"                  n[        R$                  " UUR                   UR                  S9n[        R&                  " UUU5      nUb  X-   n[(        R*                  " SS9" U5      nUR                   [        R                  :w  a  [-        S5      eUR/                  UR                   5      nU R1                  U5      nUb  X-  n[        R2                  " X5      nUU4$ ! , (       d  f       GNc= f)Nr         ?r*   r   r   F)enabledr   r   )betaalphar   zDError with upcasting, attn_weights does not have dtype torch.float32)r   rX   emptyfloat32r   r   r   r   r   r   rZ   r   baddbmmr   r2   r   r   r   r~   r   r   r   RuntimeErrorr   r   r   )rz   r   r   r   r   r   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r'   _upcast_and_reordered_attn,ImageGPTAttention._upcast_and_reordered_attn
  s7   (-

%	 XXZ1 {{3?IPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L e$==Y3S]]2r5J5R5RSUWY5eq ==wwy!'')RS[ghL'//	UL %
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'8Lzzb),7 .eff#((5((6  '3Lll<7L((C %$s   A9J
J%c                 v    UR                  5       SS X#4-   nUR                  " U6 nUR                  SSSS5      $ )z:
Splits hidden_size dim into attn_head_size and num_heads
Nr*   r   r=   r   r   )r   r   permuterz   r~   r   attn_head_size	new_shapes        r'   _split_headsImageGPTAttention._split_heads>  sA     KKM#2&))DD	i(~~aAq))r}   c                     UR                  SSSS5      R                  5       nUR                  5       SS X#-  4-   nUR                  U5      $ )zC
Merges attn_head_size dim and num_attn_heads dim into hidden_size
r   r=   r   r   Nr   )r   
contiguousr   r   r   s        r'   _merge_headsImageGPTAttention._merge_headsF  sM     1a+668KKM#2&)*D)FF	{{9%%r}   hidden_states
layer_pastr   r   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionsr   c	                    Ub[  [        U S5      (       d  [        S5      eU R                  U5      n	U R                  U5      R	                  U R
                  SS9u  pUnO,U R                  U5      R	                  U R
                  SS9u  pnU R                  XR                  U R                  5      n	U R                  XR                  U R                  5      n
U R                  XR                  U R                  5      nUb0  Uu  p[        R                  " X4SS9n
[        R                  " X4SS9nUSL a  X4nOS nU R                  (       a  U R                  XXU5      u  nnOU R                  XXU5      u  nnU R                  XR                  U R                  5      nU R                  U5      nU R!                  U5      nX4nU(       a  UU4-  nU$ )Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.r=   r   r   T)hasattrr   r   r8   rN   r   r   r   r   rX   r   r   r   r   r   r:   r   )rz   r   r   r   r   r   r   r   r   r   r   r   past_key
past_valuepresentr   r   outputss                     r'   r   ImageGPTAttention.forwardN  s    !,4** t 
 KK.E%:;AA$//WXAYJC3N $M : @ @VW @ XE!!%G^^T]]C!!%G!#- H))XO4CIIz1r:ElGG''(,(G(GTYkt(u%K(,

5uV_(`%K''^^T]]Skk+.((5(&Gr}   )r   r8   r:   r   r   r   r   r   r   r   r   r   r   r   r   )FN)NNNNNNNFF)r   r   r   r   r   r   rT   rw   r   r   r   r   r   rX   ry   tupler   r   r   r   s   @r'   r   r      s    )"8D> )"V^_bVc )" )"V;$)L2)h*& &*15,08<9=$),13||3 TN3 !.	3
 ELL)3  (53 !) 63 D>3 $D>3 
3 3r}   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ImageGPTMLPi  c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g ru   )rv   rw   rr   r   c_fcr:   r   activation_functionactr   r   r   dropout)rz   intermediate_sizer`   r   r{   s       r'   rw   ImageGPTMLP.__init__  sZ    &&	,8	Y:&445zz&"4"45r}   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ ru   )r  r	  r:   r
  )rz   r   s     r'   r   ImageGPTMLP.forward  s@    		-0/M2]3r}   )r	  r  r:   r
  )
r   r   r   r   rw   rX   ry   r   r   r   r   s   @r'   r  r    s(    6U\\ ell  r}   r  c                      ^  \ rS rSrSU 4S jjr       SS\R                  S\\   S\\R                     S\\R                     S\\R                     S\\R                     S	\\   S
\\   S\	4S jjr
SrU =r$ )ImageGPTBlocki  c                   > [         TU ]  5         UR                  nUR                  b  UR                  OSU-  n[	        X1R
                  S9U l        [        XS9U l        [	        X1R
                  S9U l	        UR                  (       a(  [        USUS9U l        [	        X1R
                  S9U l        [        XA5      U l        g )N   rs   r   T)r   r   )rv   rw   rr   n_innerrp   layer_norm_epsilonln_1r   r9   ln_2add_cross_attentioncrossattentionln_cross_attnr  mlp)rz   r`   r   rr   	inner_dimr{   s        r'   rw   ImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%k7P7PQ	%fB	%k7P7PQ	%%"3Ft_h"iD!2;D]D]!^Dy1r}   r   r   r   r   r   r   r   r   r   c	           	         Un	U R                  U5      nU R                  UUUUUUS9n
U
S   nU
SS  nX-   nUbW  [        U S5      (       d  [        SU  S35      eUn	U R	                  U5      nU R                  UUUUUUS9nUS   nX-   nXSS  -   nUn	U R                  U5      nU R                  U5      nX-   nU4U(       a  U-   nU$ USS  -   nU$ )	N)r   r   r   r   r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r   r   r=   )r  r9   r   r   r  r  r  r  )rz   r   r   r   r   r   r   r   r   residualattn_outputsr   r   cross_attn_outputsfeed_forward_hidden_statess                  r'   r   ImageGPTBlock.forward  sG    !		-0yy!)/ ! 
 #1oqr"#. ,4!122 =dV DZ Z  %H ..}=M!%!4!4-#&;'="3 "5 " -Q/K$2M12 66G 		-0%)XXm%<" = "gL AHLr}   )r9   r  r  r  r  r  ru   r  )r   r   r   r   rw   rX   ry   r   r   r  r   r   r   r   s   @r'   r  r    s    2$ &*15,08<9=$),18||8 TN8 !.	8
 ELL)8  (58 !) 68 D>8 $D>8 
8 8r}   r  c                   H   ^  \ rS rSr\r\rSrSr	Sr
S/rU 4S jrS rSrU =r$ )	ImageGPTPreTrainedModeli  r-   	input_idsTr  c                 &   > [         TU ]  " U0 UD6  g ru   )rv   rw   )rz   inputskwargsr{   s      r'   rw    ImageGPTPreTrainedModel.__init__  s    &+F+r}   c           	         [        U[        R                  [        45      (       aj  UR                  R
                  R                  SU R                  R                  S9  UR                  b$  UR                  R
                  R                  5         O[        U[        R                  5      (       aw  UR                  R
                  R                  SU R                  R                  S9  UR                  b1  UR                  R
                  UR                     R                  5         O:[        U[        5      (       a%  UR                  R
                  R                  S5        UR                  5        Hq  u  p#SU;   d  M  SU;   d  M  UR
                  R                  SU R                  R                  [         R"                  " SU R                  R$                  -  5      -  S9  Ms     g)zInitialize the weights.g        )r   stdNr   r:   r0   r=   )
isinstancer   Linearr   r0   r]   normal_r`   initializer_ranger2   zero_	Embeddingpadding_idxrp   fill_named_parametersmathr   n_layer)rz   modulerg   ps       r'   _init_weights%ImageGPTPreTrainedModel._init_weights  sS   fryy&122 MM&&CT[[5R5R&S{{&  &&(--MM&&CT[[5R5R&S!!-""6#5#56<<> 122MM$$S) ..0GD4H$4Cdkk.K.KdiiXY\`\g\g\o\oXoNp.pr 1r}   r$   )r   r   r   r   r   config_classrn   load_tf_weightsbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesrw   r;  r   r   r   s   @r'   r&  r&    s8    !L1O%!O&*#(),s sr}   r&  c            "         ^  \ rS rSrS\4U 4S jjrS rS rS r\	             SS\
\R                     S\
\\\R                           S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\S\\\4   4S jj5       rSrU =r$ )ImageGPTModeli  r`   c           
      v  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  U R                  5      U l        [        R
                  " UR                  U R                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        XS9PM     sn5      U l        [%        U R                  UR&                  S9U l        SU l        S U l        SU l        U R1                  5         g s  snf )Nr  r  F)rv   rw   rr   r   r   r3  r^   r4   r   r3   r   
embd_pdropdrop
ModuleListrangenum_hidden_layersr  hrp   r  ln_fmodel_parallel
device_mapgradient_checkpointing	post_init)rz   r`   ir{   s      r'   rw   ImageGPTModel.__init__	  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklLkqf BLklm%dnn&:S:ST	 $&+#  ms   D6c                     U R                   $ ru   r4   rz   s    r'   get_input_embeddings"ImageGPTModel.get_input_embeddings  s    xxr}   c                     Xl         g ru   rT  rz   new_embeddingss     r'   set_input_embeddings"ImageGPTModel.set_input_embeddings  s    !r}   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)itemsrK  r9   r   )rz   heads_to_prunelayerr   s       r'   _prune_headsImageGPTModel._prune_heads"  s5     +002LEFF5M**51 3r}   r'  past_key_valuesr   token_type_idsposition_idsr   inputs_embedsr   r   r   r   output_hidden_statesreturn_dictr*  r   c                   ^$ SU;   a:  [         R                  " S[        5        Ub  [        S5      eUR	                  S5      nUb  UOU R
                  R                  nUb  UOU R
                  R                  nU
b  U
OU R
                  R                  n
Ub  UOU R
                  R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nUb  UR                  SUS   5      nUc%  Sn[        S/[!        U R"                  5      -  5      nOUS   S   R                  S	5      nUc<  [$        R&                  " UUS   U-   [$        R(                  US
9nUR+                  S5      nUby  US::  a  [        S5      eUR                  US5      nUSS2SSSS24   nUR-                  U R.                  S9nSU-
  [$        R0                  " U R.                  5      R2                  -  nU R
                  R4                  (       aE  UbB  UR                  5       u  nnnUU4nU	c  [$        R6                  " UUS9n	U R9                  U	5      n	OSn	U R;                  X`R
                  R<                  5      nUc  U R?                  U5      nU RA                  U5      nUUR-                  UR                  5      -   m$Ub  U R?                  U5      nT$U-   m$U RC                  T$5      m$UT$R                  S5      4-   nU RD                  (       a/  U RF                  (       a  U
(       a  [H        RK                  S5        Sn
U
(       a  SOSnU(       a  SOSnU(       a  U R
                  R4                  (       a  SOSnU(       a  SOSn[M        [O        U R"                  U5      5       GH  u  nu  nn U RP                  (       a  [$        RR                  RU                  T$R                  5        U b  [        U$4S jU  5       5      n Ub  UR-                  T$R                  5      n[W        U[$        RX                  5      (       a  UR-                  T$R                  5      nU(       a  UT$4-   nU RD                  (       a8  U RF                  (       a'  U R[                  UR\                  T$SUUU   UU	U
U5	      n!OU" T$U UUU   UU	U
US9n!U!S   m$U
SL a	  UU!S   4-   nU(       a?  UU!U
(       a  SOS   4-   nU R
                  R4                  (       a  UU!U
(       a  SOS   4-   nU RP                  (       d  GM  U R^                  Ra                  5        HO  u  n"n#UU#S   :X  d  M  S[c        U"5      -   U Rd                  :w  d  M/  T$R-                  S[c        U"S-   5      -   5      m$MQ     GM     U Rg                  T$5      m$T$R                  " U6 m$U(       a  UT$4-   nU(       d  [        S T$UUUU4 5       5      $ [i        T$UUUUS9$ )aI  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTModel
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```pixel_values`The `pixel_values` argument is deprecated and will be removed in v4.47, use `input_ids` instead.N_You cannot pass both `pixel_values` and `input_ids`. Please make sure to only pass `input_ids`.zDYou cannot specify both input_ids and inputs_embeds at the same timer*   r   z5You have to specify either input_ids or inputs_embedsr   r   z$batch_size has to be defined and > 0r   r   )r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr$   c              3   X   >#    U  H  oR                  TR                  5      v   M!     g 7fru   )tor   )r%   
past_stater   s     r'   r(   (ImageGPTModel.forward.<locals>.<genexpr>  s#     &h]gz}}]5I5I'J'J]gs   '*)r   r   r   r   r   r   r   Tr   r=   r   zcuda:c              3   0   #    U  H  nUc  M  Uv   M     g 7fru   r$   )r%   vs     r'   r(   rp    s      pA ps   	)last_hidden_staterc  r   
attentionscross_attentions)5warningswarnFutureWarningr   popr`   r   rg  r   use_return_dict%warn_if_padding_and_no_attention_maskr   r   rU   r   r  rS   rK  rX   arangelong	unsqueezern  r   r   r   r  r   invert_attention_maskget_head_maskr8  r4   r3   rG  rO  trainingrA   warning_once	enumeraterM   rM  cuda
set_devicer.  ry   _gradient_checkpointing_func__call__rN  r^  strlast_devicerL  r   )%rz   r'  rc  r   rd  re  r   rf  r   r   r   r   rg  rh  r*  input_shape
batch_sizer   past_lengthencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedstoken_type_embedsoutput_shapepresentsall_self_attentionsall_cross_attentionsall_hidden_statesrQ  blockr   r   r   rr  r   s%                                       @r'   r   ImageGPTModel.forward)  s+   ^ V#MMr
 $ u  

>2I1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T%+00[_EN"K#TFS[$89O)!,Q/44R8K <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,AtT1,<=N ,..TZZ.@N!N2ekk$**6M6Q6QQN ;;**/D/P=R=W=W=Y: 7$68O#P %-).4HQW)X&%)%?%?@V%W"%)" &&y++2E2EF	  HHY/M((<0%(:(:=;O;O(PP% $ 8),==M		-0"m&8&8&<%>>&&4==##p "	"2$5b4%64;;;Z;Zr`d"6BD&/DFFO0L&M"A"z""

%%m&:&:;)!&&h]g&h!hJ!-%3%6%6}7K7K%LNi66 )]-A-A BI#$58H$H!**t}};;NN!"aL)*%
  !)#1'l*?+A'&7	 $AJMD #wqzm3 &9W)QYZ=[<]&]#;;22+?7PY1_`CaBc+c( """ OO113DAqAbEzgA&6$:J:J&J(5(8(83q1u:9M(N 4e 'Nl 		-0%**L9 1]4D D '3DFY[op   9+$+*1
 	
r}   )	rN  rG  r   rO  rK  rL  rM  r3   r4   )NNNNNNNNNNNNN)r   r   r   r   r   rw   rV  r[  ra  r   r   rX   ry   r   r   r   r   r   r   r   r   r   s   @r'   rD  rD    sv   ~ &"2  -1@D1515/3,0048<9=$(,0/3&*d
ELL)d
 "%ell(;"<=d
 !.	d

 !.d
 u||,d
 ELL)d
  -d
  (5d
 !) 6d
 D>d
 $D>d
 'tnd
 d^d
 d
  
u??	@!d
 d
r}   rD  z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            $       \  ^  \ rS rSrS/rS\4U 4S jjrS rS r\	              SS\
\R                     S\
\\\R                           S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\S\\\4   4 S jj5       r\S\\\R                        S\R                  S\\\R                        4S j5       rSrU =r$ )ImageGPTForCausalImageModelingi  zlm_head.weightr`   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  S-
  SS9U l        SU l	        S U l
        U R                  5         g )Nr   Fr2   )rv   rw   rD  r-   r   r/  r[   r^   r;   rM  rN  rP  rz   r`   r{   s     r'   rw   'ImageGPTForCausalImageModeling.__init__  s[     (0yy0A0AA0EER $r}   c                     U R                   $ ru   r;   rU  s    r'   get_output_embeddings4ImageGPTForCausalImageModeling.get_output_embeddings%  s    ||r}   c                     Xl         g ru   r  rY  s     r'   set_output_embeddings4ImageGPTForCausalImageModeling.set_output_embeddings(  s    %r}   r'  rc  r   rd  re  r   rf  r   r   labelsr   r   rg  rh  r*  r   c                    SU;   a:  [         R                  " S[        5        Ub  [        S5      eUR	                  S5      nUb  UOU R
                  R                  nU R                  UUUUUUUUU	UUUUS9nUS   nU R                  U5      nSnU
br  USSS2SS24   R                  5       nU
SS	S24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4US	S -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   UR"                  S
9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
>>> import torch
>>> import matplotlib.pyplot as plt
>>> import numpy as np

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> model.to(device)  # doctest: +IGNORE_RESULT

>>> # unconditional generation of 8 images
>>> batch_size = 4
>>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
>>> context = context.to(device)
>>> output = model.generate(
...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
... )

>>> clusters = image_processor.clusters
>>> height = image_processor.size["height"]
>>> width = image_processor.size["width"]

>>> samples = output[:, 1:].detach().cpu().numpy()
>>> samples_img = [
...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
... ]  # convert color cluster tokens back to pixels
>>> f, axes = plt.subplots(1, batch_size, dpi=300)

>>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
...     ax.axis("off")
...     ax.imshow(img)
```rj  rk  Nrl  )rc  r   rd  re  r   rf  r   r   r   r   rg  rh  r   .r*   r   )losslogitsrc  r   rt  ru  )rv  rw  rx  r   ry  r`   rz  r-   r;   r   r
   r   r   r   rc  r   rt  ru  )rz   r'  rc  r   rd  re  r   rf  r   r   r  r   r   rg  rh  r*  transformer_outputsr   	lm_logitsr  shift_logitsshift_labelsloss_fctoutputs                           r'   r   &ImageGPTForCausalImageModeling.forward+  s   L V#MMr
 $ u  

>2I%0%<k$++B]B]"..+))%'"7#9/!5# / 
 ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r}   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectrn  r   )r%   ro  r  s     r'   r(   JImageGPTForCausalImageModeling._reorder_cache.<locals>.<genexpr>.<genexpr>  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nr  )r%   r   r  s     r'   r(   @ImageGPTForCausalImageModeling._reorder_cache.<locals>.<genexpr>  s'      
-
 j_ijjj-s   "%r  )rc  r  s    `r'   _reorder_cache-ImageGPTForCausalImageModeling._reorder_cache  s      
-
 
 	
r}   )rN  r;   rM  r-   )NNNNNNNNNNNNNN)r   r   r   r   _tied_weights_keysr   rw   r  r  r   r   rX   ry   r   r   r   r   r   r   staticmethodr  r   r   r   s   @r'   r  r    s    ++	~ 	&  -1@D1515/3,0048<9=)-$(,0/3&*{
ELL){
 "%ell(;"<={
 !.	{

 !.{
 u||,{
 ELL){
  -{
  (5{
 !) 6{
 &{
 D>{
 $D>{
 'tn{
 d^{
  !{
" 
u77	8#{
 {
z 
uU\\23
?D||
	uU\\"	#
 
r}   r  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                      ^  \ rS rSrS\4U 4S jjr\            SS\\R                     S\\
\
\R                           S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\S\\
\4   4S jj5       rSrU =r$ )ImageGPTForImageClassificationi  r`   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr  )
rv   rw   
num_labelsrD  r-   r   r/  r[   scorerP  r  s     r'   rw   'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r}   r'  rc  r   rd  re  r   rf  r  r   r   rg  rh  r*  r   c                    SU;   a:  [         R                  " S[        5        Ub  [        S5      eUR	                  S5      nUb  UOU R
                  R                  nU R                  UUUUUUUU	U
UUS9nUS   nUR                  SS9nU R                  U5      nSnUGb  U R
                  R                  c  U R                  S:X  a  S	U R
                  l
        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  S
U R
                  l
        OSU R
                  l
        U R
                  R                  S	:X  aJ  [!        5       nU R                  S:X  a&  U" UR#                  5       UR#                  5       5      nOU" UU5      nOU R
                  R                  S
:X  a=  [%        5       nU" UR'                  SU R                  5      UR'                  S5      5      nO-U R
                  R                  S:X  a  [)        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [+        UUUR,                  UR.                  UR0                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```rj  rk  Nrl  )
rc  r   rd  re  r   rf  r   r   rg  rh  r   r   r   
regressionsingle_label_classificationmulti_label_classificationr*   )r  r  rc  r   rt  )rv  rw  rx  r   ry  r`   rz  r-   r   r  problem_typer  r   rX   r}  rT   r   rL   r
   r   r	   r   rc  r   rt  )rz   r'  rc  r   rd  re  r   rf  r  r   r   rg  rh  r*  r  r   pooled_hidden_statesr  r  r  r  s                        r'   r   &ImageGPTForImageClassification.forward  sG   d V#MMr
 $ u  

>2I%0%<k$++B]B]"..+))%'/!5# / 
 ,A.,11a1801{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r}   )r  r  r-   )NNNNNNNNNNNN)r   r   r   r   r   rw   r   r   rX   ry   r   r   r   r   r   r   r   r   r   s   @r'   r  r    sP   ~   -1@D1515/3,004)-$(,0/3&*s
ELL)s
 "%ell(;"<=s
 !.	s

 !.s
 u||,s
 ELL)s
  -s
 &s
 D>s
 $D>s
 'tns
 d^s
 s
 
u66	7s
 s
r}   r  )r  r  rD  r&  rn   )4__doc__r7  rC   rv  typingr   r   r   r   rX   torch.utils.checkpointr   torch.cuda.ampr   torch.nnr	   r
   r   activationsr   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_imagegptr   
get_loggerr   rA   rn   Modulerp   r   r  r  r&  rD  r  r  __all__r$   r}   r'   <module>r     sP   %  	  . .    # A A ! ) 
 . Y Y 
 3 
		H	%iX
		 
X		 Xv")) "HBII HV #so #s #sL F
+ F
 F
R ^
%<o ^
^
B ~
%< ~
~
Br}   