
    fTh۰              	          S r SSKrSSKJr  SSKJrJrJrJ	r	J
r
  SSKrSSKrSSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJr  \R8                  " \5      r\ " S S\5      5       r\ " S S\5      5       r  " S S\RB                  5      r" " S S\RB                  5      r# " S S\RB                  5      r$ " S S\RB                  5      r%S;S\RL                  S\'S\(S\RL                  4S jjr) " S S\RB                  5      r* " S  S!\RB                  5      r+ " S" S#\RB                  5      r, " S$ S%\RB                  5      r- " S& S'\RB                  5      r. " S( S)\RB                  5      r/\ " S* S+\5      5       r0\ " S, S-\05      5       r1S.\RL                  S/\2S\RL                  4S0 jr3S.\RL                  S1\2S2\2S\RL                  4S3 jr4 " S4 S5\RB                  5      r5\" S6S79 " S8 S9\05      5       r6/ S:Qr7g)<zPyTorch SegGpt model.    N)	dataclass)DictListOptionalTupleUnion)nn)
functional   )ACT2FN)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )SegGptConfigc                       \ rS rSr% Sr\R                  \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Sr\	\
\R                        \S'   Srg)	SegGptEncoderOutput#   a  
Output type of [`SegGptEncoderOutput`].
Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of *torch.FloatTensor* (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
        Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
        Additionally, each feature passes through a LayerNorm.
last_hidden_stateNhidden_states
attentionsintermediate_hidden_states )__name__
__module____qualname____firstlineno____doc__torchFloatTensor__annotations__r   r   r   r   r   __static_attributes__r       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/seggpt/modeling_seggpt.pyr   r   #   sd    " (((8<M8E%"3"345<59Ju00129EIu/@/@)A BIr%   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	SegGptImageSegmentationOutput<   a   
Output type of [`SegGptImageSegmentationOutput`].

Args:
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
        The loss value.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        The predicted masks.
    hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
Nloss
pred_masksr   r   r   )r   r   r   r   r    r*   r   r!   r"   r#   r+   r   r   r   r$   r   r%   r&   r(   r(   <   sg      )-D(5$$
%,.2J**+28<M8E%"3"345<59Ju00129r%   r(   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )SegGptPatchEmbeddingsU   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr	   Conv2d
projection)selfconfigr4   r5   r6   r7   r<   	__class__s          r&   r3   SegGptPatchEmbeddings.__init__\   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir%   c                 J   UR                   u  p#pEX0R                  :w  a  [        S5      eX@R                  S   :w  d  XPR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U5      R                  SSS	S5      nU$ )
NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model ().   r   )shaper6   
ValueErrorr4   r>   permute)r?   pixel_values
batch_sizer6   heightwidth
embeddingss          r&   forwardSegGptPatchEmbeddings.forwardj   s    2>2D2D/
&,,,w  __Q''5OOA4F+F$VHAeW4KDOO\]L^K__`aeapapqras`ttvw  __\2::1aAF
r%   )r4   r6   r<   r5   r>   )	r   r   r   r   r    r3   rO   r$   __classcell__rA   s   @r&   r-   r-   U   s    j r%   r-   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\S\S\R                  4S	 jr
  SS
\R                  S\R                  S\\R                     S\\   S\R                  4
S jjrSrU =r$ )SegGptEmbeddingsx   zP
Construct the embeddings from patch, position embeddings for input and prompt.
r@   returnNc                   > [         TU ]  5         [        R                  " [        R
                  " SSSUR                  5      5      U l        [        R                  " [        R
                  " SSSUR                  5      5      U l        [        R                  " [        R
                  " SSSUR                  5      5      U l	        [        R                  " [        R
                  " SSSUR                  5      5      U l
        [        R                  " [        R
                  " SSSUR                  5      5      U l        [        U5      U l        UR                  UR                  -  S-  S-   n[        R                  " [        R                   " SX!R                  5      5      U l        [        R$                  " UR&                  5      U l        g )Nr   rF   )r2   r3   r	   	Parameterr!   zerosr7   
mask_tokensegment_token_inputsegment_token_prompttype_token_semantictype_token_instancer-   patch_embeddingspretrain_image_sizer5   randnposition_embeddingsDropouthidden_dropout_probdropout)r?   r@   num_positionsrA   s      r&   r3   SegGptEmbeddings.__init__}   s1   ,,u{{1aF<N<N'OP#%<<Aq!VEWEW0X#Y $&LLQ1fFXFX1Y$Z!#%<<Aq!VEWEW0X#Y #%<<Aq!VEWEW0X#Y  5f =33v7H7HHQNQRR#%<<A}N`N`0a#b zz&"<"<=r%   rL   rM   c                    U R                   S S 2SS 24   nUR                  S   n[        US-  5      n[        R                  R                  5       (       d
  XQ:w  d  XR:w  aO  [        R                  " UR                  SXUS5      R                  SSSS5      X4SSS	9nUR                  SSSS5      $ UR                  SXS5      $ )
Nr         ?r   r   rF   bicubicF)sizemodealign_corners)
rb   rG   r   r!   jit
is_tracingFinterpolatereshaperI   )r?   rL   rM   patch_pos_embedr<   pretrain_patch_sizes         r&   interpolate_pos_encoding)SegGptEmbeddings.interpolate_pos_encoding   s    221ab59%++A.'S(89 99!!%8%BFYFbmm''+>UWX``abdeghjkl_#	O #**1aA66"**1fR@@r%   rJ   prompt_pixel_valuesbool_masked_posembedding_typec                 :   U R                  U5      nU R                  U5      nUR                  u  pxpU R                  R                  XxU	S5      nUR	                  S5      R                  U5      R                  SXS5      nUSU-
  -  X-  -   nUb  UOSnU R                  X5      nXPR                  -   nX`R                  -   nX]-   nXm-   nUS:X  a  U R                  nO!US:X  a  U R                  nO[        SU 35      eX^-   nXn-   n[        R                  " XV4SS9nU$ )Nrj   r   instancesemanticzBEmbedding type should be either 'semantic' or 'instance', but got r   dim)r_   rG   rZ   expand	unsqueezetype_asrs   rv   r[   r\   r]   r^   rH   r!   cat)r?   rJ   rx   ry   rz   input_embeddingsprompt_embeddingsrK   patch_heightpatch_width_rZ   w	pos_embedtype_embeddingrN   s                   r&   rO   SegGptEmbeddings.forward   sL     00> 112EF3C3I3I0
+__++JkSUV
%%b)11*=EEb,efg-Q7*.H+9+E: 11,L	 ,.F.FF-0I0II ,7-9 Z'!55Nz)!55Nabpaqrss+<->YY 0D!L
r%   )re   rZ   r_   rb   r[   r\   r^   r]   )NN)r   r   r   r   r    r   r3   intr!   Tensorrv   r   
BoolTensorstrrO   r$   rQ   rR   s   @r&   rT   rT   x   s    >| > > As A3 A5<< A, 7;(,+ll+ #\\+ "%"2"23	+
 !+ 
+ +r%   rT   c                   J  ^  \ rS rSrSrU 4S jrS\S\S\R                  S\R                  4S jr	S	\R                  S
\R                  S\R                  S\R                  S\
\\4   S\
\\4   S\R                  4S jrSS\R                  S\R                  4S jjrSrU =r$ )SegGptAttention   z=Multi-head Attention block with relative position embeddings.c                   > [         TU ]  5         UR                  UR                  p2[	        U[
        R                  R                  5      (       a  UOX"4n[	        U[
        R                  R                  5      (       a  UOX34nUS   UR                  -  US   UR                  -  4nUR                  UR                  -  nUR                  U l	        US-  U l
        [        R                  " UR                  UR                  S-  UR                  S9U l        [        R                  " UR                  UR                  5      U l        UR                   U l        U R                   (       a  Uc  [#        S5      e[        R$                  " [&        R(                  " SUS   -  S-
  U5      5      U l        [        R$                  " [&        R(                  " SUS   -  S-
  U5      5      U l        g g )Nr   r   g      r   biaszBInput size must be provided if using relative positional encoding.rF   )r2   r3   r4   r5   r8   r9   r:   r;   r7   num_attention_headsscaler	   Linearqkv_biasqkvproj use_relative_position_embeddingsrH   rX   r!   rY   	rel_pos_h	rel_pos_w)r?   r@   r4   r5   
input_sizehead_dimrA   s         r&   r3   SegGptAttention.__init__   s   !'!2!2F4E4EJ#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
 mv'8'88*Q-6K\K\:\]
%%)C)CC#)#=#= t^
99V//1C1Ca1Gfoo^IIf00&2D2DE	060W0W-00! !eff  \\%++a*Q-6G!6KX*VWDN\\%++a*Q-6G!6KX*VWDN 1r%   q_sizek_sizerel_posrV   c                 
   [        S[        X5      -  S-
  5      n[        R                  " UR	                  SUR
                  S   S5      R                  SSS5      USS9nUR	                  SU5      R                  SS5      n[        R                  " U5      SS2S4   [        X!-  S5      -  n[        R                  " U5      SSS24   [        X-  S5      -  nXg-
  US-
  [        X-  S5      -  -   nXXR                  5          $ )	aa  
Get relative positional embeddings according to the relative positions of
    query and key sizes.

Args:
    q_size (int):
        size of the query.
    k_size (int):
        size of key k.
    rel_pos (`torch.Tensor`):
        relative position embeddings (L, channel).

Returns:
    Extracted positional embeddings according to relative positions.
rF   r   r   rj   linear)rl   rm   N      ?)
r   maxrq   rr   rs   rG   rI   r!   arangelong)	r?   r   r   r   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss	            r&   get_rel_posSegGptAttention.get_rel_pos   s      1s622Q67--OOAw}}Q/4<<Q1E

 *11"lCKKAqQ <<'403v3LL<<'a03v3LL#.6A:V_VYAZ2ZZ33566r%   attnqueryr   r   c                    Uu  pxUu  pU R                  XyU5      nU R                  XU5      nUR                  u  pnUR                  XX5      n[        R                  " SUU5      n[        R                  " SUU5      nUR                  XXU
5      nUUSS2SS2SS2SS2S4   -   USS2SS2SS2SSS24   -   nUR                  XU-  X-  5      nU$ )aZ  
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

Args:
    attn (`torch.Tensor`):
        attention map.
    query (`torch.Tensor`):
        query q in the attention layer with shape (batch_size, query_height * query_width, channel).
    rel_pos_h (`torch.Tensor`):
        relative position embeddings (Lh, channel) for height axis.
    rel_pos_w (`torch.Tensor`):
        relative position embeddings (Lw, channel) for width axis.
    q_size (tuple):
        spatial sequence size of query q with (query_height, query_width).
    k_size (tuple):
        spatial sequence size of key k with (key_height, key_width).

Returns:
    attn (`torch.Tensor`):
        attention map with added relative positional embeddings.
zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r   rG   rs   r!   einsum)r?   r   r   r   r   r   r   query_heightquery_width
key_height	key_widthrelative_position_heightrelative_position_widthrK   r   r   reshaped_queryrel_hrel_ws                      r&   add_decomposed_rel_pos&SegGptAttention.add_decomposed_rel_pos  s    > %+! &
#'#3#3Li#X "&"2"2;9"U"[[
szR-~?WX-~?VW||JkyYeAq!Q,--aAtQ6F0GG||J{(BJDZ[r%   r   c           	         UR                   u  p4pVU R                  U5      R                  X4U-  SU R                  S5      R	                  SSSSS5      nUR                  SX0R                  -  XE-  S5      R                  S5      u  pn
XR                  -  U	R                  SS5      -  nU R                  (       a+  U R                  XU R                  U R                  XE4XE45      n[        R                  R                  R                  U[        R                   SS9R#                  UR$                  5      nU(       aA  UR'                  X0R                  XE-  S5      nUR'                  X0R                  -  XE-  S5      nOS nX-  R                  X0R                  XES5      nUR	                  SSSSS5      R                  X4US5      nU R)                  U5      nX4$ )	Nr   rj   rF   r   r      )dtyper   )rG   r   rs   r   rI   unbindr   	transposer   r   r   r   r!   r	   r
   softmaxfloat32tor   viewr   )r?   r   output_attentionsrK   rL   rM   r   r   r   keyvalueattn_weightsattn_weights_reshapedattn_outputs                 r&   rO   SegGptAttention.forward5  s   '4':':$
E HH]#WZ%D4L4LbQWQ1a# 	  KK:8P8P+PRXR`bdellmnoE

*cmmB.CC0066T^^T^^f_W]VeL xx**22<u}}Z\2]``afalalm
 %1$5$5jBZBZ\b\jln$o!055jC[C[6[]c]kmopL$(!#+44ZAYAY[ajlm!))!Q1a8@@UZ\^_ii,33r%   )r   r   r   r   r   r   r   )F)r   r   r   r   r    r3   r   r!   r   r   r   r   rO   r$   rQ   rR   s   @r&   r   r      s    GX07# 7s 7U\\ 7ell 7@+ll+ ||+ <<	+
 <<+ c3h+ c3h+ 
+Z#4U\\ #4u|| #4 #4r%   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	SegGptMlpi\  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l
        g N)r2   r3   r	   r   r7   mlp_dimlin1lin2r   
hidden_actactr?   r@   rA   s     r&   r3   SegGptMlp.__init__]  sX    IIf00&..A	IIfnnf.@.@A	&++,r%   r   rV   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r?   r   s     r&   rO   SegGptMlp.forwardc  s2    		-0/		-0r%   )r   r   r   )
r   r   r   r   r3   r!   r   rO   r$   rQ   rR   s   @r&   r   r   \  s(    -U\\ ell  r%   r   input	drop_probtrainingrV   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   r   device)rG   ndimr!   randr   r   floor_div)r   r   r   	keep_probrG   random_tensoroutputs          r&   	drop_pathr   k  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr%   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )SegGptDropPathi  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rV   c                 .   > [         TU ]  5         Xl        g r   )r2   r3   r   )r?   r   rA   s     r&   r3   SegGptDropPath.__init__  s    "r%   r   c                 B    [        XR                  U R                  5      $ r   )r   r   r   r   s     r&   rO   SegGptDropPath.forward  s    FFr%   c                 8    SR                  U R                  5      $ )Nzp={})formatr   r?   s    r&   
extra_reprSegGptDropPath.extra_repr  s    }}T^^,,r%   )r   r   )r   r   r   r   r    r   floatr3   r!   r   rO   r   r   r$   rQ   rR   s   @r&   r   r     sQ    b#(5/ #T # #GU\\ Gell G-C - -r%   r   c                      ^  \ rS rSrS\S\SS4U 4S jjr  SS\R                  S\	S	\
S
\
S\\\R                  \R                  4   \\R                     4   4
S jjrSrU =r$ )SegGptLayeri  r@   drop_path_raterV   Nc                 p  > [         TU ]  5         [        U5      U l        [	        U5      U l        US:  a  [        U5      O[        R                  " 5       U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r2   r3   r   	attentionr   mlpr   r	   Identityr   	LayerNormr7   layer_norm_epslayernorm_beforelayernorm_after)r?   r@   r   rA   s      r&   r3   SegGptLayer.__init__  s    (0V$;IC;O7UWU`U`Ub "V-?-?VEZEZ [!||F,>,>FDYDYZr%   r   ensemble_condfeature_ensembler   c                    U R                  U R                  U5      US9nUS   nUSS  nU(       a  UR                  S   S-  U:  a  UR                  UR                  S   S-  SS9u  pUS:X  a^  UR                  S   S-  n
U	R	                  SU
S5      n	U	R                  SSS9R                  U	5      n	U	R                  " UR                  6 n	OU	R                  SSS9R                  U	5      n	[        R                  " X/SS9nU R                  U5      U-   nUnU R                  U5      nU R                  U5      nXR                  U5      -   nU4U-   nU$ )	N)r   r   r   rF   r~   rj   T)r   keepdim)r  r  rG   splitrs   mean	expand_asr!   r   r   r  r  )r?   r   r
  r  r   self_attention_outputsattention_outputoutputspromptinputsnum_promptsresiduals               r&   rO   SegGptLayer.forward  sr    "&!!-0/ "0 "
 2!4(, 0 6 6q 9Q >- O-334D4J4J14MQR4RXY3ZNF!.44Q71<;;D9CCFK6D9CCFK$yy&)9qA '78=H ,,];/ >>-#@@ "W,r%   )r  r   r  r  r  )FF)r   r   r   r   r   r   r3   r!   r   r   boolr   r   rO   r$   rQ   rR   s   @r&   r   r     s    [| [U [t [ "'"'#||# # 	#
  # 
uU\\5<</0%2EE	F# #r%   r   c                      ^  \ rS rSrS\SS4U 4S jjr    SS\R                  S\S\S	\S
\S\	\
\4   4S jjrSrU =r$ )SegGptEncoderi  r@   rV   Nc           
        > [         TU ]  5         Xl        [        R                  " SUR
                  UR                  SS9 Vs/ s H  o"R                  5       PM     nn[        R                  " [        UR                  5       Vs/ s H  n[        XU   5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        SU l        g s  snf s  snf )Nr   cpu)r   r   F)r2   r3   r@   r!   linspacer   num_hidden_layersitemr	   
ModuleListranger   layersr  r7   r  	layernormgradient_checkpointing)r?   r@   xdprirA   s        r&   r3   SegGptEncoder.__init__  s    !&63H3H&JbJbkp!qr!qAvvx!qrmm%PVPhPhJi$jJiQ[Q%@Ji$jkf&8&8f>S>ST&+# s$js   CC$r   r  r   output_hidden_statesreturn_dictc                    U(       a  SOS nU(       a  SOS n/ n[        U R                  5       GH  u  pU(       a  Xa4-   nU R                  R                  U	:  a  SOSnU R                  (       a1  U R
                  (       a   U R                  U
R                  UUUU5      nO	U
" XX#5      nUS   nXR                  R                  :X  a-  US UR                  S   S-   XR                  S   S-  S  -   S-  nXR                  R                  ;   a   UR                  U R                  U5      5        U(       d  GM	  X|S   4-   nGM     U(       a  Xa4-   nU(       d  [        S XXx4 5       5      $ [        UUUUS9$ )Nr   rF   r   r   ri   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   ).0vs     r&   	<genexpr>(SegGptEncoder.forward.<locals>.<genexpr>  s      lA ls   	)r   r   r   r   )	enumerater#  r@   merge_indexr%  r   _gradient_checkpointing_func__call__rG   !intermediate_hidden_state_indicesappendr$  tupler   )r?   r   r  r   r*  r+  all_hidden_statesall_self_attentionsr   r(  layer_moduler
  layer_outputss                r&   rO   SegGptEncoder.forward  s    #7BD$5b4%'"(5OA#$58H$H! "&!8!81!<A!M**t}} $ A A ))!!$%! !-]K[ o)!,MKK+++!"?M$7$7$:a$?@=QdQdefQgklQlQnCoo! KKAAA*11$..2OP  &91=M<O&O#;  6>   14D D '<Ol  
 #++*'A	
 	
r%   )r@   r%  r$  r#  )FFFT)r   r   r   r   r   r3   r!   r   r  r   r8  r   rO   r$   rQ   rR   s   @r&   r  r    sw    ,| , , "'"'%* 9
||9
 9
  	9

 #9
 9
 
u))	*9
 9
r%   r  c                   j   ^  \ rS rSrSrSU 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	SegGptLayerNormi  a5  LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
c                 V  > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        [        R                  " [        R                  " U5      5      U l        X l	        X0l
        U R                  S;  a  [        SU R                   35      eU4U l        g )N)channels_lastchannels_firstzUnsupported data format: )r2   r3   r	   rX   r!   onesweightrY   r   r  data_formatNotImplementedErrornormalized_shape)r?   rG  r  rE  rA   s       r&   r3   SegGptLayerNorm.__init__	  s    ll5::.>#?@LL-=!>?	&#FF%(A$BRBRAS&TUU!1 3r%   r&  rV   c                 P   U R                   S:X  aV  [        R                  R                  R	                  XR
                  U R                  U R                  U R                  5      nU$ U R                   S:X  a  UR                  nUR                  5       nUR                  SSS9nX-
  R                  S5      R                  SSS9nX-
  [        R                  " X@R                  -   5      -  nUR                  US9nU R                  S S 2S S 4   U-  U R                  S S 2S S 4   -   nU$ )NrA  rB  r   T)r  rF   )r   )rE  r!   r	   r
   
layer_normrG  rD  r   r  r   r   r  powsqrtr   )r?   r&  input_dtypeuss        r&   rO   SegGptLayerNorm.forward  s   .##..q2G2GVZV_V_aeaiaijA  !11''K	Aq$'AA##At#4A%**Q\22A;'AAtTM*Q.1dD=1IIAr%   )r   rE  r  rG  rD  )gư>rA  )r   r   r   r   r    r3   r!   r   rO   r$   rQ   rR   s   @r&   r?  r?    s-    
4 %,,  r%   r?  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )SegGptDecoderHeadi!  c                 F  > [         TU ]  5         [        R                  " UR                  UR                  SSS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        [        R                  " UR                  SSSS9U l        g )Nr   r   )r0   paddingrB  )rG  r  rE  T)r0   r   )r2   r3   r	   r=   decoder_hidden_sizeconvr?  r  r$  r   r   act_fctheadr   s     r&   r3   SegGptDecoderHead.__init__"  s    II&&&&	
	 )#77V=R=R`p
 f//0IIf88!QUV	r%   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rV  r$  rW  rX  r   s     r&   rO   SegGptDecoderHead.forward0  s@    		-0}5]3		-0r%   )rW  rV  rX  r$  )
r   r   r   r   r3   r!   r"   rO   r$   rQ   rR   s   @r&   rR  rR  !  s     WU%6%6  r%   rR  c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrS\R                  4S jrSr	U =r
$ )SegGptDecoderi9  c                 <  > [         TU ]  5         [        R                  " UR                  [        UR                  5      -  UR                  S-  UR                  -  SS9U l	        [        U5      U l        UR                  U l        UR                  U l        Xl        g )NrF   Tr   )r2   r3   r	   r   r7   lenr6  r5   rU  decoder_embedrR  decoder_predr@   r   s     r&   r3   SegGptDecoder.__init__:  s    YYV%M%M!NNq 6#=#==

 .f5 ++#)#=#= r%   r   rV   c                 
   UR                   u  p#pEUR                  X#X@R                  U R                  U R                  5      nUR	                  SSSSSS5      nUR                  USX0R                  -  X@R                  -  4S9nU$ )	Nr      r   r   rF   r   rj   rG   )rG   rs   r5   rU  rI   )r?   r   rK   r   r   r   s         r&   _reshape_hidden_states$SegGptDecoder._reshape_hidden_statesF  s    3@3F3F0
+%--k??DOOUYUmUm
 &--aAq!Q?%--r<//#A;Q`Q`C`a . 
 r%   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r`  rf  ra  r   s     r&   rO   SegGptDecoder.forwardR  s8    **=933MB))-8r%   )r@   r`  rU  ra  r5   )r   r   r   r   r3   r!   r"   rf  rO   r$   rQ   rR   s   @r&   r]  r]  9  s>    

E4E4E 
%J[J[ 
U%6%6  r%   r]  c                       \ rS rSr\rSrSrSrSS/r	S\
\R                  \R                  \R                  4   SS	4S
 jrSrg	)SegGptPreTrainedModeliZ  modelrJ   TrT   r   modulerV   Nc                 &   U R                   R                  n[        U[        R                  [        R
                  45      (       a  [        R                  R                  UR                  R                  R                  [        R                  5      SUS9R                  UR                  R                  5      UR                  l	        UR                  b%  UR                  R                  R                  5         gg[        U[        R                   5      (       aJ  UR                  R                  R                  5         UR                  R                  R#                  S5        g[        U[$        5      (       Ga  [        R                  R                  UR&                  R                  R                  [        R                  5      SUS9R                  UR&                  R                  5      UR&                  l	        [        R                  R                  UR(                  R                  R                  [        R                  5      SUS9R                  UR(                  R                  5      UR(                  l	        g[        U[*        5      (       Ga}  [        R                  R                  UR,                  R                  R                  [        R                  5      SUS9R                  UR,                  R                  5      UR,                  l	        [        R                  R                  R/                  UR0                  US9  [        R                  R                  R/                  UR2                  US9  [        R                  R                  R/                  UR4                  US9  [        R                  R                  R/                  UR6                  US9  [        R                  R                  R/                  UR8                  US9  gg)zInitialize the weightsr   )r  stdNr   )ro  )r@   initializer_ranger8   r	   r   r=   inittrunc_normal_rD  datar   r!   r   r   r   zero_r  fill_r   r   r   rT   rb   normal_rZ   r[   r\   r]   r^   )r?   rm  ro  s      r&   _init_weights#SegGptPreTrainedModel._init_weightsb  s   kk++fryy"))455 "$!6!6v}}7I7I7L7LU]]7[bekn!6!o!r!r##"FMM {{&  &&( '--KK""$MM$$S)00$&GG$9$9  %%((7 %: % b!!''(	 ! %'GG$9$9  %%((7 %: % b!!''(	 !  011.0gg.C.C**//225==A /D / b++112	 &&+ HHMM!!&"3"3!=HHMM!!&"<"<#!FHHMM!!&"="=3!GHHMM!!&"<"<#!FHHMM!!&"<"<#!F 2r%   r   )r   r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r	   r   r=   r  rw  r$   r   r%   r&   rk  rk  Z  sR    L$O&*#+];&GE"))RYY*L$M &GRV &Gr%   rk  c                   J  ^  \ rS rSrS\4U 4S jjrS\4S jrS\\	\
\	   4   SS4S jr\       SS	\R                  S
\R                  S\R                  S\\R                      S\\   S\\   S\\R&                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )SegGptModeli  r@   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r2   r3   r@   rT   rN   r  encoder	post_initr   s     r&   r3   SegGptModel.__init__  s9     *62$V, 	r%   rV   c                 .    U R                   R                  $ r   )rN   r_   r   s    r&   get_input_embeddings SegGptModel.get_input_embeddings  s    ///r%   heads_to_pruneNc                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  layerr  prune_heads)r?   r  r  headss       r&   _prune_headsSegGptModel._prune_heads  s<    
 +002LELLu%//;;EB 3r%   rJ   rx   prompt_masksry   r  rz   labelsr   r*  r+  c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOSnU R                  R
                  R                  R                  R                  nUR                  U5      nUR                  U5      n[        R                  " X!4SS9nUc  [        R                  " X34SS9O[        R                  " X74SS9nUc  Ub  [        R                  S5        Uc  U R                  R
                  R                  n[        R                  " US-  [        R                   UR"                  S9n[        R$                  " XS-  -
  [        R                   UR"                  S9n[        R                  " X/5      nUR'                  S5      nU R	                  XXdS9nU R)                  UUUU	U
S9nU$ )	a	  
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
    [`SegGptImageProcessor.__call__`] for details.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
    details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
feature_ensemble (`bool`, *optional*):
    Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
    if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
    be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
embedding_type (`str`, *optional*):
    Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
    instance or semantic.
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
    Ground truth mask for input images.

Examples:

```python
>>> from transformers import SegGptImageProcessor, SegGptModel
>>> from PIL import Image
>>> import requests

>>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
>>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
>>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

>>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
>>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
>>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

>>> checkpoint = "BAAI/seggpt-vit-large"
>>> model = SegGptModel.from_pretrained(checkpoint)
>>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

>>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

>>> outputs = model(**inputs)
>>> list(outputs.last_hidden_state.shape)
[1, 56, 28, 1024]
```
FrF   r~   zLabels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos.r   r   )rz   ry   )r  r   r*  r+  )r@   r   r*  use_return_dictrN   r_   r>   rD  r   r   r!   r   loggerwarning_oncer<   rY   r  r   rC  r   r  )r?   rJ   rx   r  ry   r  rz   r  r   r*  r+  expected_dtyper<   bool_masked_pos_zerosbool_masked_pos_onesembedding_outputencoder_outputss                    r&   rO   SegGptModel.forward  s   v 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]/?/K+QV99DDKKQQ#~6144^D yy"5!D!L ~ II|2:L1q9 	 "v'9 m "//::FFK$)KKq0@

[g[n[n$o!#(::Q..ejjI\I\$  $ii)>(UVO-77:O??n + 
 ,,-/!5# ' 
 r%   )r@   rN   r  NNNNNNN)r   r   r   r   r   r3   r-   r  r   r   r   r  r   r!   r   r   r   r  r   r"   r   r   r   rO   r$   rQ   rR   s   @r&   r  r    s$   | 0&; 0C4T#Y+? CD C  7;+/(,.2,0/3&*kllk #\\k ll	k
 "%"2"23k #4.k !k **+k $D>k 'tnk d^k 
u))	*k kr%   r  tensorr5   c                     U R                   u  p#pEXA-  nXQ-  nU R                  X#XaXq4S9n U R                  SSSSSS5      n U R                  X&U-  US-  S-  4S9n U $ )Nre  r   rF   r   r   rd  r   )rG   rs   rI   )r  r5   rK   r6   rL   rM   r   r   s           r&   patchifyr    sz    .4ll+Jf'L%K^^:\Wb"o^pF^^Aq!Q1-F^^:k/I:WX=[\K\"]^^FMr%   r   r   c           	      N   U R                   S   n[        U R                   S   S-  S-  5      nX-  U R                   S   :w  a"  [        SU R                   S    SU SU S	35      eU R                  X1X$US4S
9n U R	                  SSSSSS5      n U R                  USX-  X$-  4S
9n U $ )Nr   rj   r   ri   r   zNumber of patches z does not match patch height (z) and width (rE   re  rd  rF   r   )rG   r   rH   rs   rI   )r  r   r   rK   r5   s        r&   
unpatchifyr    s    aJfll2&*s23J!V\\!_4 a 11OP\~]jkvjwwyz
 	
 ^^:[V`bc"d^eF^^Aq!Q1-F^^:q,2K[Me"f^gFMr%   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  S\R                  4S jrSr	U =r
$ )	
SegGptLossi,  c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )r2   r3   betar5   r   s     r&   r3   SegGptLoss.__init__-  s&    KK	 ++r%   r  r+   r  ry   c                    [         R                  " X4SS9nUSS2SS2S4   R                  SSU R                  S-  S-  5      n[	        XeR
                  S   U R                  -  UR
                  S   U R                  -  5      n[        R                  " X%SU R                  S9nXv-  R                  5       UR                  5       -  nU$ )a  Computes the L1 loss between the predicted masks and the ground truth masks.

Args:
    prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Pixel values from mask prompt.

    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
        Predicted masks.

    labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Ground truth mask for input images.

    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
        Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Returns:
    `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
rF   r~   Nr   r   none)	reductionr  )
r!   r   repeatr5   r  rG   rq   smooth_l1_lossr  sum)r?   r  r+   r  ry   ground_truthmaskr*   s           r&   rO   SegGptLoss.forward2  s    2 yy,!7Q?q!Tz*11!Q8JQ8NO$ 2 21 5 H,J\J\]^J_cgcrcrJrs
FQUQZQZ[  "TXXZ/r%   )r  r5   )r   r   r   r   r3   r!   r"   r   rO   r$   rQ   rR   s   @r&   r  r  ,  sP    ,
!''! %%! !!	!
 ))! !r%   r  zM
    SegGpt model with a decoder on top for one-shot image segmentation.
    )custom_introc                     ^  \ rS rSrS\4U 4S jjr\       SS\R                  S\R                  S\R                  S\	\R                     S\	\   S	\	\   S
\	\R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )SegGptForImageSegmentationiV  r@   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r2   r3   r@   r  rl  r]  decoderr  r   s     r&   r3   #SegGptForImageSegmentation.__init__\  s9      (
$V, 	r%   rJ   rx   r  ry   r  rz   r  r   r*  r+  rV   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Uc  U R                  R
                  R                  R                  n[        R                  " US-  [        R                  UR                  S9n[        R                  " XS-  -
  [        R                  UR                  S9n[        R                  " X/5      nUR                  S5      nU R	                  UUUUUUUUU	U
S9
nU
(       a  UR                  OUS   n[        R                  " USS9nU R!                  U5      nSnUb  [#        U R                   5      nU" UUXt5      nU
(       d9  U4nU	(       a	  UUS   4-   nU(       a  U	(       a  SOSnUUU   4-   nUb  U4U-   nU$ [%        UUUR&                  UR(                  S	9$ )
a)
  
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
    [`SegGptImageProcessor.__call__`] for details.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
    details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
feature_ensemble (`bool`, *optional*):
    Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
    if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
    be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
embedding_type (`str`, *optional*):
    Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
    instance or semantic.
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
    Ground truth mask for input images.

Examples:

```python
>>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
>>> from PIL import Image
>>> import requests

>>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
>>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
>>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

>>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
>>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
>>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

>>> checkpoint = "BAAI/seggpt-vit-large"
>>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
>>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

>>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
>>> outputs = model(**inputs)
>>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
>>> print(list(result.shape))
[170, 297]
```
NrF   r   r   )
rJ   rx   r  ry   r  rz   r  r   r*  r+  rj   r~   r   )r*   r+   r   r   )r@   r   r*  r  rl  rN   r_   r<   r!   rY   r  r   rC  r   r   r   r  r  r(   r   r   )r?   rJ   rx   r  ry   r  rz   r  r   r*  r+  r<   r  r  r  r   r+   r*   loss_fnr   idxs                        r&   rO   "SegGptForImageSegmentation.forwardf  s   v 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"**//@@LLK$)KKq0@

[g[n[n$o!#(::Q..ejjI\I\$  $ii)>(UVO-77:O**% 3%+-)/!5#  
 LWW%G%G\cdf\g"%*YY/Ir%R"\\"<=
 -G<VMD ]F#71:-/ /aQ73</16)M,!!//))	
 	
r%   )r@   r  rl  r  )r   r   r   r   r   r3   r   r!   r   r   r   r  r   r"   r   r   r(   rO   r$   rQ   rR   s   @r&   r  r  V  s    |   7;+/(,.2,0/3&*q
llq
 #\\q
 ll	q

 "%"2"23q
 #4.q
 !q
 **+q
 $D>q
 'tnq
 d^q
 
u33	4q
 q
r%   r  )r  rk  r  )r   F)8r    collections.abcr9   dataclassesr   typingr   r   r   r   r   r!   torch.utils.checkpointr	   torch.nnr
   rq   activationsr   modeling_utilsr   utilsr   r   r   r   configuration_seggptr   
get_loggerr   r  r   r(   Moduler-   rT   r   r   r   r   r  r   r   r   r  r?  rR  r]  rk  r  r   r  r  r  r  __all__r   r%   r&   <module>r     s!     ! 5 5    $ ! - D D . 
		H	% J+ J J0 :K : :0 BII  FRryy RjK4bii K4^		 U\\ e T V[VbVb *-RYY -,")) ,^B
BII B
Lbii <		 0BII B -GO -G -G` B' B BJ	U\\ 	s 	u|| 	u|| 3 S U\\ ' 'T 
}
!6 }

}
@ Qr%   