
"""PyTorch DETR model."""

import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, is_timm_available, logging, requires_backends
from ...utils.backbone_utils import load_backbone
from .configuration_detr import DetrConfig


if is_timm_available():
    from timm import create_model


logger = logging.get_logger(__name__)


@dataclass
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    r"""
Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
Nintermediate_hidden_states __name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations____static_attributes__r       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/detr/modeling_detr.pyr   r   .   s     2 ?C):): ;Br*   r   c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)DetrModelOutputL   a  
Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
        layer plus the initial embedding outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
        layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class DetrObjectDetectionOutput(ModelOutput):
    r"""
Output type of [`DetrForObjectDetection`].

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
        layer plus the initial embedding outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
        layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
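
    Example (an illustrative sketch of the conversion that [`~DetrImageProcessor.post_process_object_detection`]
    performs internally; `to_absolute_xyxy` is a hypothetical helper, not part of this module):

    ```python
    >>> import torch

    >>> def to_absolute_xyxy(pred_boxes, height, width):
    ...     # pred_boxes holds normalized (center_x, center_y, width, height) values in [0, 1]
    ...     center_x, center_y, box_w, box_h = pred_boxes.unbind(-1)
    ...     corners = torch.stack(
    ...         [center_x - 0.5 * box_w, center_y - 0.5 * box_h, center_x + 0.5 * box_w, center_y + 0.5 * box_h], dim=-1
    ...     )
    ...     # scale by the image size to obtain absolute (x_min, y_min, x_max, y_max) pixel coordinates
    ...     return corners * torch.tensor([width, height, width, height], dtype=corners.dtype)
    ```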
Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statedecoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsr   )r!   r"   r#   r$   r%   r2   r   r&   r'   r(   r3   r   r4   r5   r6   r   r7   r8   r   r9   r:   r;   r<   r=   r)   r   r*   r+   r0   r0   t   s   -^ )-D(5$$
%, $Ix~$*.FHU&&'..2J**+2.2xT
+259x 1 129@D8E%*;*;$<=D=Au'8'8!9:A;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:Ar*   r0   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\      \	S	'   Sr\\R                     \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)DetrSegmentationOutput   a  
Output type of [`DetrForSegmentation`].

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
        Segmentation masks logits for all queries. See also
        [`~DetrImageProcessor.post_process_semantic_segmentation`] or
        [`~DetrImageProcessor.post_process_instance_segmentation`]
        [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
        segmentation masks respectively.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
        layer plus the initial embedding outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
        layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    pred_masks: torch.FloatTensor = None
    auxiliary_outputs: Optional[List[Dict]] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None


class DetrFrozenBatchNorm2d(nn.Module):
    """
BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    """

    def __init__(self, n):
        super().__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        num_batches_tracked_key = prefix + "num_batches_tracked"
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        # move the reshapes to the beginning to make the computation user-friendly
        weight = self.weight.reshape(1, -1, 1, 1)
        bias = self.bias.reshape(1, -1, 1, 1)
        running_var = self.running_var.reshape(1, -1, 1, 1)
        running_mean = self.running_mean.reshape(1, -1, 1, 1)
        epsilon = 1e-5
        scale = weight * (running_var + epsilon).rsqrt()
        bias = bias - running_mean * scale
        return x * scale + bias


def replace_batch_norm(model):
    r"""
    Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.

    Args:
        model (torch.nn.Module):
            input model
    """
    for name, module in model.named_children():
        if isinstance(module, nn.BatchNorm2d):
            new_module = DetrFrozenBatchNorm2d(module.num_features)

            if not module.weight.device == torch.device("meta"):
                new_module.weight.data.copy_(module.weight)
                new_module.bias.data.copy_(module.bias)
                new_module.running_mean.data.copy_(module.running_mean)
                new_module.running_var.data.copy_(module.running_var)

            model._modules[name] = new_module

        if len(list(module.children())) > 0:
            replace_batch_norm(module)


class DetrConvEncoder(nn.Module):
    """
Convolutional backbone, using either the AutoBackbone API or one from the timm library.

nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
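
    Example (illustrative only: shows how a padding `pixel_mask` is downsampled to a feature map's spatial size,
    mirroring what this module does for every feature map it returns; the tensor sizes are hypothetical):

    ```python
    >>> import torch
    >>> from torch import nn

    >>> pixel_mask = torch.ones(2, 800, 1066, dtype=torch.bool)
    >>> feature_map = torch.randn(2, 2048, 25, 34)
    >>> mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
    >>> mask.shape
    torch.Size([2, 25, 34])
    ```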

c                   > [         T	U ]  5         Xl        UR                  (       a  [	        U S/5        [        US0 5      nUc  0 OUR                  5       nUR                  SS5      nUR                  SUR                  5      nUR                  (       a  UR                  SS5      US'   [        UR                  4UR                  SUUS	.UD6nO[        U5      n[        R                   " 5          [#        U5        S S S 5        XPl        UR                  (       a$  U R$                  R&                  R)                  5       OU R$                  R(                  U l        S nUR                  b  UR                  nO/UR,                  b  UR,                  R.                  nO[1        S
5      eSU;   a  U R$                  R3                  5        Hj  u  pxUR                  (       a+  SU;  a#  SU;  a  SU;  a  UR5                  S5        M;  M=  M?  MA  SU;  d  MI  SU;  d  MQ  SU;  d  MY  UR5                  S5        Ml     g g ! , (       d  f       GN@= f)Ntimmbackbone_kwargsout_indices)r      r      in_chansoutput_stride   T)
pretrainedfeatures_onlyr   r   zGEither `backbone` or `backbone_config` should be provided in the configresnetlayer2layer3layer4Fzstage.1zstage.2zstage.3)rJ   rK   configuse_timm_backboner   getattrcopypopnum_channelsdilationgetr   backboneuse_pretrained_backboner   r&   no_gradrw   rx   feature_infochannelsintermediate_channel_sizesbackbone_config
model_type
ValueErrornamed_parametersrequires_grad_)
rO   r   kwargsr   r   r   backbone_model_typery   	parameterrQ   s
            r+   rK   DetrConvEncoder.__init__B  s    ## dVH-V%6;F!>Rv{{}F **]LAK!::j&2E2EFL*0**_b*I'#!99"'% H %V,H ]]_x( 
282J2JDJJ##,,.PTPZPZPcPc 	' #??&"(//##/"("8"8"C"Cfgg**#'::#>#>#@++t+0DY]I]!007 J^0D+ !,$1F9\`K`!007 $A + _s   0H55
Ipixel_values
pixel_maskc                    U R                   R                  (       a  U R                  U5      OU R                  U5      R                  n/ nU Hq  n[        R
                  R                  US    R                  5       UR                  SS  S9R                  [        R                  5      S   nUR                  XV45        Ms     U$ )N)sizer   )r   r   rx   feature_mapsr
   
functionalinterpolatefloatshapetor&   boolappend)rO   r   r   featuresoutfeature_mapmasks          r+   rf   DetrConvEncoder.forwardv  s    /3{{/L/L4::l+RVR\R\]iRjRwRw#K==,,Z-=-C-C-EKL]L]^`^aLb,cffglgqgqrstuDJJ*+ $ 
r*   )r   r   rx   )r!   r"   r#   r$   r%   rK   r&   r	   rf   r)   rh   ri   s   @r+   r}   r}   :  s.    28h	ELL 	ell 	 	r*   r}   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )DetrConvModeli  zh
    This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
    """

    def __init__(self, conv_encoder, position_embedding):
        super().__init__()
        self.conv_encoder = conv_encoder
        self.position_embedding = position_embedding

    def forward(self, pixel_values, pixel_mask):
        # send pixel_values and pixel_mask through the backbone to obtain (feature_map, pixel_mask) tuples
        out = self.conv_encoder(pixel_values, pixel_mask)
        pos = []
        for feature_map, mask in out:
            # position encoding
            pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))

        return out, pos


class DetrSinePositionEmbedding(nn.Module):
    """
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    """

    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, pixel_values, pixel_mask):
        if pixel_mask is None:
            raise ValueError("No pixel mask provided")
        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale

        dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class DetrLearnedPositionEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, embedding_dim=256):
        super().__init__()
        self.row_embeddings = nn.Embedding(50, embedding_dim)
        self.column_embeddings = nn.Embedding(50, embedding_dim)

    def forward(self, pixel_values, pixel_mask=None):
        height, width = pixel_values.shape[-2:]
        width_values = torch.arange(width, device=pixel_values.device)
        height_values = torch.arange(height, device=pixel_values.device)
        x_emb = self.column_embeddings(width_values)
        y_emb = self.row_embeddings(height_values)
        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
        pos = pos.permute(2, 0, 1)
        pos = pos.unsqueeze(0)
        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(config):
    n_steps = config.d_model // 2
    if config.position_embedding_type == "sine":
        position_embedding = DetrSinePositionEmbedding(n_steps, normalize=True)
    elif config.position_embedding_type == "learned":
        position_embedding = DetrLearnedPositionEmbedding(n_steps)
    else:
        raise ValueError(f"Not supported {config.position_embedding_type}")

    return position_embedding


class DetrAttention(nn.Module):
    """
Multi-headed attention from 'Attention Is All You Need' paper.

Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
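
    Example (a conceptual sketch only; the actual layer below uses learned query/key/value projections, multiple
    heads and dropout, and `attention_with_object_queries` is a hypothetical illustration):

    ```python
    >>> import torch

    >>> def attention_with_object_queries(hidden_states, object_queries):
    ...     # position (object query) embeddings are added to the queries and keys, but not to the values
    ...     queries = hidden_states + object_queries
    ...     keys = hidden_states + object_queries
    ...     values = hidden_states
    ...     scores = queries @ keys.transpose(-1, -2) / (hidden_states.shape[-1] ** 0.5)
    ...     return torch.softmax(scores, dim=-1) @ values
    ```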
	embed_dim	num_headsdropoutrG   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        [        R                  " XUS9U l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      ࿩rG   )rJ   rK   r   r   r  head_dimr   scalingr
   Lineark_projv_projq_projout_proj)rO   r   r   r  rG   rQ   s        r+   rK   DetrAttention.__init__  s     	""!.==9$6MdnnM] ^;b"  }}d*ii	4@ii	4@ii	4@		)TBr*   tensorseq_len
batch_sizec                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )viewr   r  	transpose
contiguous)rO   r  r  r  s       r+   _shapeDetrAttention._shape  s5    {{:NXXYZ\]^iikkr*   object_queriesc                     Uc  U$ X-   $ r   r   )rO   r  r  s      r+   with_pos_embedDetrAttention.with_pos_embed  s    '/vLV5LLr*   hidden_statesattention_maskkey_value_statesspatial_position_embeddingsoutput_attentionsreturnc                 z   USLnUR                  5       u  pn
Ub  UnU R                  X5      nUb  UnU R                  XE5      nU R                  U5      U R                  -  nU(       aE  U R	                  U R                  U5      SU5      nU R	                  U R                  W5      SU5      nODU R	                  U R                  U5      SU5      nU R	                  U R                  W5      SU5      nXR                  -  SU R                  4nU R	                  XU5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  U	U4:w  a.  [        SXR                  -  U	U4 SUR                  5        35      eUbz  UR                  5       USU	U4:w  a#  [        SUSU	U4 SUR                  5        35      eUR                  XR                  U	U5      U-   nUR                  XR                  -  U	U5      n[        R                  R!                  USS9nU(       a=  UR                  XR                  U	U5      nUR                  XR                  -  U	U5      nOSn[        R                  R#                  UU R"                  U R$                  S	9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a5  [        S
XR                  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR'                  XU
5      nU R)                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNr`   r   r   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   ptrainingz `attn_output` should be of size )r   r  r
  r  r  r  r	  r   r  r  r&   bmmr  r   r
   r   softmaxr  r#  ra   r  )rO   r  r  r  r  r  r  is_cross_attentionr  
target_lenr   hidden_states_originalkey_value_states_originalquery_states
key_statesvalue_states
proj_shape
source_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                         r+   rf   DetrAttention.forward  s    .T9,9,>,>,@)
	 %%2" //NM '2(8%#223Ca {{=1DLL@T[[1A%BB
SJ;;t{{3L'MrS]^L T[[%?ZPJ;;t{{3I'JBPZ[L >>12t}}E
{{<ZHMMzZ__j1
#((*5__Q'
yy/C/CAq/IJ:#>
J"WW6
^^8SU_ak7l6m n %%'(* 
 %""$Q
J(OO 7Q
T^8_7` a&++-.0  (,,ZU_`cqqL',,Z..-H*V`aL}},,\r,B
 %1$5$5j..R\^h$i!055j>>6QS]_ijL$(!]]**<4<<RVR_R_*`
ii
L9*~~"=z==!YY2JPZ\i\i3j2k l$$&') 
 "&&z>>:}}]!++Aq1!))*)LmmK0111r*   )	r  r   r  r  r   r  r
  r  r	  )        T)NNNNF)r!   r"   r#   r$   r%   intr   r   rK   r&   r	   r  r   r  r   rf   r)   rh   ri   s   @r+   r   r     s>    CC C 	C
 C C0lU\\ lC lS lMU\\ M8FCS M 261537>B"'Y2||Y2 !.Y2 !.	Y2
 #5<<0Y2 &.ell%;Y2  Y2 
u||Xell3XeELL>Q5RR	SY2 Y2r*   r   c            	          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\R                  S\\R                     S\	4S jjr
S	rU =r$ )DetrEncoderLayeria  r   c                 h  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )Nr   r   r  )rJ   rK   r   r   r   encoder_attention_headsattention_dropout	self_attnr
   	LayerNormself_attn_layer_normr  r   activation_functionactivation_fnactivation_dropoutr  encoder_ffn_dimfc1fc2final_layer_normrO   r   rQ   s     r+   rK   DetrEncoderLayer.__init__b  s    &nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r*   r  r  r  r  c                    UnU R                  UUUUS9u  p[        R                  R                  XR                  U R                  S9nXQ-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXQ-   nU R                  U5      nU R                  (       a  [        R                  " U5      R                  5       (       d)  [        R                  " U5      R                  5       (       aC  [        R                  " UR                   5      R"                  S-
  n[        R$                  " X* US9nU4nU(       a  X4-  nU$ )as  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
        values.
    object_queries (`torch.FloatTensor`, *optional*):
        Object queries (also called content embeddings), to be added to the hidden states.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r  r  r  r  r!  i  )minmax)r<  r
   r   r  r#  r>  r@  rC  rA  rD  rE  r&   isinfanyisnanfinfor   rJ  clamp)	rO   r  r  r  r  residualr/  clamp_valueoutputss	            r+   rf   DetrEncoderLayer.forwardr  sy   & !&*nn'))/	 '5 '
# --m||VZVcVc-d 011-@ **488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<=={{=)--//5;;}3M3Q3Q3S3S#kk-*=*=>BBTI %M|Q\ ] "&Gr*   )	rA  r@  r  r   rC  rD  rE  r<  r>  NFr!   r"   r#   r$   r   rK   r&   r	   r   r   rf   r)   rh   ri   s   @r+   r7  r7  a  sX    =z =( 26"'3||3 3 !.	3
  3 3r*   r7  c                     ^  \ rS rSrS\4U 4S jjr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   4S jjr
SrU =r$ )DetrDecoderLayeri  r   c                   > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  S9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )Nr9  )r  )rJ   rK   r   r   r   decoder_attention_headsr;  r<  r  r   r?  r@  rA  r
   r=  r>  encoder_attnencoder_attn_layer_normr  decoder_ffn_dimrC  rD  rE  rF  s     r+   rK   DetrDecoderLayer.__init__  s   &nn44,,

 ~~#F$>$>?"(";";$&LL$@!)NN**,,

 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r*   r  r  r  query_position_embeddingsr<   encoder_attention_maskr  c           	          UnU R                  UUUUS9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSn
Ub_  UnU R                  UUUUUUS9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU(       a  XU
4-  nU$ )ag  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
        values.
    object_queries (`torch.FloatTensor`, *optional*):
        object_queries that are added to the hidden states
    in the cross-attention layer.
    query_position_embeddings (`torch.FloatTensor`, *optional*):
        position embeddings that are added to the queries and keys
    in the self-attention layer.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
        values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r  r  r  r  r!  N)r  r  r  r  r  r  )r<  r
   r   r  r#  r>  rZ  r[  r@  rC  rA  rD  rE  )rO   r  r  r  r^  r<   r_  r  rP  self_attn_weightscross_attn_weightsrR  s               r+   rf   DetrDecoderLayer.forward  s   > ! ,0>>'4)/	 ,: ,
( --m||VZVcVc-d 011-@ " ,$H040A0A+8!65,:"3 1B 1-M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< "+=>>Gr*   )rA  r@  r  r   rZ  r[  rC  rD  rE  r<  r>  )NNNNNFrU  ri   s   @r+   rW  rW    s    =z =6 2615<@8<9=,1M||M !.M !.	M
 $,ELL#9M  (5M !) 6M $D>M Mr*   rW  c                   .    \ rS rSr\rSrSr/ SQrS r	Sr
g)DetrPreTrainedModeli  rx   r   )r}   r7  rW  c                 B   U R                   R                  nU R                   R                  n[        U[        5      (       a  [
        R                  R                  UR                  R                  5        [
        R                  R                  UR                  R                  5        [
        R                  R                  UR                  R                  US9  [
        R                  R                  UR                  R                  US9  O{[        U[        5      (       af  [
        R                  R                  UR                  R                  5        [
        R                  R                  UR                   R                  5        [        U[
        R"                  [
        R$                  [
        R&                  45      (       aW  UR                  R(                  R+                  SUS9  UR                  b%  UR                  R(                  R-                  5         g g [        U[
        R.                  5      (       ad  UR                  R(                  R+                  SUS9  UR0                  b2  UR                  R(                  UR0                     R-                  5         g g g )N)gainr4  )meanstd)r   init_stdinit_xavier_stdrm   DetrMHAttentionMapr
   initzeros_k_linearrG   q_linearxavier_uniform_rF   r   uniform_r   r   r  Conv2drn   rq   normal_zero_r   padding_idx)rO   rz   ri  
xavier_stds       r+   _init_weights!DetrPreTrainedModel._init_weights  s   kk""[[00
f011GGNN6??//0GGNN6??//0GG##FOO$:$:#LGG##FOO$:$:#L <==GGV2299:GGV55<<=fryy"))R^^DEE MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r*   r   N)r!   r"   r#   r$   r   config_classbase_model_prefixmain_input_name_no_split_modulesrx  r)   r   r*   r+   re  re    s    L$OV?r*   re  c                   J   ^  \ rS rSrSrS\4U 4S jjr      SS jrSrU =r	$ )DetrEncoderi0  a5  
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`DetrEncoderLayer`].

The encoder updates the flattened feature map through multiple self-attention layers.

Small tweak for DETR:

- object_queries are added to the forward pass.

Args:
    config: DetrConfig
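
    Example (a minimal sketch with hypothetical tensor sizes; in practice the flattened feature map and the
    spatial position embeddings are produced by [`DetrModel`]):

    ```python
    >>> import torch
    >>> from transformers import DetrConfig

    >>> config = DetrConfig()
    >>> encoder = DetrEncoder(config)
    >>> # a flattened 25 x 34 feature map projected to `config.d_model` channels
    >>> inputs_embeds = torch.randn(2, 850, config.d_model)
    >>> object_queries = torch.randn(2, 850, config.d_model)
    >>> outputs = encoder(inputs_embeds=inputs_embeds, object_queries=object_queries)
    >>> outputs.last_hidden_state.shape
    torch.Size([2, 850, 256])
    ```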
r   c                 "  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        U R                  5         g s  snf r   )rJ   rK   r  encoder_layerdrop	layerdropr
   
ModuleListrangeencoder_layersr7  layers	post_initrO   r   _rQ   s      r+   rK   DetrEncoder.__init__?  sl     ~~11mmuVMbMbGc$dGc!%5f%=Gc$de
 	 %es   Bc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUn[        R
                  R                  XpR                  U R                  S9nUb  [        X!R                  5      nU(       a  SOSnU(       a  SOSn	[        U R                  5       Ht  u  pU(       a  X4-   nSnU R                  (       a'  [        R                  " / 5      nXR                  :  a  SnU(       a  SnOU" UUUUS9nUS   nU(       d  Ml  XS	   4-   n	Mv     U(       a  X4-   nU(       d  [        S
 XxU	4 5       5      $ [!        XxU	S9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.

    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:

        - 1 for pixel features that are real (i.e. **not masked**),
        - 0 for pixel features that are padding (i.e. **masked**).

        [What are attention masks?](../glossary#attention-mask)

    object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Object queries that are added to the queries in each self-attention layer.

    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr!  r   FTNN)r  r  r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   .0vs     r+   	<genexpr>&DetrEncoder.forward.<locals>.<genexpr>  s     e$Sq$Ss   	r7   r  
attentions)r   r  output_hidden_statesuse_return_dictr
   r   r  r#  r   r   	enumerater  r&   randr  tupler   )rO   inputs_embedsr  r  r  r  return_dictr  encoder_statesall_attentionsiencoder_layerto_dropdropout_probabilitylayer_outputss                  r+   rf   DetrEncoder.forwardL  sq   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%--m||VZVcVc-d %7H[H[\N30d )$++ 6A#!/2B!BG}}&+jjn#&7"G , !.!"#1&7	! !.a 0  !/3C2E!E1 !74  +.>>Ne]N$Seee+Vd
 	
r*   )r  r  r  )NNNNNN
r!   r"   r#   r$   r%   r   rK   rf   r)   rh   ri   s   @r+   r  r  0  s4    z  !S
 S
r*   r  c                   P   ^  \ rS rSrSrS\4U 4S jjr         SS jrSrU =r	$ )DetrDecoderi  a  
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].

The decoder updates the query embeddings through multiple self-attention and cross-attention layers.

Some small tweaks for DETR:

- object_queries and query_position_embeddings are added to the forward pass.
- if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.

Args:
    config: DetrConfig
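
    Example (a minimal sketch with hypothetical tensor sizes, showing the extra stack of per-layer activations
    returned when `config.auxiliary_loss` is enabled):

    ```python
    >>> import torch
    >>> from transformers import DetrConfig

    >>> config = DetrConfig(auxiliary_loss=True)
    >>> decoder = DetrDecoder(config)
    >>> queries = torch.zeros(2, config.num_queries, config.d_model)
    >>> query_position_embeddings = torch.randn(2, config.num_queries, config.d_model)
    >>> encoder_hidden_states = torch.randn(2, 850, config.d_model)
    >>> outputs = decoder(
    ...     inputs_embeds=queries,
    ...     query_position_embeddings=query_position_embeddings,
    ...     encoder_hidden_states=encoder_hidden_states,
    ... )
    >>> outputs.intermediate_hidden_states.shape  # (decoder_layers, batch_size, num_queries, hidden_size)
    torch.Size([6, 2, 100, 256])
    ```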
r   c                 z  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        [
        R                  " UR                  5      U l        SU l        U R                  5         g s  snf rT  )rJ   rK   r  decoder_layerdropr  r
   r  r  decoder_layersrW  r  r=  r   	layernormgradient_checkpointingr  r  s      r+   rK   DetrDecoder.__init__  s     ~~11mmuVMbMbGc$dGc!%5f%=Gc$defnn5&+# %es   B8c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Un
UR	                  5       SS nSnUb  Ub  U[        X!R                  WS   S9-   nUb  Ub  [        XAR                  WS   S9nU R                   R                  (       a  SOSnU(       a  SOSnU(       a  SOSnU(       a  Ub  SOSn[        U R                  5       H  u  nnU(       a  UW
4-  nU R                  (       a(  [        R                  " / 5      nUU R                  :  a  ML  U R                  (       a2  U R                  (       a!  U R                  UR                   W
UUUS5      nOU" W
UUUUUUS9nUS   n
U R                   R                  (       a  U R#                  U
5      n
X4-  nU(       d  M  UUS   4-  nUc  M  UUS   4-  nM     U R#                  W
5      n
U(       a  X4-  nU R                   R                  (       a  [        R$                  " U5      nU	(       d  ['        S	 XUUU4 5       5      $ [)        U
UUUUS
9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The query embeddings that are passed into the decoder.

    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:

        - 1 for queries that are **not masked**,
        - 0 for queries that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
        in `[0, 1]`:

        - 1 for pixels that are real (i.e. **not masked**),
        - 0 for pixels that are padding (i.e. **masked**).

    object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Object queries that are added to the queries and keys in each cross-attention layer.
    query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
        , *optional*): Position embeddings that are added to the values and keys in each self-attention layer.

    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr`   )tgt_lenr   )r  r  r^  r<   r_  r  r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r  s     r+   r  &DetrDecoder.forward.<locals>.<genexpr>D  s      oA os   	)r7   r  r  r:   r   )r   r  r  r  r   r   r   auxiliary_lossr  r  r#  r&   r  r  r  _gradient_checkpointing_func__call__r  r   r  r   )rO   r  r  r<   r_  r  r^  r  r  r  r  input_shapecombined_attention_maskintermediateall_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerr  r  s                        r+   rf   DetrDecoder.forward  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$)M',,.s3K"&%*A*M&=@Z 3 3[_A '#
 !,1G1S%?&(;(;[QS_&"
 "[[77rT #7BD0d&7<Q<]rdh"+DKK"8C#!m%55!}}&+jjn#&7**t}} $ A A!**!+)*! !.!#:#1.G*?+A&7! *!,M{{)) $} = 00  =#3"55(4(]1-=,??(O #9T }5  !11 ;;%% ;;|4L 'NL`bno  
 !++%1'3
 	
r*   )r  r  r  r  r  	NNNNNNNNNr  ri   s   @r+   r  r    s=    z  "#"&!Q
 Q
r*   r  z
    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
    any specific head on top.
    )custom_introc                   d  ^  \ rS rSrS\4U 4S jjrS rS rS rS r	\
        SS\R                  S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )	DetrModeliR  r   c                   > [         TU ]  U5        [        U5      n[        U5      n[	        X#5      U l        [        R                  " UR                  S   UR                  SS9U l
        [        R                  " UR                  UR                  5      U l        [        U5      U l        [!        U5      U l        U R%                  5         g )Nr`   r   )kernel_size)rJ   rK   r}   r   r   r   r
   rs  r   r   input_projectionr   num_queriesr^  r  encoderr  decoderr  )rO   r   r   r  rQ   s       r+   rK   DetrModel.__init__Y  s      #6*08%h? !#		(*M*Mb*QSYSaSaop q)+f6H6H&..)Y&"6*"6* 	r*   c                     U R                   $ r   )r  rO   s    r+   get_encoderDetrModel.get_encoderl      ||r*   c                     U R                   $ r   )r  r  s    r+   get_decoderDetrModel.get_decodero  r  r*   c                     U R                   R                  R                  R                  5        H  u  pUR	                  S5        M     g rT  r   r   rx   r   r   rO   ry   params      r+   freeze_backboneDetrModel.freeze_backboner  s6    ==55;;LLNKD  ' Or*   c                     U R                   R                  R                  R                  5        H  u  pUR	                  S5        M     g )NTr  r  s      r+   unfreeze_backboneDetrModel.unfreeze_backbonev  s6    ==55;;LLNKD  & Or*   r   r   decoder_attention_maskencoder_outputsr  decoder_inputs_embedsr  r  r  r  c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	UR                  u  ppUR
                  nUc  [        R                  " XU4US9nU R                  X5      u  nnUS   u  nnUc  [        S5      eU R                  U5      nUR                  S5      R                  SSS5      nUS   R                  S5      R                  SSS5      nUR                  S5      nUc  U R                  UUUUUU	S9nORU	(       aK  [        U[        5      (       d6  [        US   [!        U5      S:  a  US   OS[!        U5      S:  a  US   OSS	9nU R"                  R$                  R'                  S5      R)                  U
SS5      n[        R*                  " U5      nU R-                  USUUUS   UUUU	S
9	nU	(       d  UU-   $ [/        UR0                  UR2                  UR4                  UR6                  UR0                  UR2                  UR4                  UR8                  S9$ )a  
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
    Not used by default. Can be used to mask object queries.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
    can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
    Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
    embedded representation.

Examples:

```python
>>> from transformers import AutoImageProcessor, DetrModel
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
>>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)

>>> # the last hidden states are the final query embeddings of the Transformer decoder
>>> # these are of shape (batch_size, num_queries, hidden_size)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 100, 256]
```Nr   r`   z/Backbone does not return downsampled pixel maskr   r   r   r  r  r  r  r  r  r  	r  r  r  r^  r<   r_  r  r  r  )r7   r8   r9   r:   r;   r<   r=   r   )r   r  r  r  r   rp   r&   rM   r   r   r  r   r   r  rm   r   rt   r^  rF   r   r   
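The encoder's output over the flattened feature map is returned alongside the decoder states. A small follow-up sketch, continuing the example above, that checks it against the config:

```python
# one encoder token per spatial position of the backbone feature map; hidden size matches the config
encoder_states = outputs.encoder_last_hidden_state
print(encoder_states.shape[-1] == model.config.d_model)  # True (256 for facebook/detr-resnet-50)
```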
zeros_liker  r-   r7   r  r  r:   r   )rO   r   r   r  r  r  r  r  r  r  r  r   r   r   rp   r   object_queries_listr   r   projected_feature_mapflattened_featuresr  flattened_maskr^  queriesdecoder_outputss                             r+   rf   DetrModel.forwardz  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]2>2D2D/
&$$j%%@&QJ
 )-l(O%% %RLT<NOO !% 5 5k B 3::1=EEaAN,R088;CCAq!La
 ""ll0--"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %)$B$B$I$I$S$STU$V$]$]^hjkmn$o!""#<= ,,!)&?"1!"4#1/!5# ' 

 "_44-??"1"?"?.99,==&5&G&G"1"?"?.99'6'Q'Q	
 		
r*   )r   r  r  r  r^  )NNNNNNNN)r!   r"   r#   r$   r   rK   r  r  r  r  r   r&   r'   r   
LongTensorr   r   r   r-   rf   r)   rh   ri   s   @r+   r  r  R  s   z &('  26>B7;59=A,0/3&*A
''A
 U--.A
 !)):): ;	A

 "%"3"34A
   1 12A
  ((9(9:A
 $D>A
 'tnA
 d^A
 
uU&&'8	9A
 A
r*   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )DetrMLPPredictionHeadi   z
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.

Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
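As a rough functional sketch (layer count and sizes are assumptions chosen to match a 3-layer box head), the computation is just stacked `nn.Linear` layers with ReLU in between:

```python
import torch
from torch import nn

# hypothetical standalone re-implementation: hidden_dim -> hidden_dim -> hidden_dim -> 4;
# a sigmoid is applied downstream to obtain normalized (center_x, center_y, width, height)
hidden_dim, num_layers = 256, 3
layers = nn.ModuleList(
    nn.Linear(i, o) for i, o in zip([hidden_dim] * num_layers, [hidden_dim] * (num_layers - 1) + [4])
)

def mlp_forward(x: torch.Tensor) -> torch.Tensor:
    # ReLU after every layer except the last one
    for idx, layer in enumerate(layers):
        x = layer(x) if idx == num_layers - 1 else torch.relu(layer(x))
    return x
```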

c                    > [         TU ]  5         X@l        U/US-
  -  n[        R                  " S [        U/U-   XS/-   5       5       5      U l        g )Nr   c              3   R   #    U  H  u  p[         R                  " X5      v   M     g 7fr   )r
   r  )r  rP   ks      r+   r  1DetrMLPPredictionHead.__init__.<locals>.<genexpr>  s     #g@fBIIaOO@fs   %')rJ   rK   
num_layersr
   r  zipr  )rO   	input_dim
hidden_dim
output_dimr  hrQ   s         r+   rK   DetrMLPPredictionHead.__init__	  sN    $LJN+mm#gYKRSOUVYeUe@f#ggr*   c                     [        U R                  5       HD  u  p#X R                  S-
  :  a%  [        R                  R                  U" U5      5      OU" U5      nMF     U$ )Nr   )r  r  r  r
   r   relu)rO   rc   r  layers       r+   rf   DetrMLPPredictionHead.forward  sI    !$++.HA01OOa4G0G""58,USTXA /r*   )r  r  r   ri   s   @r+   r  r     s    h r*   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
    such as COCO detection.
    c                   ^  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S\	\R                     S\	\R                     S\	\R                     S\	\R                     S	\	\R                     S
\	\\      S\	\   S\	\   S\	\   S\\\R                     \4   4S jj5       rSrU =r$ )DetrForObjectDetectioni  r   c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  S-   5      U l        [        UR                  UR                  SSS9U l
        U R                  5         g )Nr   r   r   )r  r  r  r  )rJ   rK   r  rx   r
   r  r   
num_labelsclass_labels_classifierr  bbox_predictorr  rF  s     r+   rK   DetrForObjectDetection.__init__  sr      v&
 (*yyNNF--1(
$ 4nnAZ[

 	r*   r   r   r  r  r  r  labelsr  r  r  r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      R                  5       nSu  nnnUb  Su  nnU R                   R                  (       aH  U
(       a  UR                  OUS   nU R                  U5      nU R	                  U5      R                  5       nU R                  XU R                  XR                   UU5      u  nnnU
(       d  Ub
  X4U-   U-   nOX4U-   nUb  UU4U-   $ U$ [        UUUUUUR                  UR                  UR                  UR                  UR                  UR                   UR"                  S9$ )a
  
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
    Not used by default. Can be used to mask object queries.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
    can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
    Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
    embedded representation.
labels (`List[Dict]` of len `(batch_size,)`, *optional*):
    Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
    following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
    respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
    in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
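For a single image with two annotated boxes, the expected structure looks roughly like this (class ids and coordinates below are made up; boxes are normalized `(center_x, center_y, width, height)`):

```python
import torch

# hypothetical annotations for one image in the batch
labels = [
    {
        "class_labels": torch.tensor([1, 17], dtype=torch.long),                            # one class id per box
        "boxes": torch.tensor([[0.50, 0.50, 0.20, 0.30], [0.30, 0.60, 0.10, 0.10]]),         # (cx, cy, w, h) in [0, 1]
    }
]
```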

Examples:

```python
>>> from transformers import AutoImageProcessor, DetrForObjectDetection
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
>>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)

>>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
...     0
... ]

>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
...     box = [round(i, 2) for i in box.tolist()]
...     print(
...         f"Detected {model.config.id2label[label.item()]} with confidence "
...         f"{round(score.item(), 3)} at location {box}"
...     )
Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
```)r   r  r  r  r  r  r  r  r   NNNr  r   )r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   )r   r  rx   r  r  sigmoidr  r   loss_functionrp   r0   r7   r8   r9   r:   r;   r<   r=   )rO   r   r   r  r  r  r  r   r  r  r  rR  sequence_outputr4   r5   r2   r3   r6   outputs_classoutputs_coordr  outputs                         r+   rf   DetrForObjectDetection.forward-  s   @ &1%<k$++B]B] **!#9+'"7/!5#  

 "!* --o>((9AAC
-=*i*+5(M={{))EPwAAV]^_V` $ < <\ J $ 3 3L A I I K151C1CZmUb2.D).  , -0AAGK -7373CT9%.OO(!/%77")"?"?&99$55&-&G&G")"?"?&99
 	
r*   )r  r  rx   r  )r!   r"   r#   r$   r   rK   r   r&   r'   r   r  r   dictr   r   r   r0   rf   r)   rh   ri   s   @r+   r  r    s!   z "  26>B7;59=A'+,0/3&*s
''s
 U--.s
 !)):): ;	s

 "%"3"34s
   1 12s
  ((9(9:s
 d$s
 $D>s
 'tns
 d^s
 
uU&&')BB	Cs
 s
r*   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
    such as COCO panoptic.
    c                   ^  ^  \ rS rSrS\4U 4S jjr\         SS\R                  S\	\R                     S\	\R                     S\	\R                     S\	\R                     S	\	\R                     S
\	\\      S\	\   S\	\   S\	\   S\\\R                     \4   4S jj5       rSrU =r$ )DetrForSegmentationi  r   c                 j  > [         TU ]  U5        [        U5      U l        UR                  UR
                  p2U R                  R                  R                  R                  R                  n[        X#-   US S S2   SS  U5      U l        [        X"USUR                  S9U l        U R                  5         g )Nr`   r4  )r  ri  )rJ   rK   r  detrr   r:  rx   r   r   r   DetrMaskHeadSmallConv	mask_headrl  rk  bbox_attentionr  )rO   r   hidden_sizenumber_of_headsr   rQ   s        r+   rK   DetrForSegmentation.__init__  s      +62	 (.~~v7U7U_%)YY__%=%=%J%J%e%e".)+Edd+KBC+PR]
 1osH^H^
 	r*   r   r   r  r  r  r  r   r  r  r  r  c                    U
b  U
OU R                   R                  n
UR                  u  ppUR                  nUc  [        R
                  " XU4US9nU R                  R                  R                  XS9u  nnUS   u  nnUR                  u  ppU R                  R                  R                  U5      nUR                  S5      R                  SSS5      nUS   R                  S5      R                  SSS5      nUR                  S5      nUc)  U R                  R                  R                  UUUUU	U
S9nORU
(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS	9nU R                  R                  R                   R"                  R%                  S5      R'                  USS5      n[        R(                  " U5      nU R                  R                  R+                  USUUUS   UUU	U
S
9	nUS   nU R                  R-                  U5      nU R                  R/                  U5      R1                  5       nUS   R                  SSS5      R3                  XR                   R4                  X5      nUR3                  XU5      nU R7                  UUU) S9nU R9                  UUUS   S   US   S   US   S   /5      n U R3                  XR                  R                   R:                  U R                  S   U R                  S   5      n!Su  n"n#n$Ub  Su  n%n&U R                   R<                  (       a\  U
(       a  UR>                  OUS   n'U R                  R-                  U'5      n%U R                  R/                  U'5      R1                  5       n&U RA                  UXUU!U R                   U%U&5      u  n"n#n$U
(       d)  U$b  UUU!4U$-   U-   U-   n(OUUU!4U-   U-   n(U"b  U"U#4U(-   $ U($ [C        U"U#UUU!U$URD                  URF                  URH                  URJ                  URD                  URF                  URH                  S9$ )a`
  
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
    Not used by default. Can be used to mask object queries.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
    can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
    Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
    embedded representation.
labels (`List[Dict]` of len `(batch_size,)`, *optional*):
    Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
    dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
    bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
    should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
    `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
    `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.
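Roughly, for one image with two annotated instances, the structure looks like this (all values are made up; `height` and `width` refer to the target mask resolution):

```python
import torch

height, width = 480, 640  # hypothetical mask resolution
labels = [
    {
        "class_labels": torch.tensor([1, 17], dtype=torch.long),
        "boxes": torch.tensor([[0.50, 0.50, 0.20, 0.30], [0.30, 0.60, 0.10, 0.10]]),
        "masks": torch.zeros(2, height, width, dtype=torch.float),  # one binary mask per box
    }
]
```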

Examples:

```python
>>> import io
>>> import requests
>>> from PIL import Image
>>> import torch
>>> import numpy

>>> from transformers import AutoImageProcessor, DetrForSegmentation
>>> from transformers.image_transforms import rgb_to_id

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
>>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)

>>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
>>> # Segmentation results are returned as a list of dictionaries
>>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])

>>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
>>> panoptic_seg = result[0]["segmentation"]
>>> # Get prediction score and segment_id to class_id mapping of each segment
>>> panoptic_segments_info = result[0]["segments_info"]
```Nr   )r   r`   r   r   r   r  r  r  )r   r   r  r  )r2   r3   r4   r5   rA   r6   r7   r8   r9   r:   r;   r<   r=   )&r   r  r   rp   r&   rM   r  rx   r   r  r   r   r  rm   r   rt   r^  rF   r   r   r  r  r  r  r  r  r   r  r  r  r  r   r  r?   r7   r  r  r:   ))rO   r   r   r  r  r  r  r   r  r  r  r  r   r   r   rp   r   r  r   r   r  r  r  r  r^  r  r  r  r4   r5   memory	bbox_mask	seg_masksrA   r2   r3   r6   r  r  r  r  s)                                            r+   rf   DetrForSegmentation.forward  sz   B &1%<k$++B]B]2>2D2D/
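A short follow-up sketch on reading the result, continuing the example above (the key names follow the dictionaries returned by `post_process_panoptic_segmentation`):

```python
# each entry in `segments_info` describes one segment id appearing in `panoptic_seg`
for segment in panoptic_segments_info:
    print(segment["id"], model.config.id2label[segment["label_id"]], segment["score"])
```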
&$$Z$?OJ )-		(@(@(@(e%% %RLT2=2C2C/
& $		 @ @ M 3::1=EEaAN,R088;CCAq!La
 ""iioo550--"3%9' 6 O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %)IIOO$M$M$T$T$^$^_`$a$h$h1%
! ""#<= ))//11!)&?"1!"4#1/!5# 2 

 *!, 22?CYY--o>FFH
 #++Aq!499*kkFYFY[ai"":u=
 ''te'L	NN#8)hqkRSnV^_`VabcVdfnopfqrsftEuv	^^J		0@0@0L0Lioo^`Naclcrcrsucvw
-=*i*+5(M={{))MXII^mnp^q $		 A A, O $		 8 8 F N N P151C1C
J]\i2.D).  , *j9<MMP__bqq *j9OKo]373CT9%.OO%!!/-??"1"?"?.99,==&5&G&G"1"?"?.99
 	
r*   )r  r  r  r  )r!   r"   r#   r$   r   rK   r   r&   r'   r   r  r   r
  r   r   r   r?   rf   r)   rh   ri   s   @r+   r  r    s!   z (  26>B7;59=A'+,0/3&*p
''p
 U--.p
 !)):): ;	p

 "%"3"34p
   1 12p
  ((9(9:p
 d$p
 $D>p
 'tnp
 d^p
 
uU&&')??	@p
 p
r*   r  lengthc                 |    U R                  S5      R                  S[        U5      SSS5      R                  SS5      $ )Nr   r   )r   r   r5  r   )r  r  s     r+   _expandr  s  s7    A%%aVaA>FFq!LLr*   c                   H   ^  \ rS rSrSrU 4S jrS\S\S\\   4S jrSr	U =r
$ )	r  ix  zV
Simple convolutional head, using group norm. Upsampling is done using an FPN approach.
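A minimal sketch of the FPN-style step this head repeats (channel counts and resolutions below are assumptions): project a higher-resolution backbone feature map with a 1x1 adapter, upsample the running mask features to that resolution, and sum:

```python
import torch
from torch import nn
from torch.nn import functional as F

# hypothetical sizes for one FPN stage
adapter = nn.Conv2d(1024, 64, kernel_size=1)    # 1x1 projection of a backbone feature map
x = torch.randn(1, 64, 25, 34)                  # running mask features (coarse resolution)
backbone_feature = torch.randn(1, 1024, 50, 67)

lateral = adapter(backbone_feature)
x = lateral + F.interpolate(x, size=lateral.shape[-2:], mode="nearest")  # upsample and fuse
```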
c                 |  > [         TU ]  5         US-  S:w  a  [        S5      eXS-  US-  US-  US-  US-  /n[        R                  " XSS	S
9U l        [        R                  " SU5      U l        [        R                  " XS	   SS	S
9U l        [        R                  " [        SUS	   5      US	   5      U l
        [        R                  " US	   US   SS	S
9U l        [        R                  " [        SUS   5      US   5      U l        [        R                  " US   US   SS	S
9U l        [        R                  " [        SUS   5      US   5      U l        [        R                  " US   US   SS	S
9U l        [        R                  " [        SUS   5      US   5      U l        [        R                  " US   S	SS	S
9U l        Xl        [        R                  " US   US	   S	5      U l        [        R                  " US	   US   S	5      U l        [        R                  " US   US   S	5      U l        U R-                  5        Hv  n[/        U[        R                  5      (       d  M$  [        R0                  R3                  UR4                  S	S9  [        R0                  R7                  UR8                  S5        Mx     g )N   r   zsThe hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8r   r   r   r   r   r   )padding)a)rJ   rK   r   r
   rs  lay1	GroupNormgn1lay2rI  gn2lay3gn3lay4gn4lay5gn5out_layr   adapter1adapter2adapter3modulesrm   rm  kaiming_uniform_rF   	constant_rG   )rO   r   fpn_dimscontext_dim
inter_dimsmrQ   s         r+   rK   DetrMaskHeadSmallConv.__init__}  s=   7a<) 
 !+[A-={a?OQ\`bQbdosuduv
IIc15	<<3'IIca=!Q?	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EyyA1a@		(1+z!}a@		(1+z!}a@		(1+z!}a@A!RYY''((Q(7!!!&&!,  r*   rc   r  fpnsc                    [         R                  " [        XR                  S   5      UR	                  SS5      /S5      nU R                  U5      nU R                  U5      n[        R                  R                  U5      nU R                  U5      nU R                  U5      n[        R                  R                  U5      nU R                  US   5      nUR                  S5      UR                  S5      :w  a,  [        XAR                  S5      UR                  S5      -  5      nU[        R                  R                  XR                  SS  SS9-   nU R                  U5      nU R!                  U5      n[        R                  R                  U5      nU R#                  US   5      nUR                  S5      UR                  S5      :w  a,  [        XAR                  S5      UR                  S5      -  5      nU[        R                  R                  XR                  SS  SS9-   nU R%                  U5      nU R'                  U5      n[        R                  R                  U5      nU R)                  US   5      nUR                  S5      UR                  S5      :w  a,  [        XAR                  S5      UR                  S5      -  5      nU[        R                  R                  XR                  SS  SS9-   nU R+                  U5      nU R-                  U5      n[        R                  R                  U5      nU R/                  U5      nU$ )Nr   r   r   nearest)r   moder   )r&   r   r  r   r   r#  r%  r
   r   r  r&  r'  r/  r   r   r(  r)  r0  r*  r+  r1  r,  r-  r.  )rO   rc   r  r:  cur_fpns        r+   rf   DetrMaskHeadSmallConv.forward  s    IIwq//!"45y7H7HA7NOQRSIIaLHHQKMMq!IIaLHHQKMMq!--Q(<<?affQi'gvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!--Q(<<?affQi'gvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!--Q(<<?affQi'gvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!LLOr*   )r/  r0  r1  r   r%  r'  r)  r+  r-  r#  r&  r(  r*  r,  r.  )r!   r"   r#   r$   r%   rK   r	   r   rf   r)   rh   ri   s   @r+   r  r  x  s1     -D& &F &$v, & &r*   r  c                   H   ^  \ rS rSrSrSU 4S jjrSS\\   4S jjrSr	U =r
$ )	rl  i  zdThis is a 2D attention module, which only returns the attention softmax (no multiplication by value)c                   > [         TU ]  5         X0l        X l        [        R
                  " U5      U l        [        R                  " XUS9U l        [        R                  " XUS9U l	        [        X R                  -  5      S-  U l        g )Nr  r  )rJ   rK   r   r  r
   Dropoutr  r  rp  ro  r   normalize_fact)rO   	query_dimr  r   r  rG   ri  rQ   s          r+   rK   DetrMHAttentionMap.__init__  se    "$zz'*		)dC		)dC#J$?@DHr*   r   c                     U R                  U5      n[        R                  R                  X R                  R
                  R                  S5      R                  S5      U R                  R                  5      nUR                  UR                  S   UR                  S   U R                  U R                  U R                  -  5      nUR                  UR                  S   U R                  U R                  U R                  -  UR                  S   UR                  S   5      n[        R                  " SX@R                  -  U5      nUbX  UR                  UR                  S5      R                  S5      [        R                   " UR"                  5      R$                  5      n[        R                  R'                  UR)                  S5      SS9R                  UR+                  5       5      nU R-                  U5      nU$ )Nr`   r   r   r   zbqnc,bnchw->bqnhwr   r   )rp  r
   r   conv2dro  rF   r   rG   r  r   r   r  r&   einsumrC  masked_fillrN  r   rI  r%  r   r   r  )rO   qr  r   queries_per_headkeys_per_headweightss          r+   rf   DetrMHAttentionMap.forward  s|   MM!MM  MM$8$8$B$B2$F$P$PQS$TVZVcVcVhVhi66!''!*aggaj$..$//]a]k]kJklqwwqz4>>4??dnn;\^_^e^efh^iklkrkrsukvw,,24DGZGZ4Z\ij))$..*;*E*Ea*H%++V]VcVcJdJhJhiG--''(:'CHHX,,w'r*   )r  r  ro  rC  r   rp  )r4  TNr   )r!   r"   r#   r$   r%   rK   r   r	   rf   r)   rh   ri   s   @r+   rl  rl    s"    n	I(6"2  r*   rl  )r  r  r  re  )Ar%   r   dataclassesr   typingr   r   r   r   r   r&   r	   r
   activationsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   r   r   r   utils.backbone_utilsr   configuration_detrr   r   r   
get_loggerr!   loggerr   r-   r0   r?   ModulerC   rw   r}   r   r   r   r   r   r7  rW  re  r  r  r  r  r  r  r5  r  r  rl  __all__r   r*   r+   <module>r\     sv     ! 5 5   ! B g g -  2 * ! 
		H	% C: C C: $C( $C $CN ;B ;B ;B| BB[ BB BBN$ BII $ N'0Ebii EPBII *"		 "J299 .
~2BII ~2BDryy DNfryy fR ?/ ? ?<o
% o
dm
% m
` d
# d
d
PBII * F
0 F
F
R F
- F
F
RMC M
MBII M` 8r*   