"""PyTorch YOLOS model."""

import collections.abc
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_yolos import YolosConfig


logger = logging.get_logger(__name__)


@dataclass
class YolosObjectDetectionOutput(ModelOutput):
    r"""
    Output type of [`YolosForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
            boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: Optional[torch.FloatTensor] = None
    pred_boxes: Optional[torch.FloatTensor] = None
    auxiliary_outputs: Optional[List[Dict]] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class YolosEmbeddings(nn.Module):
    """
    Construct the CLS token, detection tokens, position and patch embeddings.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.detection_tokens = nn.Parameter(torch.zeros(1, config.num_detection_tokens, config.hidden_size))
        self.patch_embeddings = YolosPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(
            torch.zeros(1, num_patches + config.num_detection_tokens + 1, config.hidden_size)
        )

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.interpolation = InterpolateInitialPositionEmbeddings(config)
        self.config = config

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # add the [CLS] and detection tokens to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        detection_tokens = self.detection_tokens.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings, detection_tokens), dim=1)

        # add positional encoding to each token, interpolating the pretrained
        # position embeddings to the current image resolution if necessary
        position_embeddings = self.interpolation(self.position_embeddings, (height, width))

        embeddings = embeddings + position_embeddings
        embeddings = self.dropout(embeddings)

        return embeddings


class InterpolateInitialPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(1, 2)
        batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)

        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
        return scale_pos_embed


class InterpolateMidPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, :, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, :, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, :, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(2, 3)
        depth, batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(depth * batch_size, hidden_size, patch_height, patch_width)
        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = (
            patch_pos_embed.flatten(2)
            .transpose(1, 2)
            .contiguous()
            .view(depth, batch_size, new_patch_height * new_patch_width, hidden_size)
        )
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=2)
        return scale_pos_embed


class YolosPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities (in float32 for stability).
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class YolosSelfAttention(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class YolosSelfOutput(nn.Module):
    """
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class YolosAttention(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.attention = YolosSelfAttention(config)
        self.output = YolosSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class YolosIntermediate(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class YolosOutput(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class YolosLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = YolosAttention(config)
        self.intermediate = YolosIntermediate(config)
        self.output = YolosOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in YOLOS, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in YOLOS, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class YolosEncoder(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([YolosLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        seq_length = (
            1 + (config.image_size[0] * config.image_size[1] // config.patch_size**2) + config.num_detection_tokens
        )
        self.mid_position_embeddings = (
            nn.Parameter(torch.zeros(config.num_hidden_layers - 1, 1, seq_length, config.hidden_size))
            if config.use_mid_position_embeddings
            else None
        )
        self.interpolation = InterpolateMidPositionEmbeddings(config) if config.use_mid_position_embeddings else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        height,
        width,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if self.config.use_mid_position_embeddings:
            interpolated_mid_position_embeddings = self.interpolation(self.mid_position_embeddings, (height, width))

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if self.config.use_mid_position_embeddings:
                if i < (self.config.num_hidden_layers - 1):
                    hidden_states = hidden_states + interpolated_mid_position_embeddings[i]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class YolosPreTrainedModel(PreTrainedModel):
    config_class = YolosConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class YolosModel(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = YolosEmbeddings(config)
        self.encoder = YolosEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = YolosPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> YolosPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model.

        Args:
            heads_to_prune (`dict`):
                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
                list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare the head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            height=pixel_values.shape[-2],
            width=pixel_values.shape[-1],
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class YolosPooler(nn.Module):
    def __init__(self, config: YolosConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class YolosMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


@auto_docstring(
    custom_intro="""
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    """
)
class YolosForObjectDetection(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig):
        super().__init__(config)

        # YOLOS (ViT) encoder model
        self.vit = YolosModel(config, add_pooling_layer=False)

        # Object detection heads: we add one class for the "no object" category
        self.class_labels_classifier = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=config.num_labels + 1, num_layers=3
        )
        self.bbox_predictor = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=4, num_layers=3
        )

        # Initialize weights and apply final processing
        self.post_init()

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionaries with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: Optional[List[Dict]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, YolosObjectDetectionOutput]:
        r"""
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # First, send images through the YOLOS base model to obtain hidden states
        outputs = self.vit(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Take the final hidden states of the detection tokens
        sequence_output = sequence_output[:, -self.config.num_detection_tokens :, :]

        # Class logits + predicted bounding boxes
        logits = self.class_labels_classifier(sequence_output)
        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()

        loss, loss_dict, auxiliary_outputs = None, None, None
        if labels is not None:
            outputs_class, outputs_coord = None, None
            if self.config.auxiliary_loss:
                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
                outputs_class = self.class_labels_classifier(intermediate)
                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
            loss, loss_dict, auxiliary_outputs = self.loss_function(
                logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
            )

        if not return_dict:
            if auxiliary_outputs is not None:
                output = (logits, pred_boxes) + auxiliary_outputs + outputs
            else:
                output = (logits, pred_boxes) + outputs
            return ((loss, loss_dict) + output) if loss is not None else output

        return YolosObjectDetectionOutput(
            loss=loss,
            loss_dict=loss_dict,
            logits=logits,
            pred_boxes=pred_boxes,
            auxiliary_outputs=auxiliary_outputs,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["YolosForObjectDetection", "YolosModel", "YolosPreTrainedModel"]