"""PyTorch Dilated Neighborhood Attention Transformer model."""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    OptionalDependencyNotAvailable,
    auto_docstring,
    is_natten_available,
    logging,
    requires_backends,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinat import DinatConfig


if is_natten_available():
    from natten.functional import natten2dav, natten2dqkrpb
else:

    def natten2dav(*args, **kwargs):
        raise OptionalDependencyNotAvailable()

    def natten2dqkrpb(*args, **kwargs):
        raise OptionalDependencyNotAvailable()


logger = logging.get_logger(__name__)


@dataclass
class DinatEncoderOutput(ModelOutput):
    """
    Dinat encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__r&   r   r'   r(   r!   r!   r!   r"   r$   ;   s   
 r$   c                   @      e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	DinatModelOutputaU  
    Dinat model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class DinatImageClassifierOutput(ModelOutput):
    """
    Dinat outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlosslogits.r&   r'   r(   )r)   r*   r+   r,   r5   r   r-   r.   r/   r6   r&   r   r'   r(   r!   r!   r!   r"   r4      r3   r4   c                       s>   e Zd ZdZ fddZdeej deej	 fddZ
  ZS )DinatEmbeddingsz6
    Construct the patch and position embeddings.
    c                    s4   t    t|| _t|j| _t|j	| _
d S r   )super__init__DinatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__r!   r"   r9      s   

zDinatEmbeddings.__init__pixel_valuesreturnc                 C   s"   |  |}| |}| |}|S r   )r;   r>   rA   )rC   rG   
embeddingsr!   r!   r"   forward   s   


zDinatEmbeddings.forward)r)   r*   r+   r,   r9   r   r-   r.   r   TensorrJ   __classcell__r!   r!   rE   r"   r7      s    &r7   c                       s:   e Zd ZdZ fddZdeej dejfddZ	  Z
S )r:   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        patch_size = config.patch_size
        num_channels, hidden_size = config.num_channels, config.embed_dim
        self.num_channels = num_channels

        if patch_size == 4:
            pass
        else:
            # TODO: Support arbitrary patch sizes.
            raise ValueError("Dinat only supports patch size of 4 at the moment.")

        self.projection = nn.Sequential(
            nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
        )

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.projection(pixel_values)
        embeddings = embeddings.permute(0, 2, 3, 1)

        return embeddings


class DinatDownsampler(nn.Module):
    """
    Convolutional Downsampling Layer.

    Args:
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.dim = dim
        self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        self.norm = norm_layer(2 * dim)

    def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
        input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
        input_feature = self.norm(input_feature)
        return input_feature


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
class DinatDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class NeighborhoodAttention(nn.Module):
    def __init__(self, config, dim, num_heads, kernel_size, dilation):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.kernel_size = kernel_size
        self.dilation = dilation

        # rpb is learnable relative positional biases; same concept is used Swin.
        self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 3, 1, 2, 4)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Apply the scale factor before computing attention weights. It's usually more efficient because
        # attention weights are typically a bigger tensor compared to query. It gives identical results.
        query_layer = query_layer / math.sqrt(self.attention_head_size)

        # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation)
        context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class NeighborhoodAttentionOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class NeighborhoodAttentionModule(nn.Module):
    def __init__(self, config, dim, num_heads, kernel_size, dilation):
        super().__init__()
        self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
        self.output = NeighborhoodAttentionOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class DinatIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class DinatOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class DinatLayer(nn.Module):
    def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.kernel_size = config.kernel_size
        self.dilation = dilation
        self.window_size = self.kernel_size * self.dilation
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = NeighborhoodAttentionModule(
            config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
        )
        self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = DinatIntermediate(config, dim)
        self.output = DinatOutput(config, dim)
        self.layer_scale_parameters = (
            nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
            if config.layer_scale_init_value > 0
            else None
        )

    def maybe_pad(self, hidden_states, height, width):
        window_size = self.window_size
        pad_values = (0, 0, 0, 0, 0, 0)
        if height < window_size or width < window_size:
            pad_l = pad_t = 0
            pad_r = max(0, window_size - width)
            pad_b = max(0, window_size - height)
            pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
            hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size, height, width, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)
        # pad hidden_states if they are smaller than kernel size x dilation
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape

        attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)

        attention_output = attention_outputs[0]

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_output = attention_output[:, :height, :width, :].contiguous()

        if self.layer_scale_parameters is not None:
            attention_output = self.layer_scale_parameters[0] * attention_output

        hidden_states = shortcut + self.drop_path(attention_output)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.output(self.intermediate(layer_output))

        if self.layer_scale_parameters is not None:
            layer_output = self.layer_scale_parameters[1] * layer_output

        layer_output = hidden_states + self.drop_path(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class DinatStage(nn.Module):
    def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.layers = nn.ModuleList(
            [
                DinatLayer(
                    config=config,
                    dim=dim,
                    num_heads=num_heads,
                    dilation=dilations[i],
                    drop_path_rate=drop_path_rate[i],
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        _, height, width, _ = hidden_states.size()
        for i, layer_module in enumerate(self.layers):
            layer_outputs = layer_module(hidden_states, output_attentions)
            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            hidden_states = self.downsample(hidden_states_before_downsampling)

        stage_outputs = (hidden_states, hidden_states_before_downsampling)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class DinatEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_levels = len(config.depths)
        self.config = config
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
        self.levels = nn.ModuleList(
            [
                DinatStage(
                    config=config,
                    dim=int(config.embed_dim * 2**i_layer),
                    depth=config.depths[i_layer],
                    num_heads=config.num_heads[i_layer],
                    dilations=config.dilations[i_layer],
                    drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None,
                )
                for i_layer in range(self.num_levels)
            ]
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, DinatEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            # rearrange b h w c -> b c h w
            reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.levels):
            layer_outputs = layer_module(hidden_states, output_attentions)

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]

            if output_hidden_states and output_hidden_states_before_downsampling:
                # rearrange b h w c -> b c h w
                reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                # rearrange b h w c -> b c h w
                reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[2:]

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return DinatEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


@auto_docstring
class DinatPreTrainedModel(PreTrainedModel):
    config_class = DinatConfig
    base_model_prefix = "dinat"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class DinatModel(DinatPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        requires_backends(self, ["natten"])

        self.config = config
        self.num_levels = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))

        self.embeddings = DinatEmbeddings(config)
        self.encoder = DinatEncoder(config)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, DinatModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]

            return output

        return DinatModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
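
# A minimal usage sketch for the bare model (illustrative comment, not library code;
# it assumes the `natten` package is installed and uses the `shi-labs/dinat-mini-in1k-224`
# checkpoint, with `image` being any PIL image):
#
#     from transformers import AutoImageProcessor, DinatModel
#
#     processor = AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224")
#     model = DinatModel.from_pretrained("shi-labs/dinat-mini-in1k-224")
#     inputs = processor(images=image, return_tensors="pt")
#     outputs = model(**inputs)  # see `DinatModelOutput` above for the returned fields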


@auto_docstring(
    custom_intro="""
    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """
)
class DinatForImageClassification(DinatPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        requires_backends(self, ["natten"])

        self.num_labels = config.num_labels
        self.dinat = DinatModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, DinatImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
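
        For illustration (a hedged example, not from the original docstring): with `num_labels > 1` and integer
        targets, the problem type resolves to single-label classification below, so `labels = torch.tensor([1])`
        would be a valid target for a batch containing one image.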
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.dinat(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return DinatImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    NAT backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        requires_backends(self, ["natten"])

        self.embeddings = DinatEmbeddings(config)
        self.encoder = DinatEncoder(config)
        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 512, 7, 7]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=True,
            output_hidden_states_before_downsampling=True,
            return_dict=True,
        )

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                batch_size, num_channels, height, width = hidden_state.shape
                hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = ["DinatForImageClassification", "DinatModel", "DinatPreTrainedModel", "DinatBackbone"]