"""PyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_donut_swin import DonutSwinConfig


logger = logging.get_logger(__name__)


@dataclass
class DonutSwinEncoderOutput(ModelOutput):
    """
    DonutSwin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r    r   r   \/var/www/auris/lib/python3.10/site-packages/transformers/models/donut/modeling_donut_swin.pyr   '   s   
 r   c                   @      e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	DonutSwinModelOutputaY  
    DonutSwin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class DonutSwinImageClassifierOutput(ModelOutput):
    """
    DonutSwin outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlosslogits.r   r   r   )r   r   r   r   r&   r   r   r   r   r'   r   r   r   r   r   r   r   r    r%   n   r$   r%   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowsr   r   r    window_partition   s   $r8   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )z?
    Merges windows to produce higher resolution features.
    r+   r   r   r   r(   r)   r*   r,   )r7   r2   r4   r5   r6   r   r   r    window_reverse   s   
$r9   c                
       sr   e Zd ZdZd fdd	Zdejdededejfd	d
Z		dde	ej
 de	ej dedeej fddZ  ZS )DonutSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                    s   t    t|| _| jj}| jj| _|r tt	
dd|jnd | _|jr5tt	
d|d |j| _nd | _t|j| _t|j| _|j| _|| _d S )Nr   )super__init__DonutSwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenZuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrM   use_mask_tokenr?   	__class__r   r    r<      s   


 
zDonutSwinEmbeddings.__init__
embeddingsr4   r5   returnc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr+   g      ?r   r   r(   ZbicubicF)sizemodeZalign_cornersdim)r-   rF   r   jit
is_tracingrL   r   Zreshaper/   r   
functionalZinterpolater.   cat)rN   rR   r4   r5   r?   Znum_positionsZclass_pos_embedZpatch_pos_embedrW   Z
new_heightZ	new_widthZsqrt_num_positionsr   r   r    interpolate_pos_encoding   s(   



z,DonutSwinEmbeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posr\   c                 C   s   |j \}}}}| |\}}	| |}| \}
}}|d ur8| j|
|d}|d|}|d|  ||  }| jd urN|rI|| 	||| }n|| j }| 
|}||	fS )Nr+         ?)r-   r>   rH   rT   rE   expand	unsqueezeZtype_asrF   r\   rK   )rN   r]   r^   r\   _r6   r4   r5   rR   output_dimensionsr3   Zseq_lenZmask_tokensmaskr   r   r    forward   s   



zDonutSwinEmbeddings.forward)F)NF)r   r   r   r   r<   r   Tensorintr\   r   r   
BoolTensorboolr   re   __classcell__r   r   rP   r    r:      s    +r:   c                       sN   e Zd ZdZ fddZdd Zdeej de	ej
e	e f fdd	Z  ZS )
r=   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tj||||d| _d S )Nr   r   )Zkernel_sizeZstride)r;   r<   
image_sizerL   r6   rD   
isinstancecollectionsabcIterabler?   r@   r   Conv2d
projection)rN   rM   rk   rL   r6   hidden_sizer?   rP   r   r    r<     s   
 "z!DonutSwinPatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )rL   r   rZ   pad)rN   r]   r4   r5   
pad_valuesr   r   r    	maybe_pad  s    z"DonutSwinPatchEmbeddings.maybe_padr]   rS   c                 C   sV   |j \}}}}| |||}| |}|j \}}}}||f}|ddd}||fS )Nr(   r   )r-   ru   rq   flatten	transpose)rN   r]   rb   r6   r4   r5   rR   rc   r   r   r    re   (  s   
z DonutSwinPatchEmbeddings.forward)r   r   r   r   r<   ru   r   r   r   r   rf   rg   re   rj   r   r   rP   r    r=   	  s
    .	r=   c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )DonutSwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionrW   
norm_layerrS   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr)   r(   Fbias)r;   r<   ry   rW   r   Linear	reductionrH   )rN   ry   rW   rz   rP   r   r    r<   B  s
   
zDonutSwinPatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr(   r   r   )r   rZ   rs   )rN   r1   r4   r5   Z
should_padrt   r   r   r    ru   I  s
   zDonutSwinPatchMerging.maybe_padr1   input_dimensionsc                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r(   r   r+   r)   )r-   r.   ru   r   r[   rH   r~   )rN   r1   r   r4   r5   r3   rW   r6   Zinput_feature_0Zinput_feature_1Zinput_feature_2Zinput_feature_3r   r   r    re   Q  s   $$$$

zDonutSwinPatchMerging.forward)r   r   r   r   r   rG   r   rg   Moduler<   ru   r   rf   re   rj   r   r   rP   r    rx   5  s
    **rx           Finput	drop_probtrainingrS   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   dtypedevice)r-   ndimr   Zrandr   r   Zfloor_div)r   r   r   Z	keep_probr-   Zrandom_tensoroutputr   r   r    	drop_pathl  s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )DonutSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rS   c                    s   t    || _d S N)r;   r<   r   )rN   r   rP   r   r    r<     s   

zDonutSwinDropPath.__init__r   c                 C   s   t || j| jS r   )r   r   r   rN   r   r   r   r    re     s   zDonutSwinDropPath.forwardc                 C   s   d | jS )Nzp={})formatr   rN   r   r   r    
extra_repr  s   zDonutSwinDropPath.extra_reprr   )r   r   r   r   r   floatr<   r   rf   re   strr   rj   r   r   rP   r    r     s
    r   c                       b   e Zd Z fddZdd Z			ddejdeej d	eej d
ee	 de
ej f
ddZ  ZS )DonutSwinSelfAttentionc                    s
  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _ttd| jd  d d| jd  d  || _t| jd }t| jd }tt||gdd}t|d}|d d d d d f |d d d d d f  }	|	ddd }	|	d d d d df  | jd d 7  < |	d d d d df  | jd d 7  < |	d d d d df  d| jd  d 9  < |	d	}
| d
|
 tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _t|j| _ d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r(   r   Zij)Zindexingr+   relative_position_indexr{   )!r;   r<   
ValueErrornum_attention_headsrg   attention_head_sizeall_head_sizerl   rm   rn   ro   r2   r   rB   r   rC   relative_position_bias_tableZarangestackr   rv   r/   r0   sumZregister_bufferr}   Zqkv_biasquerykeyvaluerI   attention_probs_dropout_probrK   )rN   rM   rW   	num_headsr2   Zcoords_hZcoords_wZcoordsZcoords_flattenZrelative_coordsr   rP   r   r    r<     s8   
*,((,
zDonutSwinSelfAttention.__init__c                 C   s6   |  d d | j| jf }||}|ddddS )Nr+   r   r(   r   r   )rT   r   r   r.   r/   )rN   xZnew_x_shaper   r   r    transpose_for_scores  s   
z+DonutSwinSelfAttention.transpose_for_scoresNFr   attention_mask	head_maskoutput_attentionsrS   c                 C   s  |j \}}}| |}| | |}	| | |}
| |}t||	dd}|t	| j
 }| j| jd }|| jd | jd  | jd | jd  d}|ddd }||d }|d ur|j d }||| || j||}||dd }|d| j||}tjj|dd}| |}|d ur|| }t||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr+   r   r   r(   rV   r   )r-   r   r   r   r   r   matmulrw   mathsqrtr   r   r   r.   r2   r/   r0   ra   r   r   rZ   ZsoftmaxrK   rT   r   )rN   r   r   r   r   r3   rW   r6   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresZrelative_position_biasZ
mask_shapeZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr   r   r    re     s@   

&


zDonutSwinSelfAttention.forwardNNF)r   r   r   r<   r   r   rf   r   r   ri   r   re   rj   r   r   rP   r    r     s"    %r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )DonutSwinSelfOutputc                    s*   t    t||| _t|j| _d S r   )r;   r<   r   r}   denserI   r   rK   rN   rM   rW   rP   r   r    r<     s   
zDonutSwinSelfOutput.__init__r   input_tensorrS   c                 C      |  |}| |}|S r   r   rK   )rN   r   r   r   r   r    re     s   

zDonutSwinSelfOutput.forwardr   r   r   r<   r   rf   re   rj   r   r   rP   r    r     s    $r   c                       r   )DonutSwinAttentionc                    s2   t    t||||| _t||| _t | _d S r   )r;   r<   r   rN   r   r   setpruned_heads)rN   rM   rW   r   r2   rP   r   r    r<     s   
zDonutSwinAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rV   )lenr
   rN   r   r   r   r   r   r   r   r   r   r   union)rN   headsindexr   r   r    prune_heads
  s   zDonutSwinAttention.prune_headsNFr   r   r   r   rS   c                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )rN   r   )rN   r   r   r   r   Zself_outputsattention_outputr   r   r   r    re     s   zDonutSwinAttention.forwardr   )r   r   r   r<   r   r   rf   r   r   ri   r   re   rj   r   r   rP   r    r     s"    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )DonutSwinIntermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S r   )r;   r<   r   r}   rg   	mlp_ratior   rl   Z
hidden_actr   r   intermediate_act_fnr   rP   r   r    r<   +  s
   
zDonutSwinIntermediate.__init__r   rS   c                 C   r   r   )r   r   r   r   r   r    re   3     

zDonutSwinIntermediate.forwardr   r   r   rP   r    r   *  s    r   c                       r   )DonutSwinOutputc                    s4   t    tt|j| || _t|j| _	d S r   )
r;   r<   r   r}   rg   r   r   rI   rJ   rK   r   rP   r   r    r<   ;  s   
zDonutSwinOutput.__init__r   rS   c                 C   r   r   r   r   r   r   r    re   @  r   zDonutSwinOutput.forwardr   r   r   rP   r    r   :  s    r   c                       s   e Zd Zd fdd	Zdd Zdd Zd	d
 Z			ddejde	e
e
f deej dee dee de	ejejf fddZ  ZS )DonutSwinLayerr   r   c                    s   t    |j| _|| _|j| _|| _tj||jd| _	t
|||| jd| _|dkr.t|nt | _tj||jd| _t||| _t||| _d S )N)Zeps)r2   r   )r;   r<   Zchunk_size_feed_forward
shift_sizer2   ry   r   rG   Zlayer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr   intermediater   r   )rN   rM   rW   ry   r   drop_path_rater   rP   r   r    r<   H  s   
zDonutSwinLayer.__init__c                 C   sD   t || jkr td| _tj rt t|nt || _d S d S Nr   )minr2   r   r   r   rX   rY   Ztensor)rN   ry   r   r   r    set_shift_and_window_sizeU  s
   
 z(DonutSwinLayer.set_shift_and_window_sizec              	   C   s  | j dkrtjd||df||d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}	|D ]}
||d d |	|
d d f< |d7 }qEqAt|| j}|d| j| j }|d|d }||dkt	d|dkt	d}|S d }|S )Nr   r   r   r+   r(   g      Yr   )
r   r   rC   slicer2   r8   r.   ra   Zmasked_fillr   )rN   r4   r5   r   r   Zimg_maskZheight_slicesZwidth_slicescountZheight_sliceZwidth_sliceZmask_windows	attn_maskr   r   r    get_attn_mask]  s.   

$zDonutSwinLayer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS r   )r2   r   rZ   rs   )rN   r   r4   r5   	pad_rightZ
pad_bottomrt   r   r   r    ru   y  s
   zDonutSwinLayer.maybe_padNFr   r   r   r   always_partitionrS   c                 C   s  |s|  | n	 |\}}| \}}	}
|}| |}|||||
}| |||\}}|j\}	}}}	| jdkrGtj|| j | j fdd}n|}t	|| j
}|d| j
| j
 |
}| j|||j|jd}| j||||d}|d }|d| j
| j
|
}t|| j
||}| jdkrtj|| j| jfdd}n|}|d dkp|d dk}|r|d d d |d |d d f  }|||| |
}|| | }| |}| |}|| | }|r||d	 f}|S |f}|S )
Nr   )r   r(   )Zshiftsdimsr+   r   )r   r   r*   r   )r   rT   r   r.   ru   r-   r   r   Zrollr8   r2   r   r   r   r   r9   r0   r   r   r   r   )rN   r   r   r   r   r   r4   r5   r3   rb   ZchannelsZshortcutrt   Z
height_padZ	width_padZshifted_hidden_statesZhidden_states_windowsr   Zattention_outputsr   Zattention_windowsZshifted_windowsZ
was_paddedZlayer_outputlayer_outputsr   r   r    re     sN   


$

zDonutSwinLayer.forward)r   r   NFF)r   r   r   r<   r   r   ru   r   rf   r   rg   r   r   ri   re   rj   r   r   rP   r    r   G  s*    
r   c                       sd   e Zd Z fddZ			ddejdeeef deej	 dee
 d	ee
 d
eej fddZ  ZS )DonutSwinStagec                    sh   t     | _| _t fddt|D | _|d ur,|tjd| _	nd | _	d| _
d S )Nc              
      s:   g | ]}t  | |d  dkrdn jd  dqS )r(   r   )rM   rW   ry   r   r   r   )r   r2   ).0irM   rW   r   ry   r   r   r    
<listcomp>  s    	z+DonutSwinStage.__init__.<locals>.<listcomp>)rW   rz   F)r;   r<   rM   rW   r   
ModuleListrangeblocksrG   
downsampleZpointing)rN   rM   rW   ry   depthr   r   r   rP   r   r    r<     s   
	
zDonutSwinStage.__init__NFr   r   r   r   r   rS   c                 C   s   |\}}t | jD ]\}}	|d ur|| nd }
|	|||
||}|d }q	|}| jd urE|d d |d d }}||||f}| ||}n||||f}|||f}|rZ||dd  7 }|S )Nr   r   r(   )	enumerater   r   )rN   r   r   r   r   r   r4   r5   r   layer_modulelayer_head_maskr   !hidden_states_before_downsamplingZheight_downsampledZwidth_downsampledrc   Zstage_outputsr   r   r    re     s"   



zDonutSwinStage.forwardr   )r   r   r   r<   r   rf   r   rg   r   r   ri   re   rj   r   r   rP   r    r     s$    
r   c                       s   e Zd Z fddZ						ddejdeeef deej	 d	ee
 d
ee
 dee
 dee
 dee
 deeef fddZ  ZS )DonutSwinEncoderc                    sp   t    t j_ _dd tjd jt	 jddD t
 fddtjD _d_d S )Nc                 S   s   g | ]}|  qS r   )item)r   r   r   r   r    r     s    z-DonutSwinEncoder.__init__.<locals>.<listcomp>r   cpu)r   c                    s   g | ]E}t  t jd |  d d |  d d |  f j|  j| t jd| t jd|d   |jd k rCtnddqS )r(   r   r   N)rM   rW   ry   r   r   r   r   )r   rg   rD   depthsr   r   
num_layersrx   )r   Zi_layerrM   Zdprr@   rN   r   r    r   	  s    
*F)r;   r<   r   r   r   rM   r   Zlinspacer   r   r   r   r   layersgradient_checkpointing)rN   rM   r@   rP   r   r    r<     s   
$

zDonutSwinEncoder.__init__NFTr   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingr   return_dictrS   c	              	   C   s  |rdnd }	|r
dnd }
|rdnd }|r7|j \}}}|j|g||R  }|dddd}|	|f7 }	|
|f7 }
t| jD ]\}}|d urH|| nd }| jr\| jr\| |j|||||}n||||||}|d }|d }|d }|d |d f}|r|r|j \}}}|j|g|d |d f|R  }|dddd}|	|f7 }	|
|f7 }
n'|r|s|j \}}}|j|g||R  }|dddd}|	|f7 }	|
|f7 }
|r||dd  7 }q<|st	dd	 ||	|fD S t
||	||
d
S )Nr   r   r   r   r(   r   r+   c                 s   s    | ]	}|d ur|V  qd S r   r   )r   vr   r   r    	<genexpr>]  s    z+DonutSwinEncoder.forward.<locals>.<genexpr>)r   r   r   r   )r-   r.   r/   r   r   r   r   Z_gradient_checkpointing_func__call__tupler   )rN   r   r   r   r   r   r   r   r   Zall_hidden_statesZall_reshaped_hidden_statesZall_self_attentionsr3   rb   rr   Zreshaped_hidden_stater   r   r   r   r   rc   r   r   r    re     sp   

	



zDonutSwinEncoder.forward)NFFFFT)r   r   r   r<   r   rf   r   rg   r   r   ri   r   r   re   rj   r   r   rP   r    r     s6    
	

r   c                   @   s*   e Zd ZeZdZdZdZdgZdd Z	dS )DonutSwinPreTrainedModeldonutr]   Tr   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS t |trW|jdurH|jj
  |jdurU|jj
  dS dS t |trd|jj
  dS dS )zInitialize the weightsr   )meanZstdNr_   )rl   r   r}   rp   weightdataZnormal_rM   Zinitializer_ranger|   Zzero_rG   Zfill_r:   rE   rF   r   r   )rN   moduler   r   r    _init_weightsp  s"   




z&DonutSwinPreTrainedModel._init_weightsN)
r   r   r   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesr   r   r   r   r    r   g  s    r   c                       s   e Zd Zd fdd	Zdd Zdd Ze													dd
eej	 deej
 deej	 dee dee dedee deeef fddZ  ZS )DonutSwinModelTFc                    sv   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _|r2tdnd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        r(   r   )rO   N)r;   r<   rM   r   r   r   rg   rD   num_featuresr:   rR   r   rA   encoderr   ZAdaptiveAvgPool1dpooler	post_init)rN   rM   Zadd_pooling_layerrO   rP   r   r    r<     s   zDonutSwinModel.__init__c                 C   s   | j jS r   )rR   r>   r   r   r   r    get_input_embeddings  s   z#DonutSwinModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )rN   Zheads_to_pruner  r   r   r   r    _prune_heads  s   zDonutSwinModel._prune_headsNr]   r^   r   r   r   r\   r   rS   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |t| j j}| j|||d\}}	| j	||	||||d}
|
d }d}| j
dur_| 
|dd}t|d}|sm||f|
dd  }|S t|||
j|
j|
jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r^   r\   )r   r   r   r   r   r   r(   )r   r#   r   r   r   )rM   r   r   use_return_dictr   Zget_head_maskr   r   rR   r   r   rw   r   rv   r"   r   r   r   )rN   r]   r^   r   r   r   r\   r   Zembedding_outputr   Zencoder_outputsZsequence_outputpooled_outputr   r   r   r    re     sB   
	
zDonutSwinModel.forward)TFNNNNNFN)r   r   r   r<   r  r  r   r   r   r   rh   ri   r   r   r"   re   rj   r   r   rP   r    r     s:    
	r   a  
    DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )Zcustom_introc                       s   e Zd Z fddZe							ddeej deej deej dee	 d	ee	 d
e	dee	 de
eef fddZ  ZS )DonutSwinForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r   )r;   r<   Z
num_labelsr   r   r   r}   r   r   
classifierr  )rN   rM   rP   r   r    r<     s   
"z(DonutSwinForImageClassification.__init__NFr]   r   labelsr   r   r\   r   rS   c                 C   s   |dur|n| j j}| j||||||d}|d }	| |	}
d}|dur.| j|
||
| j d}|sD|
f|dd  }|durB|f| S |S t||
|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r\   r   r   )r'   r  Zpooled_logitsrM   r(   )r&   r'   r   r   r   )	rM   r  r   r
  Zloss_functionr%   r   r   r   )rN   r]   r   r  r   r   r\   r   r   r  r'   r&   r   r   r   r    re     s0   	
z'DonutSwinForImageClassification.forwardr  )r   r   r   r<   r   r   r   r   Z
LongTensorri   r   r   r%   re   rj   r   r   rP   r    r	    s6    
	r	  )r   r   r	  )r   F)9r   collections.abcrm   r   dataclassesr   typingr   r   r   r   Ztorch.utils.checkpointr   Zactivationsr   Zmodeling_utilsr	   Zpytorch_utilsr
   r   r   utilsr   r   r   r   Zconfiguration_donut_swinr   Z
get_loggerr   loggerr   r"   r%   r8   r9   r   r:   r=   rx   rf   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r	  __all__r   r   r   r    <module>   sX   
 #$], 7e'~=ea@