o
    Zh@                     @   sp  d Z ddlmZmZmZmZmZ ddlZddlm	Z	 ddlm
Z
 ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ee Z!G dd de	j"Z#G dd de	j"Z$G dd de	j"Z%G dd de	j"Z&eG dd deZ'eG dd de'Z(eddG dd de'Z)eddG dd  d e'eZ*g d!Z+dS )"zPyTorch TextNet model.    )AnyListOptionalTupleUnionN)Tensor)BCEWithLogitsLossCrossEntropyLossMSELoss)PreTrainedModel)ACT2CLS)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)TextNetConfig)logging)BackboneMixin   )auto_docstringc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )TextNetConvLayerconfigc                    s   t    |j| _|j| _|j| _t|jt	r%|jd d |jd d fn|jd }t
j|j|j|j|j|dd| _t
|j|j| _t
 | _| jd urVt| j  | _d S d S )Nr         F)kernel_sizestridepaddingbias)super__init__Zstem_kernel_sizer   Zstem_strider   Zstem_act_funcactivation_function
isinstancetuplennConv2dZstem_num_channelsZstem_out_channelsconvBatchNorm2dbatch_norm_eps
batch_normIdentity
activationr   )selfr   r   	__class__ [/var/www/auris/lib/python3.10/site-packages/transformers/models/textnet/modeling_textnet.pyr   +   s*   



zTextNetConvLayer.__init__hidden_statesreturnc                 C   s   |  |}| |}| |S N)r%   r(   r*   )r+   r0   r.   r.   r/   forwardF   s   


zTextNetConvLayer.forward)	__name__
__module____qualname__r   r   torchr   r3   __classcell__r.   r.   r,   r/   r   *   s    r   c                
       sL   e Zd ZdZdededededef
 fddZd	ejd
ejfddZ	  Z
S )TextNetRepConvLayera  
    This layer supports re-parameterization by combining multiple convolutional branches
    (e.g., main convolution, vertical, horizontal, and identity branches) during training.
    At inference time, these branches can be collapsed into a single convolution for
    efficiency, as per the re-parameterization paradigm.

    The "Rep" in the name stands for "re-parameterization" (introduced by RepVGG).
    r   in_channelsout_channelsr   r   c           	         sf  t    || _|| _|| _|| _|d d d |d d d f}t | _tj	|||||dd| _
tj||jd| _|d d d df}d|d d d f}|d dkrotj	|||d df||dd| _tj||jd| _nd\| _| _|d dkrtj	||d|d f||dd| _tj||jd| _nd\| _| _||kr|dkrtj||jd| _d S d | _d S )Nr   r   r   F)r:   r;   r   r   r   r   )num_featuresZepsNN)r   r   Znum_channelsr;   r   r   r#   ZReLUr    r$   	main_convr&   r'   main_batch_normvertical_convvertical_batch_normhorizontal_convhorizontal_batch_normrbr_identity)	r+   r   r:   r;   r   r   r   Zvertical_paddingZhorizontal_paddingr,   r.   r/   r   V   sZ   
 


zTextNetRepConvLayer.__init__r0   r1   c                 C   s   |  |}| |}| jd ur| |}| |}|| }| jd ur0| |}| |}|| }| jd ur>| |}|| }| |S r2   )r>   r?   r@   rA   rB   rC   rD   r    )r+   r0   Zmain_outputsZvertical_outputsZhorizontal_outputsZid_outr.   r.   r/   r3      s   










zTextNetRepConvLayer.forward)r4   r5   r6   __doc__r   intr   r7   r   r3   r8   r.   r.   r,   r/   r9   L   s    "	9r9   c                       s.   e Zd Zdedef fddZdd Z  ZS )TextNetStager   depthc                    s   t    |j| }|j| }t|}|j| }|j|d  }|g|g|d   }|g| }	g }
t||	||D ]}|
t|g|R   q7t	
|
| _d S )Nr   )r   r   conv_layer_kernel_sizesZconv_layer_strideslenhidden_sizeszipappendr9   r#   
ModuleListstage)r+   r   rH   r   r   Z
num_layersZstage_in_channel_sizeZstage_out_channel_sizer:   r;   rO   Zstage_configr,   r.   r/   r      s   




zTextNetStage.__init__c                 C   s   | j D ]}||}q|S r2   )rO   )r+   hidden_stateblockr.   r.   r/   r3      s   

zTextNetStage.forward)r4   r5   r6   r   rF   r   r3   r8   r.   r.   r,   r/   rG      s    rG   c                	       sL   e Zd Zdef fddZ		ddejdee dee de	fd	d
Z
  ZS )TextNetEncoderr   c                    sF   t    g }t|j}t|D ]
}|t|| qt|| _	d S r2   )
r   r   rJ   rI   rangerM   rG   r#   rN   stages)r+   r   rT   Z
num_stagesZstage_ixr,   r.   r/   r      s   

zTextNetEncoder.__init__NrP   output_hidden_statesreturn_dictr1   c                 C   sL   |g}| j D ]}||}|| q|s |f}|r||f S |S t||dS )N)last_hidden_stater0   )rT   rM   r   )r+   rP   rU   rV   r0   rO   outputr.   r.   r/   r3      s   
zTextNetEncoder.forwardr=   )r4   r5   r6   r   r   r7   r   r   boolr   r3   r8   r.   r.   r,   r/   rR      s    rR   c                   @   s    e Zd ZeZdZdZdd ZdS )TextNetPreTrainedModeltextnetpixel_valuesc                 C   s   t |tjtjfr#|jjjd| jjd |j	d ur!|j	j
  d S d S t |tjr=|jjd |j	d ur?|j	j
  d S d S d S )Ng        )meanZstdg      ?)r!   r#   Linearr$   weightdataZnormal_r   Zinitializer_ranger   Zzero_r&   Zfill_)r+   moduler.   r.   r/   _init_weights   s   

z$TextNetPreTrainedModel._init_weightsN)r4   r5   r6   r   Zconfig_classZbase_model_prefixZmain_input_namerb   r.   r.   r.   r/   rZ      s
    rZ   c                       s`   e Zd Z fddZe	d
dedee dee dee	e
ee
 f e	e
 ef fdd	Z  ZS )TextNetModelc                    s8   t  | t|| _t|| _td| _| 	  d S )N)r   r   )
r   r   r   stemrR   encoderr#   AdaptiveAvgPool2dpooler	post_initr+   r   r,   r.   r/   r      s
   

zTextNetModel.__init__Nr\   rU   rV   r1   c           	      C   s   |d ur|n| j j}|d ur|n| j j}| |}| j|||d}|d }| |}|s;||f}|r9||d f S |S t|||rF|d dS d dS )NrU   rV   r   r   )rW   Zpooler_outputr0   )r   use_return_dictrU   rd   re   rg   r   )	r+   r\   rU   rV   rP   Zencoder_outputsrW   Zpooled_outputrX   r.   r.   r/   r3      s&   


zTextNetModel.forwardr=   )r4   r5   r6   r   r   r   r   rY   r   r   r   r   r   r3   r8   r.   r.   r,   r/   rc      s    rc   z
    TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )Zcustom_introc                       s\   e Zd Z fddZe				ddeej deej dee	 dee	 de
f
d	d
Z  ZS )TextNetForImageClassificationc                    s|   t  | |j| _t|| _td| _t | _	|jdkr)t
|jd |jnt | _t| j| j	g| _|   d S )N)r   r   r   )r   r   
num_labelsrc   r[   r#   rf   Zavg_poolZFlattenflattenr^   rK   r)   fcrN   
classifierrh   ri   r,   r.   r/   r     s   

(z&TextNetForImageClassification.__init__Nr\   labelsrU   rV   r1   c                 C   sl  |dur|n| j j}| j|||d}|d }| jD ]}||}q| |}d}	|dur| j jdu rU| jdkr;d| j _n| jdkrQ|jtj	ksL|jtj
krQd| j _nd| j _| j jdkrst }
| jdkrm|
| | }	n+|
||}	n%| j jdkrt }
|
|d| j|d}	n| j jdkrt }
|
||}	|s|f|d	d  }|	dur|	f| S |S t|	||jd
S )al  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> import torch
        >>> import requests
        >>> from transformers import TextNetForImageClassification, TextNetImageProcessor
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> outputs.logits.shape
        torch.Size([1, 2])
        ```Nrj   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationrm   r   )losslogitsr0   )r   rk   r[   rq   rp   Zproblem_typern   Zdtyper7   longrF   r
   Zsqueezer	   viewr   r   r0   )r+   r\   rr   rU   rV   outputsrW   layerrt   rs   Zloss_fctrX   r.   r.   r/   r3   '  s:   !




"


z%TextNetForImageClassification.forward)NNNN)r4   r5   r6   r   r   r   r7   ZFloatTensorZ
LongTensorrY   r   r3   r8   r.   r.   r,   r/   rl     s$    rl   zP
    TextNet backbone, to be used with frameworks like DETR and MaskFormer.
    c                       sR   e Zd Z fddZe	d
dedee dee dee	e	 e
f fdd	Z  ZS )TextNetBackbonec                    s6   t  | t  | t|| _|j| _|   d S r2   )r   r   Z_init_backbonerc   r[   rK   r<   rh   ri   r,   r.   r/   r   s  s
   
zTextNetBackbone.__init__Nr\   rU   rV   r1   c           
      C   s   |dur|n| j j}|dur|n| j j}| j|d|d}|r!|jn|d }d}t| jD ]\}}|| jv r<||| f7 }q,|sT|f}	|rR|rI|jn|d }|	|f7 }	|	S t||r^|jddS dddS )a  
        Examples:

        ```python
        >>> import torch
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, AutoBackbone

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = AutoBackbone.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(image, return_tensors="pt")
        >>> with torch.no_grad():
        >>>     outputs = model(**inputs)
        ```NTrj   r   r.   )feature_mapsr0   Z
attentions)	r   rk   rU   r[   r0   	enumerateZstage_namesZout_featuresr   )
r+   r\   rU   rV   rw   r0   rz   idxrO   rX   r.   r.   r/   r3   }  s0   

zTextNetBackbone.forwardr=   )r4   r5   r6   r   r   r   r   rY   r   r   r   r3   r8   r.   r.   r,   r/   ry   m  s    
ry   )ry   rc   rZ   rl   ),rE   typingr   r   r   r   r   r7   Ztorch.nnr#   r   r   r	   r
   Ztransformersr   Ztransformers.activationsr   Ztransformers.modeling_outputsr   r   r   r   Z1transformers.models.textnet.configuration_textnetr   Ztransformers.utilsr   Z!transformers.utils.backbone_utilsr   utilsr   Z
get_loggerr4   loggerModuler   r9   rG   rR   rZ   rc   rl   ry   __all__r.   r.   r.   r/   <module>   s>   
"Z%U>