o
    Zh{*                    @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* e# rddl+m,Z, ddl-m.Z. e&/e0Z1G dd de
j2Z3zddl4m5Z5 e5Z3e16d W n e7y   Y n e8y   e19d Y nw e:e3 G dd de
j2Z;G dd de
j2Z<G dd de
j2Z=G dd de
j2Z>G dd  d e
j2Z?e"G d!d" d"eZ@e"G d#d$ d$e@ZAG d%d& d&e
j2ZBG d'd( d(e
j2ZCG d)d* d*e
j2ZDG d+d, d,e
j2ZEG d-d. d.e
j2ZFG d/d0 d0e
j2ZGe"d1d2G d3d4 d4e@ZHe"d5d2G d6d7 d7e@eZIg d8ZJdS )9zPix2Struct modeling file    N)DictListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                       s&   e Zd Zd fdd	Zdd Z  ZS )Pix2StructLayerNormư>c                    s&   t    tt|| _|| _dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ a/var/www/auris/lib/python3.10/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr&   >   s   

zPix2StructLayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)Zkeepdim)tor(   float32powmeanZrsqrtr+   r*   dtypefloat16Zbfloat16)r,   hidden_statesZvariancer1   r1   r2   forwardF   s
   
zPix2StructLayerNorm.forward)r$   __name__
__module____qualname__r&   r<   __classcell__r1   r1   r/   r2   r#   =   s    r#   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    sR   t    t|j|j| _t|j|j| _	t|j|j| _
t|j| _d S N)r%   r&   r   LinearZpatch_embed_hidden_sizer-   patch_projection	EmbeddingZseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr,   rD   r/   r1   r2   r&   m   s
   
z#Pix2StructVisionEmbeddings.__init__flattened_patchesc                 C   s   |d d d d df   }|d d d d df   }|d d d d dd f }| |}| |}| |}|| | }| |}|S )Nr   r   r3   )longrH   rJ   rK   rN   )r,   rP   Zrow_indicesZcol_indices
embeddingsZrow_embeddingsZcol_embeddingsr1   r1   r2   r<   v   s   



z"Pix2StructVisionEmbeddings.forward)
r>   r?   r@   __doc__r   r&   r(   Tensorr<   rA   r1   r1   r/   r2   rC   f   s    	rC   c                       s.   e Zd Z fddZ				dddZ  ZS )Pix2StructVisionAttentionc                    s   t    |j| _|j| _|j| _|j| _| j| j | _	t
j| j| j	dd| _t
j| j| j	dd| _t
j| j| j	dd| _t
j| j	| jdd| _d| _d S NFbias)r%   r&   r-   d_kvkey_value_proj_dimZnum_attention_headsn_headsZattention_dropoutrN   	inner_dimr   rG   querykeyvalueoutputgradient_checkpointingrO   r/   r1   r2   r&      s   

z"Pix2StructVisionAttention.__init__NFc                    s  |j dd \ } fdd}||}||}	||}
t||	dd}|du rtjdj||f|j	|j
d}jrKjrKd|_| dkrd||ddddddf |j	 }n$|durq|||j	 }nt stj |f|j	|j
d}|||j	 }d| }||dkt|j
j}||7 }t|tt|j
j}tjj|d	tjd
|}tjj|jjd}|dur|| }t||
}|dd  d	j} |}|f|f }|r||f }|S )z&
        Self-attention block
        Nr3   c                    s    |    djjddS )Z
projectionr4   r   r3   )
contiguousviewr[   rZ   	transpose)Zstates
batch_sizer,   r1   r2   to_projection_shape   s    z>Pix2StructVisionAttention.forward.<locals>.to_projection_shaper   r   devicer9   Tr4   )dimr9   ptraining)!shaper]   r^   r_   r(   matmulrd   zerosr[   ri   r9   ra   rm   requires_gradrj   r5   r   r)   masked_fillfinfominmaxtensorr   
functionalsoftmaxr6   type_asrN   rb   rc   r\   r`   )r,   r;   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthrg   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr1   re   r2   r<      sH   &

z!Pix2StructVisionAttention.forward)NNNFr=   r1   r1   r/   r2   rU      s    rU   c                       *   e Zd Zdef fddZdd Z  ZS )Pix2StructVisionMlprD   c                    j   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rV   r%   r&   r   rG   r-   d_ffwi_0wi_1worL   rM   rN   r	   Zdense_act_fnactrO   r/   r1   r2   r&         
zPix2StructVisionMlp.__init__c                 C   z   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rF   r   r   r   rN   
isinstancer   r*   r(   rT   r9   Zint8r5   r,   r;   Zhidden_geluZhidden_linearr1   r1   r2   r<         


zPix2StructVisionMlp.forward)r>   r?   r@   r    r&   r<   rA   r1   r1   r/   r2   r          r   c                       st   e Zd Zdeddf fddZ			ddejdeej d	eej d
ede	e
ejejf e
ej f f
ddZ  ZS )Pix2StructVisionLayerrD   rE   Nc                    sT   t    |j| _d| _t|| _t|| _t|j	|j
d| _t|j	|j
d| _d S )Nr   r.   )r%   r&   Zchunk_size_feed_forwardZseq_len_dimrU   	attentionr   mlpr#   r-   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrO   r/   r1   r2   r&   
  s   


zPix2StructVisionLayer.__init__Fr;   rz   	head_maskr}   c           
      C   sb   |}|  |}| j||||d}|d }|dd  }|| }| |}	| |	| }	|	f| }|S )N)rz   r|   r}   r   r   )r   r   r   r   )
r,   r;   rz   r   r}   Zresidualself_attention_outputsattention_outputr   layer_outputr1   r1   r2   r<     s   


zPix2StructVisionLayer.forward)NNF)r>   r?   r@   r   r&   r(   rT   r   boolr   r   r<   rA   r1   r1   r/   r2   r   	  s     r   c                       sn   e Zd Zdeddf fddZ					ddejd	eej d
eej dededede	e
ef fddZ  ZS )Pix2StructVisionEncoderrD   rE   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r1   )r   ).0_rD   r1   r2   
<listcomp>8  s    z4Pix2StructVisionEncoder.__init__.<locals>.<listcomp>F)	r%   r&   rD   r   
ModuleListrangenum_hidden_layerslayerra   rO   r/   r   r2   r&   5  s   
 
z Pix2StructVisionEncoder.__init__FTr;   rz   r   r}   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]:\}	}
|r||f }|d ur$||	 nd }| jr7| jr7| |
j||||}n|
||||}|d }|rK||d f }q|rS||f }|satdd |||fD S t|||dS )Nr1   r   r   c                 s       | ]	}|d ur|V  qd S rF   r1   r   vr1   r1   r2   	<genexpr>a  s    z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater;   
attentions)	enumerater   ra   rm   _gradient_checkpointing_func__call__tupler   )r,   r;   rz   r   r}   r   r   all_hidden_statesZall_self_attentionsilayer_moduler|   layer_outputsr1   r1   r2   r<   ;  s8   	

zPix2StructVisionEncoder.forward)NNFFT)r>   r?   r@   r   r&   r(   rT   r   r   r   r   r   r<   rA   r1   r1   r/   r2   r   4  s,    	
r   c                   @   s4   e Zd ZeZdZdZedd Zdd Z	dd Z
d	S )
Pix2StructPreTrainedModelTFc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r(   rv   r   r   )r,   r   Z
input_maskdummy_inputsr1   r1   r2   r   o  s   

z&Pix2StructPreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr|jj|d  dS t|trt| j tr$| j j	j
n| j j
}t| j tr3| j j	jn| j j}|jjjjd||d  d t|jdrX|jjdurX|jjj  |jjjjd||d  d t|jdry|jjdury|jjj  |jjjjd||d  d t|jdr|jjdur|jjj  dS dS dS t|tr%t| j tr| j j	j
n| j j
}t| j tr| j j	jn| j j
}t| j tr| j j	jn| j j}|jjjjd||| d  d |jjjjd||d  d |jjjjd||d  d |jjjjd||| d  d |jr#|jjjjd||d  d dS dS t|tjr\t| j tr8| j j	j
n| j j
}|jjjd||d  d |jdurZ|jj|j   dS dS t|trt| j trn| j j	j
n| j j
}|j jjjd||d  d dS t|tj!tj"frtj#j$|jj%t&j'd| j j(d%|jj)|j_|jdur|jj  dS dS t|tr|jdur|jjd dS dS t|tjr|jjjd| j j(d |jdur|jj|j   dS dS dS )zInitialize the weights      ?        g      )r8   ZstdrX   N)*rD   Zinitializer_factorr   r#   r*   dataZfill_ Pix2StructTextDenseGatedActDenser   text_configr-   r   r   Znormal_hasattrrX   Zzero_r   r   Pix2StructTextAttentionrY   	num_headsr]   r^   r_   r`   has_relative_attention_biasrelative_attention_biasr   rI   Zpadding_idxPix2StructTextModellm_headrG   ZConv2dinitZtrunc_normal_r5   r(   r6   Zinitializer_ranger9   )r,   modulefactorr-   r   rZ   r[   r1   r1   r2   _init_weightsz  s   




   

 
z'Pix2StructPreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r4   )r   .rj   r   ).r   z1self.model.config.pad_token_id has to be defined.)rD   decoder_start_token_idpad_token_id
ValueErrorr   r(   fullrn   catZ	new_zeroscloneZmasked_fill_)r,   r   r   r   Zshifted_input_idsr1   r1   r2   _shift_right  s      z&Pix2StructPreTrainedModel._shift_rightN)r>   r?   r@   r   config_classZ_supports_cache_classZ_supports_static_cachepropertyr   r   r   r1   r1   r1   r2   r   i  s    

Pr   c                       s   e Zd ZeZdZdZdgZdef fddZ	dd Z
d	eeee f d
dfddZe						ddeej deej deej dee dee dee d
eeef fddZ  ZS )Pix2StructVisionModelrP   Tr   rD   c                    sD   t  | || _t|| _t|| _t|j|j	d| _
|   d S Nr   )r%   r&   rD   rC   rR   r   encoderr#   r-   r   	layernorm	post_initrO   r/   r1   r2   r&     s   

zPix2StructVisionModel.__init__c                 C   s   | j jS rF   )rR   rH   r,   r1   r1   r2   get_input_embeddings  s   z*Pix2StructVisionModel.get_input_embeddingsheads_to_prunerE   Nc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   Zprune_heads)r,   r   r   Zheadsr1   r1   r2   _prune_heads  s   z"Pix2StructVisionModel._prune_headsrz   r   r}   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td|du r4|jdddk }| || j j}| 	|}| j
||||||d}|d }	| |	}	|sb|	f}
|
|dd  S t|	|j|jdS )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://arxiv.org/abs/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr4   r   r   )rz   r   r}   r   r   r   r   )rD   r}   r   use_return_dictr   sumfloatget_head_maskr   rR   r   r   r   r;   r   )r,   rP   rz   r   r}   r   r   Zembedding_outputencoder_outputsZsequence_outputZhead_outputsr1   r1   r2   r<     s8   &

zPix2StructVisionModel.forward)NNNNNN)r>   r?   r@   r    r   main_input_namesupports_gradient_checkpointing_no_split_modulesr   r&   r   r   intr   r   r   r   r(   rT   r   r   r   r   r<   rA   r1   r1   r/   r2   r     s<    
r   c                       r   )r   rD   c                    r   rV   r   rO   r/   r1   r2   r&   X  r   z)Pix2StructTextDenseGatedActDense.__init__c                 C   r   rF   r   r   r1   r1   r2   r<   `  r   z(Pix2StructTextDenseGatedActDense.forwardr>   r?   r@   r   r&   r<   rA   r1   r1   r/   r2   r   W  r   r   c                       r   )Pix2StructTextLayerFFrD   c                    s8   t    t|| _t|j|jd| _t	|j
| _d S r   )r%   r&   r   DenseReluDenser#   r-   layer_norm_epsilon
layer_normr   rL   rM   rN   rO   r/   r1   r2   r&   u  s   

zPix2StructTextLayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rF   )r   r   rN   )r,   r;   Zforwarded_statesr1   r1   r2   r<   }  s   

zPix2StructTextLayerFF.forwardr   r1   r1   r/   r2   r   t  r   r   c                       sb   e Zd Z	ddedee f fddZedd
dZdddZ										dddZ
  ZS )r   FNrD   	layer_idxc                    s   t    || _|j| _|j| _|j| _|j| _|j| _	|j
| _| j	| j | _|| _|d u r9td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j	| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrW   )r%   r&   r   relative_attention_num_bucketsrelative_attention_max_distancer-   rY   rZ   r   r[   rM   rN   r\   r   loggerwarning_oncer0   r>   r   rG   r]   r^   r_   r`   rI   r   setpruned_headsra   r,   rD   r   r   r/   r1   r2   r&     s,   

z Pix2StructTextAttention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r3   r   )r5   r(   rQ   absrt   Z
zeros_likelogr   mathZ	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distanceZrelative_bucketsZ	max_exactZis_smallZrelative_position_if_larger1   r1   r2   _relative_position_bucket  s*   z1Pix2StructTextAttention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|d| j| j	d}|  |}	|	
g dd}	|	S )z%Compute binned relative position biasN)r9   ri   F)r  r  r  )r3   r   r   r   )r   r*   ri   r(   arangerQ   r5   r  r   r   ZpermuteZ	unsqueeze)
r,   query_length
key_lengthri   cache_positionZcontext_positionZmemory_positionr  Zrelative_position_bucketvaluesr1   r1   r2   compute_bias  s    
 
z$Pix2StructTextAttention.compute_biasc                 C   s  |j dd \}}|du}| |}||d| j| jdd}|dur4|j| j}|r1|j	}n|j
}|r8|n|}|rM|rM|rM|j| j }|j| j }nE| |}| |}||d| j| jdd}||d| j| jdd}|dur|s{|
nd}
|||| jd|
i\}}|rd|j| j< t||dd}|du r|j d }|dur|n|
d d }| jstjd| j||f|j|jd	}| jr| jrd|_n| j|||j|
d
}|dddd| dddf }|dur|ddddddd|j d f }|| }| jr#t|j d }d|t| j< |dd| f }n|}||7 }tjj |! dd"|}tjj#|| j#| jd}|durJ|| }t||}|dd$ }||d| j%}| &|}|||f}|	rr||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr3   r4   r   r  Tr   rh   )ri   r  r   r   rk   )'rn   r]   rc   r[   rZ   rd   
is_updatedgetr   Zcross_attention_cacheself_attention_cacheZ	key_cacheZvalue_cacher^   r_   updater(   ro   r   rp   ri   r9   ra   rm   rq   r  r   r)   listr   r   rw   rx   r   ry   rN   rb   r\   r`   )r,   r;   maskkey_value_statesr{   past_key_valuer|   r	  	use_cacher}   r  rf   r~   Zis_cross_attentionr   r  Zcurr_past_key_valueZcurrent_statesr   r   r   r
  Zreal_seq_lengthcausal_maskr   r   r   r   r1   r1   r2   r<     sx   





"
&



zPix2StructTextAttention.forwardFN)Tr   r   )NN)	NNNNNNFFN)r>   r?   r@   r   r   r   r&   staticmethodr  r  r<   rA   r1   r1   r/   r2   r     s(    
0r   c                       s@   e Zd Zddee f fddZ							d	ddZ  ZS )
 Pix2StructTextLayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   r%   r&   r   r   r#   r-   r   r   r   rL   rM   rN   r   r/   r1   r2   r&   Y  s   
z)Pix2StructTextLayerSelfAttention.__init__c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)r  r{   r|   r  r  r}   r  r   r   r   r   rN   )r,   r;   rz   r{   r|   r  r  r}   r  normed_hidden_statesr   r   r1   r1   r2   r<   a  s   

z(Pix2StructTextLayerSelfAttention.forwardr  )NNNNFFNr>   r?   r@   r   r   r&   r<   rA   r1   r1   r/   r2   r  X  s    r  c                       sB   e Zd Zddee f fddZ								d	ddZ  ZS )
!Pix2StructTextLayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr  r   r  )r,   rD   r   r/   r1   r2   r&   ~  s   
z*Pix2StructTextLayerCrossAttention.__init__Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	r  r  r{   r|   r  r  r	  r}   r  r   r   r  )r,   r;   r  rz   r{   r|   r  r  r	  r}   r  r  r   r   r   r1   r1   r2   r<     s    
z)Pix2StructTextLayerCrossAttention.forwardrF   )NNNNFNFNr   r1   r1   r/   r2   r!  }  s    
r!  c                       sJ   e Zd Zd	dee f fddZ												d
ddZ  ZS )Pix2StructTextBlockFNr   c                    s6   t    t|||d| _t||d| _t|| _d S )Nr  )r   )r%   r&   r  self_attentionr!  encoder_decoder_attentionr   r   r   r/   r1   r2   r&     s   
zPix2StructTextBlock.__init__Tc                 C   sr  | j |||||	|
||d}|d d \}}	|dd  }|jtjkr:t| r:t|jjd }tj|| |d}|d u}|r| j	||||||	|d d |
|d	}|d d \}}	|jtjkryt| ryt|jjd }tj|| |d}||dd   }| 
|}|jtjkrt| rt|jjd }tj|| |d}|f}|
r||	f | }|S || }|S )N)rz   r{   r|   r  r  r}   r  r3   i  )rt   ru   r4   r   )r  rz   r{   r|   r  r	  r  r}   )r#  r9   r(   r:   isinfanyrs   ru   clampr$  r   )r,   r;   rz   r{   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr|   cross_attn_layer_head_maskr  r  r}   r   r  r   Zattention_outputsZclamp_valueZdo_cross_attentionZcross_attention_outputsr   r1   r1   r2   r<     sV   


zPix2StructTextBlock.forwardr  )NNNNNNNNFFTNr   r1   r1   r/   r2   r"    s    r"  z3
    The standalone text decoder of Pix2Struct
    )Zcustom_introc                #       s  e Zd ZeZdgZdgZdZ fddZdd Z	dd	 Z
d
d Zdd Zdd Ze														d.deej deej deej deej deej deej deej deeeej   dee dee dee deej dee deej deeejd f ef fd!d"Z	#d/deejd$f d%ejdejdedef
d&d'Zedejd(ed)ed*ejdejd+efd,d-Z  ZS )0r   r"  zlm_head.weightTc                    s   t    t j j| _t fddt j	D | _
t j jd| _t j| _tj j jdd| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   r  )r"  r   )r   r   r   r1   r2   r     s    z0Pix2StructTextModel.__init__.<locals>.<listcomp>r   FrW   )r%   r&   r   rI   Z
vocab_sizer-   embed_tokensr   r   
num_layersr   r#   r   final_layer_normrL   rM   rN   rG   r   r   ra   rO   r/   r   r2   r&   
  s   

zPix2StructTextModel.__init__c              	   C   s   |d u rt d |S d}|D ]M}d}|D ]}||d||jf }q|d j|d jkr@td|d j d|d j dt|t|krWtdt| dt| d||f }q|S )	NzHYou might want to consider setting `use_cache=True` to speed up decodingr1   r   z%reordered_layer_past_states[0] shape z  and layer_past_states[0] shape z mismatchedz&length of reordered_layer_past_states z! and length of layer_past_states )r   warningZindex_selectr5   ri   rn   r   len)r,   past_key_valuesZbeam_idxZreordered_decoder_pastZlayer_past_statesZreordered_layer_past_statesZlayer_past_stater1   r1   r2   _reorder_cache  s(   
z"Pix2StructTextModel._reorder_cachec                 C      | j S rF   r,  r   r1   r1   r2   r   <     z(Pix2StructTextModel.get_input_embeddingsc                 C   
   || _ d S rF   r4  r,   Znew_embeddingsr1   r1   r2   set_input_embeddings?     
z(Pix2StructTextModel.set_input_embeddingsc                 C   r3  rF   r   r   r1   r1   r2   get_output_embeddingsB  r5  z)Pix2StructTextModel.get_output_embeddingsc                 C   r6  rF   r:  r7  r1   r1   r2   set_output_embeddingsE  r9  z)Pix2StructTextModel.set_output_embeddingsNr   rz   r(  r)  inputs_embedsr   cross_attn_head_maskr1  r  r}   r   labelsr   r  rE   .c           -      K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|durE| }|d|d }n|durR| dd }ntd|du rh| jduscJ d| |}|\}}d}d}|	sv|durt	|t
rt	|tsd}t|t }nt	|tsd}td t|}n|du rtt t }d	}|dur|d	 }n|dur| }|du rtj||| |jd
}|du r|dur| | n|}tj|||jd
}| j jr| ||||dur|jnd|
}n|ddddddf }|j|jd}d| t|jj }|dur7| \}}}||f}|du r1tj||jd
}| |}nd}| || j j}| || j j}|rNdnd}|
rUdnd}|
r\dnd}d} d}!| |}"t | j!D ]\}#}$||# }%||# }&|r||"f }| j"r| j#r|	rt$d d}	| %|$j&|"|| |||!|%|&d|	|
|}'n|$|"|| |||!|%|&||	|
|d}'|	du r|'dd d |'dd  }'|'dd \}"}(|'d } |dur|'|
rdnd }!|
r||'d f }|dur||'d f }ql| '|"}"| |"}"| (|"})|r||"f }d}*|dur8||)j}t)j*ddd}+|+|)+ d|)d|+ d}*|	r=|(nd},|rE|j},|rL|, },|s^t-dd |*|)|,|||fD S t.|*|)|,|||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer4   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddingsFTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   ri   )r9   r   r1   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)rz   r{   r(  r)  r*  r|   r+  r  r  r}   r  r   rF   r3      r      r   r8   )Zignore_indexZ	reductionc                 s   r   rF   r1   r   r1   r1   r2   r   /  s    	z.Pix2StructTextModel.forward.<locals>.<genexpr>)losslogitsr1  r;   r   cross_attentions)/rD   r  r}   r   r   r   sizerc   r,  r   r
   r   r   r   r   Zfrom_legacy_cacheget_seq_lengthr(   r  ri   r)   Z
is_decoder_update_causal_maskr  r5   r9   rs   rt   Zinvert_attention_maskr   r-  rN   r   r   ra   rm   r/  r   r<   r.  r   r   ZCrossEntropyLossrb   Zto_legacy_cacher   r   )-r,   r   rz   r(  r)  r=  r   r>  r1  r  r}   r   r?  r   r  kwargsZinput_shaperf   r~   Zreturn_legacy_cacheZreturn_self_attention_cachepast_key_values_lengthZmask_seq_lengthr  Zencoder_batch_sizeZencoder_sequence_lengthr   Zencoder_hidden_shapeZencoder_extended_attention_maskr   Zall_attentionsZall_cross_attentionsr{   r*  r;   r   r   r|   r+  r   Znext_decoder_cacherD  rC  Zloss_fctZ
next_cacher1   r1   r2   r<   H  s4  3














&zPix2StructTextModel.forwardFr!   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2r   Zflex_attentionr   FZsdpa)r=  rJ  Zis_trainingr   r4   )sequence_lengthtarget_lengthr9   r  rf   )cudaZxpuZnpu)rD   Z_attn_implementationr&  r   r(   rT   r"   rG  Zis_compileabler   Z_ignore_causal_mask_sdparm   r9   rn   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionri   typers   rt   Z_unmask_unattended)r,   rz   rK  r  r1  r}   Zpast_seen_tokensZusing_compilable_cacher9   rL  rM  r  	min_dtyper1   r1   r2   rH  E  sT   




z'Pix2StructTextModel._update_causal_maskrL  rM  r9   rf   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        NrA  )Z
fill_valuer9   ri   r   )Zdiagonalr@  r4   r   )rj   r(   rs   rt   r   ri   Ztriur  Zreshapeexpandr   rn   r5   rr   )rz   rL  rM  r9   r  rf   rI  r  rQ  Zmask_lengthZpadding_maskr1   r1   r2   rO    s,    $
6  zIPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNNNN)F) r>   r?   r@   r   r   r   _tied_weights_keysr   r&   r2  r   r8  r;  r<  r   r   r(   
LongTensorFloatTensorrT   r   r   r   r   r<   r
   rH  r  r   r9   rO  rA   r1   r1   r/   r2   r     s    	
  
Dr   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                &       sR  e Zd ZeZdZdgZdef fddZdd Zdd	 Z	d
e
jfddZdd Zdd Zdd Ze																d%deej deej deej deej deej deej deej deeeej   deeeej   deej deej dee dee d ee d!ee d"eej d
eeej ef f"d#d$Z  ZS )&"Pix2StructForConditionalGenerationrP   zdecoder.lm_head.weightrD   c                    s8   t  | t|j| _t|j| _|j| _| 	  d S rF   )
r%   r&   r   Zvision_configr   r   r   decoderZis_vqar   rO   r/   r1   r2   r&     s
   z+Pix2StructForConditionalGeneration.__init__c                 C   
   | j  S rF   )rW  r   r   r1   r1   r2   r     r9  z7Pix2StructForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rF   )rW  r8  r7  r1   r1   r2   r8       z7Pix2StructForConditionalGeneration.set_input_embeddingsrE   c                 C   rX  rF   )rW  r;  r   r1   r1   r2   r;    r9  z8Pix2StructForConditionalGeneration.get_output_embeddingsc                 C   rY  rF   )rW  r<  r7  r1   r1   r2   r<    rZ  z8Pix2StructForConditionalGeneration.set_output_embeddingsc                 C   r3  rF   )rW  r   r1   r1   r2   get_decoder  r5  z.Pix2StructForConditionalGeneration.get_decoderc                 C   r3  rF   )r   r   r1   r1   r2   get_encoder  r5  z.Pix2StructForConditionalGeneration.get_encoderNrz   r   r   r   decoder_head_maskr>  r   r1  r?  decoder_inputs_embedsr  r}   r   r   r  c                 C   sH  |dur|n| j jj}|dur|n| j j}|du r%| j||||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }|
duru|du ru|du ru| |
}|durd|n|	| j j
 }d|dddf< | j||||	||||||||
||d}|s|| S t|j|j|j|j|j|j|j|j|jd	S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rP   rz   r   r}   r   r   r   r   r3   r   )r   rz   r=  r1  r(  r)  r   r>  r  r}   r   r?  r   r  )	rC  rD  r1  Zdecoder_hidden_statesZdecoder_attentionsrE  Zencoder_last_hidden_stater(  Zencoder_attentions)rD   r   r  r   r   r   r   r0  r   ner   r   rW  r   rC  rD  r1  r;   r   rE  r   )r,   rP   rz   r   r   r   r]  r>  r   r1  r?  r^  r  r}   r   r   r  r;   Zdecoder_outputsr1   r1   r2   r<     sl   r
z*Pix2StructForConditionalGeneration.forward)NNNNNNNNNNNNNNNN)r>   r?   r@   r   r   r   rS  r&   r   r8  r   Moduler;  r<  r[  r\  r   r   r(   rU  rT  Z
BoolTensorrT   r   r   r   r   r<   rA   r1   r1   r/   r2   rV    s~    	
rV  )r   rV  r   r   )KrS   r  typingr   r   r   r   r   r(   Ztorch.utils.checkpointr   Zactivationsr	   Zcache_utilsr
   r   r   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_outputsr   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   utilsr   r   r   r   r   r   r   Zconfiguration_pix2structr   r   r    Z!torch.nn.attention.flex_attentionr!   Zintegrations.flex_attentionr"   Z
get_loggerr>   r   r`  r#   Zapex.normalizationrB   infoImportError	Exceptionr/  appendrC   rU   r   r   r   r   r   r   r   r   r  r!  r"  r   rV  __all__r1   r1   r1   r2   <module>   sx   $	


$b+5|p U%&\   A X