o
    Zh                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlZd dlmZm	Z	m
Z
 d dlmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ eeZ G dd de
j!Z"G dd de
j!Z#G dd de
j!Z$G dd de
j!Z%G dd de
j!Z&G dd de
j!Z'G dd de
j!Z(G dd de
j!Z)G dd de
j!Z*G dd  d e
j!Z+G d!d" d"e
j!Z,G d#d$ d$e
j!Z-G d%d& d&eZ.G d'd( d(e.Z/G d)d* d*e.eZ0g d+Z1dS ),    N)ListOptionalTupleUnion)Tensordevicenn)CrossEntropyLoss   )ACT2FN)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModelapply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging   )BlipTextConfigc                       s\   e Zd ZdZ fddZ				ddeej deej deej d	e	d
ej
f
ddZ  ZS )BlipTextEmbeddingsz;Construct the embeddings from word and position embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _
t|j| _| jdt|jddd t|dd| _|| _d S )	N)Zpadding_idxZepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizeZpad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutZregister_buffertorcharangeexpandgetattrr   configselfr/   	__class__ Z/var/www/auris/lib/python3.10/site-packages/transformers/models/blip/modeling_blip_text.pyr   1   s   

zBlipTextEmbeddings.__init__Nr   	input_idsr   inputs_embedspast_key_values_lengthreturnc           	      C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u r/| |}|}| jdkr?| |}||7 }| |}| |}|S )Nr   r   r   )sizer   r#   r   r%   r&   r*   )	r1   r6   r   r7   r8   input_shape
seq_length
embeddingsr%   r4   r4   r5   forwardC   s   





zBlipTextEmbeddings.forward)NNNr   )__name__
__module____qualname____doc__r   r   r+   Z
LongTensorFloatTensorintr   r>   __classcell__r4   r4   r2   r5   r   .   s$    r   c                       s   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Z						dde	j
dee	j dee	j dee	j dee	j deeee	j   dee dee	j
 fddZ  ZS )BlipTextSelfAttentionc                    s   t    || _|j|j dkrt|dstd|j|jf |j| _t|j|j | _| j| j | _	t
|j| j	| _|rQt
|j| j	| _t
|j| j	| _nt
|j| j	| _t
|j| j	| _t
|j| _t|dd| _| jdks{| jdkr|j| _t
d|j d	 | j| _d S d S )
Nr   Zembedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r   r   relative_keyrelative_key_query   r   )r   r   r/   r"   num_attention_headshasattr
ValueErrorrD   attention_head_sizeall_head_sizer   LinearqueryZencoder_hidden_sizekeyvaluer(   Zattention_probs_dropout_probr*   r.   r   r$   r    distance_embeddingr1   r/   is_cross_attentionr2   r4   r5   r   c   s.   

zBlipTextSelfAttention.__init__c                 C   
   || _ d S Nattn_gradients)r1   rY   r4   r4   r5   save_attn_gradients~      
z)BlipTextSelfAttention.save_attn_gradientsc                 C      | j S rW   rX   r1   r4   r4   r5   get_attn_gradients      z(BlipTextSelfAttention.get_attn_gradientsc                 C   rV   rW   attention_map)r1   ra   r4   r4   r5   save_attention_map   r[   z(BlipTextSelfAttention.save_attention_mapc                 C   r\   rW   r`   r]   r4   r4   r5   get_attention_map   r_   z'BlipTextSelfAttention.get_attention_mapc                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   rI   r   r
   )r:   rJ   rM   viewpermute)r1   xZnew_x_shaper4   r4   r5   transpose_for_scores   s   
z*BlipTextSelfAttention.transpose_for_scoresNFhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsr9   c                 C   sf  |  |}|d u}	|	r| | |}
| | |}|}n;|d urI| | |}
| | |}tj|d |
gdd}
tj|d |gdd}n| | |}
| | |}| |}|
|f}t||
dd}| jdksv| jdkr|	 d }tj
|tj|jd	dd}tj
|tj|jd	dd}|| }| || j d }|j|jd
}| jdkrtd||}|| }n| jdkrtd||}td|
|}|| | }|t| j }|d ur|||j }tjdd|}| |}|d ur|| }t||}|dddd }|	 d d | jf }|j| }|r)||fn|f}||f }|S )Nr   rI   dimr   r   rG   rH   )dtyper   rr   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   )rP   rg   rQ   rR   r+   catmatmulZ	transposer   r:   r,   longr   rd   rS   r$   torr   ZeinsummathsqrtrM   r   ZSoftmaxr*   re   
contiguousrN   )r1   rh   ri   rj   rk   rl   rm   rn   Zmixed_query_layerrU   Z	key_layerZvalue_layerZquery_layerZattention_scoresr<   Zposition_ids_lZposition_ids_rZdistanceZpositional_embeddingZrelative_position_scoresZrelative_position_scores_queryZrelative_position_scores_keyZattention_probsZattention_probs_droppedZcontext_layerZnew_context_layer_shapeoutputsr4   r4   r5   r>      sX   









zBlipTextSelfAttention.forwardNNNNNF)r?   r@   rA   r   rZ   r^   rb   rc   rg   r+   r   r   rC   r   boolr>   rE   r4   r4   r2   r5   rF   b   s<    	rF   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )BlipTextSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r   r   r   rO   r"   denser&   r'   r(   r)   r*   r0   r2   r4   r5   r         
zBlipTextSelfOutput.__init__rh   input_tensorr9   c                 C   &   |  |}| |}| || }|S rW   r   r*   r&   r1   rh   r   r4   r4   r5   r>         

zBlipTextSelfOutput.forwardr?   r@   rA   r   r+   r   r>   rE   r4   r4   r2   r5   r          $r   c                       s   e Zd Zd fdd	Zdd Z						ddejdeej d	eej d
eej deej dee	e	ej   dee
 de	ej fddZ  ZS )BlipTextAttentionFc                    s,   t    t||| _t|| _t | _d S rW   )r   r   rF   r1   r   outputsetpruned_headsrT   r2   r4   r5   r      s   

zBlipTextAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   ro   )lenr   r1   rJ   rM   r   r   rP   rQ   rR   r   r   rN   union)r1   headsindexr4   r4   r5   prune_heads   s   zBlipTextAttention.prune_headsNrh   ri   rj   rk   rl   rm   rn   r9   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )r1   r   )r1   rh   ri   rj   rk   rl   rm   rn   Zself_outputsattention_outputr{   r4   r4   r5   r>   	  s   
	zBlipTextAttention.forward)Fr|   )r?   r@   rA   r   r   r+   r   r   rC   r   r}   r>   rE   r4   r4   r2   r5   r      s4    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )BlipTextIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rW   )r   r   r   rO   r"   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr0   r2   r4   r5   r   #  s
   
zBlipTextIntermediate.__init__rh   r9   c                 C      |  |}| |}|S rW   )r   r   r1   rh   r4   r4   r5   r>   +     

zBlipTextIntermediate.forwardr   r4   r4   r2   r5   r   "  s    r   c                       r~   )BlipTextOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r   r   r   rO   r   r"   r   r&   r'   r(   r)   r*   r0   r2   r4   r5   r   3  r   zBlipTextOutput.__init__rh   r   r9   c                 C   r   rW   r   r   r4   r4   r5   r>   9  r   zBlipTextOutput.forwardr   r4   r4   r2   r5   r   2  r   r   c                       s   e Zd Z fddZ						ddejdeej deej deej d	eej d
eeeej   dee	 deej fddZ
dd Z  ZS )BlipTextLayerc                    s`   t    || _|j| _d| _t|| _|| _| jjr$t|| jjd| _	t
|| _t|| _d S )Nr   )rU   )r   r   r/   chunk_size_feed_forwardseq_len_dimr   	attention	layer_num
is_decodercrossattentionr   intermediater   r   )r1   r/   r   r2   r4   r5   r   A  s   


zBlipTextLayer.__init__NFrh   ri   rj   rk   rl   rm   rn   r9   c                 C   s   |d ur
|d d nd }| j |||||d}	|	d }
|	dd }|	d }|d ur?| j|
|||||d}|d }
||dd  }t| j| j| j|
}|f| }||f }|S )NrI   )rn   rm   r   r   r   )rn   )r   r   r   feed_forward_chunkr   r   )r1   rh   ri   rj   rk   rl   rm   rn   Zself_attn_past_key_valueZself_attention_outputsr   r{   Zpresent_key_valueZcross_attention_outputslayer_outputr4   r4   r5   r>   M  s8   

zBlipTextLayer.forwardc                 C   s   |  |}| ||}|S rW   )r   r   )r1   r   Zintermediate_outputr   r4   r4   r5   r   y  s   
z BlipTextLayer.feed_forward_chunkr|   )r?   r@   rA   r   r+   r   r   rC   r   r}   r>   r   rE   r4   r4   r2   r5   r   @  s4    	
,r   c                       s   e Zd Z fddZ									ddejdeej deej d	eej d
eej deeeej   dee	 dee	 dee	 dee	 de
eej ef fddZ  ZS )BlipTextEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |qS r4   )r   ).0ir/   r4   r5   
<listcomp>      z,BlipTextEncoder.__init__.<locals>.<listcomp>F)	r   r   r/   r   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingr0   r2   r   r5   r     s   
 
zBlipTextEncoder.__init__NFTrh   ri   rj   rk   rl   past_key_values	use_cachern   output_hidden_statesreturn_dictr9   c                 C   s^  | j r| jr|rtd d}|	rdnd }|rdnd }|r#| jjr#dnd }|r)dnd }t| jjD ]]}| j| }|	r?||f }|d urG|| nd }|d urQ|| nd }| j rg| jrg| 	|j
|||||||}n
||||||||}|d }|r~||d f7 }|r||d f }||d f }q1|	r||f }|
stdd	 |||||fD S t|||||d
S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr4   r   r   r   rI   c                 s   s    | ]	}|d ur|V  qd S rW   r4   )r   vr4   r4   r5   	<genexpr>  s    z*BlipTextEncoder.forward.<locals>.<genexpr>)last_hidden_stater   rh   
attentionscross_attentions)r   Ztrainingloggerwarningr/   r   r   r   r   Z_gradient_checkpointing_func__call__tupler   )r1   rh   ri   rj   rk   rl   r   r   rn   r   r   Zall_hidden_statesZall_self_attentionsZall_cross_attentionsZnext_decoder_cacher   Zlayer_moduleZlayer_head_maskrm   Zlayer_outputsr4   r4   r5   r>     sz   



zBlipTextEncoder.forward)	NNNNNNFFT)r?   r@   rA   r   r+   r   r   rC   r   r}   r   r   r>   rE   r4   r4   r2   r5   r     sD    		
r   c                       r   )BlipTextPoolerc                    s*   t    t|j|j| _t | _d S rW   )r   r   r   rO   r"   r   ZTanh
activationr0   r2   r4   r5   r     s   
zBlipTextPooler.__init__rh   r9   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r1   rh   Zfirst_token_tensorpooled_outputr4   r4   r5   r>     s   

zBlipTextPooler.forwardr   r4   r4   r2   r5   r     s    r   c                       r   )BlipTextPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )r   r   r   rO   r"   r   r   r   r   r   transform_act_fnr&   r'   r0   r2   r4   r5   r     s   
z(BlipTextPredictionHeadTransform.__init__rh   r9   c                 C   s"   |  |}| |}| |}|S rW   )r   r   r&   r   r4   r4   r5   r>     s   


z'BlipTextPredictionHeadTransform.forwardr   r4   r4   r2   r5   r     s    	r   c                       s,   e Zd Z fddZdd Zdd Z  ZS )BlipTextLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)bias)r   r   r   	transformr   rO   r"   r!   decoder	Parameterr+   Zzerosr   r0   r2   r4   r5   r     s
   

z!BlipTextLMPredictionHead.__init__c                 C   s   | j | j_ d S rW   )r   r   r]   r4   r4   r5   _tie_weights  s   z%BlipTextLMPredictionHead._tie_weightsc                 C   r   rW   )r   r   r   r4   r4   r5   r>     r   z BlipTextLMPredictionHead.forward)r?   r@   rA   r   r   r>   rE   r4   r4   r2   r5   r      s    r   c                       r   )BlipTextOnlyMLMHeadc                    s   t    t|| _d S rW   )r   r   r   predictionsr0   r2   r4   r5   r     s   
zBlipTextOnlyMLMHead.__init__sequence_outputr9   c                 C   s   |  |}|S rW   )r   )r1   r   prediction_scoresr4   r4   r5   r>     s   
zBlipTextOnlyMLMHead.forwardr   r4   r4   r2   r5   r     s    r   c                   @   s$   e Zd ZdZeZdZg Zdd ZdS )BlipTextPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bertc                 C   s~   t |tjtjfr|jjjd| jjd nt |tj	r(|j
j  |jjd t |tjr;|j
dur=|j
j  dS dS dS )zInitialize the weightsg        )meanZstd      ?N)r   r   rO   r    weightdataZnormal_r/   Zinitializer_ranger&   r   Zzero_Zfill_)r1   moduler4   r4   r5   _init_weights-  s   z%BlipTextPreTrainedModel._init_weightsN)	r?   r@   rA   rB   r   Zconfig_classZbase_model_prefixZ_no_split_modulesr   r4   r4   r4   r5   r   #  s    r   c                !       s  e Zd ZdZd" fdd	Zdd Zdd Zd	d
 Zdede	e
 dededef
ddZ														d#deej deej deej deej deej deej deej deej deeej  dee dee dee dee dee dee	ej ef fd d!Z  ZS )$BlipTextModela  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
    `encoder_hidden_states` is then expected as an input to the forward pass.
    Tc                    sD   t  | || _t|| _t|| _|rt|nd | _| 	  d S rW   )
r   r   r/   r   r=   r   encoderr   poolerZ	post_init)r1   r/   add_pooling_layerr2   r4   r5   r   D  s   

zBlipTextModel.__init__c                 C   s   | j jS rW   r=   r#   r]   r4   r4   r5   get_input_embeddingsN  s   z"BlipTextModel.get_input_embeddingsc                 C   s   || j _d S rW   r   )r1   rR   r4   r4   r5   set_input_embeddingsQ  s   z"BlipTextModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r1   Zheads_to_pruner   r   r4   r4   r5   _prune_headsU  s   zBlipTextModel._prune_headsri   r;   r   r   r9   c                 C   sX  |  dkr|dddddddf }n|  dkr|r|\}}tj||d}|ddddf ||d|ddddf k}	|	|j}	|	jd |jd k rl|jd |	jd  }
tjtj|||
f||	jd|	gdd}	|	dddddddf |ddddddf  }n|ddddddf }n	t	d	
||j|j| jd
}d| d }|S )a=  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        r
   NrI   r   r   )r   rr   r   )ZaxiszAWrong shape for input_ids (shape {}) or attention_mask (shape {})rs   r   g     )rp   r+   r,   repeatrw   rr   shapert   onesrL   format)r1   ri   r;   r   r   extended_attention_mask
batch_sizer<   Zseq_idsZcausal_maskZprefix_seq_lenr4   r4   r5   get_extended_attention_mask]  s8   .6
z)BlipTextModel.get_extended_attention_maskNFr6   r   rj   r7   encoder_embedsrk   rl   r   r   rn   r   r   c                    sv  |dur|n j j}|dur|n j j}|dur|n j j}|r+|
dur&|
n j j}
nd}
|dur9|dur9td|durO || | }|\}}|j}n,|durc| dd }|\}}|j}n|durw| dd }|\}}|j}ntd|	dur|	d d j	d nd}|du rt
||| f|} ||||}|durt|tr|d  \}}}n| \}}}||f}t|trχ fdd	|D }n|du rt
j||d
} |}n |}nd} | j j}|du r j||||d}n|} j||||||	|
|||d
}|d } jdur |nd}|s-||f|dd  S t|||j|j|j|jdS )a.  
        encoder_hidden_states  (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   zGYou have to specify either input_ids or inputs_embeds or encoder_embedsr   rI   c                    s   g | ]}  |qS r4   )invert_attention_mask)r   maskr]   r4   r5   r     r   z)BlipTextModel.forward.<locals>.<listcomp>r   )r6   r   r7   r8   )	ri   rj   rk   rl   r   r   rn   r   r   r   )r   Zpooler_outputr   rh   r   r   )r/   rn   r   use_return_dictr   rL   Z%warn_if_padding_and_no_attention_maskr:   r   r   r+   r   rw   r   r   listr   Zget_head_maskr   r=   r   r   r   r   rh   r   r   )r1   r6   ri   r   rj   r7   r   rk   rl   r   r   rn   r   r   r   r;   r   r<   r   r8   r   Zencoder_batch_sizeZencoder_sequence_length_Zencoder_hidden_shapeZencoder_extended_attention_maskZembedding_outputZencoder_outputsr   r   r4   r]   r5   r>     s   #

zBlipTextModel.forward)T)NNNNNNNNNNNNNF)r?   r@   rA   rB   r   r   r   r   r   r   rD   r   r}   r   r   r+   r   rC   r   r   r>   rE   r4   r4   r2   r5   r   ;  s|    

B	
r   c                %       s"  e Zd Z fddZdd Zdd Zdd Zd	d
 Z																d&dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 deee	j
  dee dee dee dee dee dee dee deee	j
 ef f"d d!Zd'd"d#Zd$d% Z  ZS )(BlipTextLMHeadModelc                    s0   t  | t|dd| _t|| _|j| _d S )NF)r   )r   r   r   r   r   clslabel_smoothingr0   r2   r4   r5   r   +  s   
zBlipTextLMHeadModel.__init__c                 C   s
   | j  S rW   )r   r   r]   r4   r4   r5   r   2  r[   z(BlipTextLMHeadModel.get_input_embeddingsc                 C   s   | j | d S rW   )r   r   r1   Znew_embeddingsr4   r4   r5   r   5  s   z(BlipTextLMHeadModel.set_input_embeddingsc                 C   s
   | j jjS rW   )r   r   r   r]   r4   r4   r5   get_output_embeddings8  r[   z)BlipTextLMHeadModel.get_output_embeddingsc                 C   s   || j j_|j| j j_d S rW   )r   r   r   r   r   r4   r4   r5   set_output_embeddings;  s   
z)BlipTextLMHeadModel.set_output_embeddingsNFTr   r6   ri   r   rj   r7   rk   rl   labelsr   r   rn   r   r   return_logitsr   	reductionr9   c                 C   sT  |dur|n| j j}|durd}
| j||||||||	|
||||d}|d }| |}|r<|ddddddf  S d}|dur|ddddddf  }|ddddf  |j}t|| jd}||	d| j j
|	d}|dkr|	|ddd}|s|f|d	d  }|dur|f| S |S t|||j|j|j|jd
S )a  
        encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
            configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NF)ri   r   rj   r7   rk   rl   r   r   rn   r   r   r   r   r   r   )r   r   nonerI   )ZlossZlogitsr   rh   r   r   )r/   r   r   r   rz   rw   r   r	   r   rd   r!   r:   sumr   r   rh   r   r   )r1   r6   ri   r   rj   r7   rk   rl   r   r   r   rn   r   r   r   r   r   r{   r   r   Zlm_lossZshifted_prediction_scoresZloss_fctr   r4   r4   r5   r>   ?  sR   )
 zBlipTextLMHeadModel.forwardc                 K   s   |j }|d u r||}|d ur4|d d j d }|j d |kr#|}n|j d d }|d d |d f }||||dd |dd ddS )Nr   rI   r   rk   rl   T)r6   ri   r   rk   rl   r   )r   Znew_onesget)r1   r6   r   ri   Zmodel_kwargsr;   Zpast_lengthZremove_prefix_lengthr4   r4   r5   prepare_inputs_for_generation  s    


z1BlipTextLMHeadModel.prepare_inputs_for_generationc                    s.   d}|D ]}|t  fdd|D f7 }q|S )Nr4   c                 3   s$    | ]}| d  |jV  qdS )r   N)Zindex_selectrw   r   )r   Z
past_statebeam_idxr4   r5   r     s   " z5BlipTextLMHeadModel._reorder_cache.<locals>.<genexpr>)r   )r1   r   r   Zreordered_pastZ
layer_pastr4   r   r5   _reorder_cache  s   z"BlipTextLMHeadModel._reorder_cache)NNNNNNNNNNNNNFTr   )NN)r?   r@   rA   r   r   r   r   r   r   r+   r   r   r}   r   r   r   r   r>   r   r   rE   r4   r4   r2   r5   r   *  sv    	


Zr   )r   r   r   )2rx   typingr   r   r   r   r+   Ztorch.utils.checkpointr   r   r   Ztorch.nnr	   Zactivationsr   Z
generationr   Zmodeling_outputsr   r   r   Zmodeling_utilsr   r   r   r   utilsr   Zconfiguration_blipr   Z
get_loggerr?   r   Moduler   rF   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r4   r4   r4   r5   <module>   s>   
42@^ p 