
"""PyTorch BLIP model."""

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.functional import normalize

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
    """
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss from the text decoder.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
        Prediction scores of the language modeling head of the text decoder model.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
        The image embeddings obtained after applying the Vision Transformer model to the input image.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

    @property
    def decoder_logits(self):
        warnings.warn(
            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the `logits` attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.logits


@dataclass
class BlipTextVisionModelOutput(ModelOutput):
    """
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss from the text decoder.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nr0   r1   r2   .r3   r4   r=   )r>   r?   r@   rA   rB   r0   r   r!   rC   rD   r1   r2   r3   r   r4   rF   r=   r&   r$   rH   rH   _   s    0 )-D(5$$
%,04L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r&   rH   c                   t   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S	'   Sr\\R                     \	S
'   Sr\\\R                  S4      \	S'   Sr\\\R                        \	S'   Srg) BlipImageTextMatchingModelOutput   a"  
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
scores.

Args:
    itm_score (`torch.FloatTensor`):
        The image-text similarity scores.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss from the text decoder.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
        Last layer hidden-state of the vision-only branch of the model.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    question_embeds (`torch.FloatTensor`):
        The question embeddings obtained by the text projection layer.
N	itm_scorer0   r1   r2   .r3   vision_pooler_outputr4   question_embedsr=   )r>   r?   r@   rA   rB   rM   r   r!   rC   rD   r0   r1   r2   r3   r   rN   r4   rO   rF   r=   r&   r$   rK   rK      s    > .2Ix))*1(,D(5$$
%,04L(5,,-459x 1 129=AM8E%"3"3S"89:A8<(5#4#45<:>Ju00#567>:>OXeE$5$567>r&   rK   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)
BlipOutput   a  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`BlipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`BlipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: Optional[BaseModelOutputWithPooling] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
r&   rQ   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )BlipVisionEmbeddings   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr   r
   )in_channelsout_channelskernel_sizestride   )super__init__rf   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr!   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr:   rf   	__class__s     r$   rn   BlipVisionEmbeddings.__init__   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"br&   
embeddingsheightwidthr   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Ng      ?r   r
   rl   bicubicF)sizemodealign_cornersdim)shaperz   r!   jit
is_tracingrr   r   reshapepermuter   r   interpolateviewcat)r:   r~   r   r   rx   ry   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss               r$   interpolate_pos_encoding-BlipVisionEmbeddings.interpolate_pos_encoding   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr&   pixel_valuesr   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )Ndtyperl   r   r   r   )r   rw   weightr   toflatten	transposeru   expandr!   r   r   rz   r   )r:   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr~   rz   s               r$   forwardBlipVisionEmbeddings.forward  s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
r&   )	ru   rf   rp   rq   rx   ry   rw   rr   rz   F)r>   r?   r@   rA   r   rn   r!   Tensorintr   rC   boolr   rF   __classcell__r|   s   @r$   rd   rd      sr    c/ c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  r&   rd   c            	          ^  \ rS rSrS\4U 4S jjr   S
S\\R                     S\\R                     S\\R                     S\R                  4S jjrS	rU =r$ )BlipTextEmbeddingsi  rf   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nposition_ids)r   r   F)
persistent)rm   rn   ro   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsrz   register_bufferr!   r"   r   r:   rf   rp   r|   s      r$   rn   BlipTextEmbeddings.__init__  s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r&   	input_idsr   inputs_embedsr   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nr   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   rz   r   
ValueErrorr   r   )r:   r   r   r   
seq_lengthmax_position_embeddingposition_embeddingsr~   s           r$   r   BlipTextEmbeddings.forward(  s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r&   )rz   r   )NNN)r>   r?   r@   rA   r   rn   r   r!   
LongTensorrC   r   r   rF   r   r   s   @r$   r   r     so    

~ 

 153759	E,,- u//0   1 12	
 
 r&   r   c                     ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	  SS\R                  S	\
\R                     S
\
\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )BlipAttentioniC  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        [        R                  " UR                  5      U l        [        R                  " U R                  SU R                  -  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r
   )rm   rn   rf   ro   rp   num_attention_heads	num_headshead_dimr   scaler   Dropoutattention_dropoutdropoutLinearqkv
projectionr{   s     r$   rn   BlipAttention.__init__F  s    ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
zz&":":;99T^^Q-?@))DNNDNNCr&   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   rl   )r   r   r   r   
contiguous)r:   r   r   r   s       r$   _shapeBlipAttention._shapeX  s5    {{3GQQRSUVWbbddr&   r3   	head_maskoutput_attentionsr   c                    UR                  5       u  pEnU R                  U5      R                  XESU R                  X`R                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        R                  " XR                  SS5      5      nXR                  -  n[        R                  R                  USS9nU R                  U5      nUb  X-  n[
        R                  " X5      R	                  SSSS5      nUR                  5       S	S U R                  4-   nUR                  U5      nU R                  U5      nU(       a  X4nU$ US	4nU$ )
z#Input shape: Batch x Time x Channelr
   rl   r   r      r   r   r   N)r   r   r   r   r   r!   matmulr   r   r   r   softmaxr   rp   r   )r:   r3   r   r   r   tgt_lenrp   	mixed_qkvquery_states
key_statesvalue_statesattention_scoresattention_probscontext_layernew_context_layer_shapeoutputoutputss                    r$   r   BlipAttention.forward[  s[    #0"4"4"6i HH]#WS1dnni>>6QRWQ1a# 	
 2;1y|YWX\, !<<6J6J2r6RS+jj8 --//0@b/I ,,7  -9O_CKKAqRSUVW"/"4"4"6s";t~~>O"O%--.EF//@6+ HNtnr&   )rf   r   rp   r   r   r   r   r   NF)r>   r?   r@   rA   rB   rn   r!   r   r   r   r   r   r   r   rF   r   r   s   @r$   r   r   C  s    GD$eU\\ eC ec e -1,1	*||* ELL)* $D>	*
 
u||Xell3XeELL>Q5RR	S* *r&   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)rm   rn   rf   r   
hidden_actactivation_fnr   r   ro   intermediate_sizefc1fc2r{   s     r$   rn   BlipMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr&   r3   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )r:   r3   s     r$   r   BlipMLP.forward  s4    /**=9/r&   )r   rf   r   r   )
r>   r?   r@   rA   rn   r!   r   r   rF   r   r   s   @r$   r   r     s)    KU\\ ell  r&   r   c            
          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S\\	   S\
\R                     4S jjrS	rU =r$ )BlipEncoderLayeri  rf   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g N)eps)rm   rn   ro   rp   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r{   s     r$   rn   BlipEncoderLayer.__init__  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr&   r3   attention_maskr   r   c                     UnU R                  U5      nU R                  UUUS9u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU4nU(       a  Xe4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r3   r   r   )r   r   r  r  )r:   r3   r  r   residualattn_weightsr   s          r$   r   BlipEncoderLayer.forward  s      !((7&*nn'$/ '5 '
#
 &0 ((7/%0 "&Gr&   )rp   r   r  r  r   r   )r>   r?   r@   rA   r   rn   r!   r   r   r   r   rC   r   rF   r   r   s   @r$   r   r     s]    Sz S -2	$||$ $ $D>	$
 
u  	!$ $r&   r   c                   4    \ rS rSr\rSrSrSS/rS/r	S r
Srg	)
BlipPreTrainedModeli  blipTr   r   past_key_valuec                 r   U R                   R                  n[        U[        R                  5      (       d>  [        U[        R
                  5      (       d  [        U[        R                  5      (       af  UR                  R                  R                  SUS9  [        US5      (       a1  UR                  b$  UR                  R                  R                  5         [        U[        5      (       a  [        U R                   S5      (       a   U R                   R                  R                  n[        R                  R!                  UR"                  SUS9  [        R                  R!                  UR$                  SUS9  g[        U[        R&                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R)                  S5        g[        U[        R                  5      (       a3  UR                  b%  UR                  R                  R                  5         ggg)zInitialize the weightsg        )meanstdbiasNvision_configg      ?)rf   initializer_range
isinstancer   rv   r   r   r   datanormal_hasattrr  zero_rd   r  inittrunc_normal_rz   ru   r   fill_)r:   modulefactors      r$   _init_weights!BlipPreTrainedModel._init_weights  s   ..fbii((Jvr||,L,LPZ[aceclclPmPmMM&&CV&<vv&&6;;+B  &&(f233t{{O4422DDGG!!)) "  GG!!&& "  --KK""$MM$$S)		**v{{/FKK""$ 0G*r&   r=   N)r>   r?   r@   rA   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr  rF   r=   r&   r$   r
  r
    s-    L&*#+-AB#3"4%r&   r
  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\\R                     S\\
   S\\
   S\\
   S	\\\4   4
S
 jjrSrU =r$ )BlipEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`BlipEncoderLayer`].

Args:
    config (`BlipConfig`):
        The corresponding vision configuration for the `BlipEncoder`.
rf   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
rm   rn   rf   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r:   rf   r   r|   s      r$   rn   BlipEncoder.__init__  sS    mmuVMeMeGf$gGf!%5f%=Gf$gh&+# %hs   A&r  r   output_hidden_statesreturn_dictr   c                 H   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn[	        U R
                  5       Hp  u  pU(       a  Xh4-   nU R                  (       a0  U R                  (       a  U R                  U
R                  UUU5      nOU
" UUUS9nUS   nU(       d  Mh  X{S   4-   nMr     U(       a  Xh4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr=   )r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r=   )r\   vs     r$   r^   &BlipEncoder.forward.<locals>.<genexpr>>  s     e$Sq$S   	)r2   r3   r4   )rf   r   r-  use_return_dict	enumerater*  r+  training_gradient_checkpointing_func__call__r`   r   )r:   r   r  r   r-  r.  encoder_statesall_attentionsr3   idxencoder_layerlayer_outputss               r$   r   BlipEncoder.forward  s1   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B**t}} $ A A!**!"%	! !.!"&7! *!,M  !/3C2E!E) #9,  +.>>Ne]N$Seee+Vd
 	
r&   )rf   r+  r*  NNNN)r>   r?   r@   rA   rB   r   rn   r   r!   r   r   r   r   r   r   rF   r   r   s   @r$   r%  r%    s    ,z , 26,0/3&*C
 !.C
 $D>	C

 'tnC
 d^C
 
uo%	&C
 C
r&   r%  c                      ^  \ rS rSrSr\rS\4U 4S jjr\     SS\	\
R                     S\	\   S\	\   S\	\   S\S	\\\4   4S
 jj5       rS rSrU =r$ )BlipVisionModeliD  r   rf   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )rm   rn   rf   ro   rd   r~   r%  encoderr   r   r   post_layernorm	post_initr   s      r$   rn   BlipVisionModel.__init__H  sY     &&	.v6"6* ll9:O:OPr&   r   r-  r.  r   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  UUUUS9nUS   nU R                  U5      nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r   )r   r   r-  r.  r   r   )r2   pooler_outputr3   r4   )rf   r   r-  r4  r   r~   rC  rD  r   r3   r4   )
r:   r   r   r-  r.  r   r3   encoder_outputsr2   pooled_outputs
             r$   r   BlipVisionModel.forwardS  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%58KKK)/')77&11	
 	
r&   c                     U R                   $ r   )r~   r9   s    r$   get_input_embeddings$BlipVisionModel.get_input_embeddings~  s    r&   )rf   r~   rC  rD  NNNNF)r>   r?   r@   rA   main_input_namer   r  rn   r   r   r!   rC   r   r   r   r   r   rM  rF   r   r   s   @r$   rA  rA  D  s    $O#L	/ 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
T r&   rA  z
    This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.
    )custom_introc                     ^  \ rS rSr\rS\4U 4S jjrS rS r\	    SS\
\R                     S\
\R                     S\
\R                     S	\
\   S
\R                  4
S jj5       r\	   SS\
\R                     S	\
\   S\S
\R                  4S jj5       r\	     SS\
\R"                     S\
\R                     S\
\R                     S	\
\   S\S
\R                  4S jj5       r\	         SS\
\R"                     S\
\R                     S\
\R                     S\
\R"                     S\
\   S\
\   S\
\   S	\
\   S\S
\\\4   4S jj5       rSrU =r$ )	BlipModeli  rf   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        UR                  U l        [        U5      U l        [        U5      U l        ["        R$                  " U R                  U R                  SS9U l        ["        R$                  " U R                  U R                  SS9U l        ["        R*                  " [,        R.                  " U R0                  R2                  5      5      U l        [6        R9                  S5        U R;                  5         g )NzKconfig.text_config is expected to be of type BlipTextConfig but is of type .zOconfig.vision_config is expected to be of type BlipVisionConfig but is of type F)r  z`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.)rm   rn   r  text_configr   	TypeErrortyper  r   projection_dimro   text_embed_dimvision_embed_dimr   
text_modelrA  vision_modelr   r   visual_projectiontext_projectionrs   r!   r   rf   logit_scale_init_valuelogit_scaleloggerwarningrE  )r:   rf   rV  r  r|   s       r$   rn   BlipModel.__init__  sn    &,,n==++,-Q0 
 &..0@AA--./q2 
 ((,,$33)55 - 9 9'4+M:!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY G	

 	r&   c                 6    U R                   R                  5       $ r   )r\  rM  r9   s    r$   rM  BlipModel.get_input_embeddings  s    3355r&   c                 :    U R                   R                  U5        g r   )r\  set_input_embeddingsr:   values     r$   rh  BlipModel.set_input_embeddings  s    ,,U3r&   r   r  r   r.  r   c                     Ub  UOU R                   R                  nU R                  UUUUS9nUS   nU R                  U5      nU$ )aY  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`BlipTextModel`].

Examples:

```python
>>> from transformers import AutoProcessor, BlipModel

>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```)r   r  r   r.  r   )rf   r4  r\  r_  )r:   r   r  r   r.  text_outputsrJ  text_featuress           r$   get_text_featuresBlipModel.get_text_features  sZ    0 &1%<k$++B]B])%#	 ' 
 %Q,,];r&   r   r   c                     Ub  UOU R                   R                  nU R                  UUUS9nUS   nU R                  U5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`BlipVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipModel

>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```)r   r.  r   r   )rf   r4  r]  r^  )r:   r   r.  r   vision_outputsrJ  image_featuress          r$   get_image_featuresBlipModel.get_image_features  sZ    : &1%<k$++B]B]**%#%= + 
 'q)//>r&   c                 &   Ub  UOU R                   R                  nU R                  USSUUS9nUS   n[        R                  " UR                  5       SS [        R                  S9nU R                  UUUUUS9n	U	S   n
U R                  U
5      nU$ )	aK  
Returns:
    multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The multimodal embeddings
    obtained by applying the image embeddings to the text encoder using the cross-attention mechanism.

Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipModel

>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = ["a photo of a cat", "a photo of a dog"]
>>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt")

>>> multimodal_features = model.get_multimodal_features(**inputs)
```NTr   r   r-  r.  r   r   r   r   r   r  encoder_hidden_statesencoder_attention_maskr.  r   )	rf   r4  r]  r!   onesr   longr\  r_  )r:   r   r   r  r.  r   rr  r1   
image_attsrm  rJ  multimodal_featuress               r$   get_multimodal_features!BlipModel.get_multimodal_features  s    < &1%<k$++B]B]**%"!%#%= + 
 &a(ZZ 1 1 3CR 8

K
)".#-# ' 
 %Q"22=A""r&   return_lossr   r-  c
           
      $   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUU	S9n
U R                  UUUUUUS9nU
S   nU R                  U5      nUS   nU R                  U5      nXR                  SSSS9-  nXR                  SSSS9-  nU R                  R                  5       R                  UR                  S	9nUR                  UR                  UR                  S
9n[        R                  " XR!                  5       5      U-  nUR!                  5       nSnU(       a  [#        U5      nU(       d  UXXU
4nUb  U4U-   $ U$ [%        UUUUUUU
S9$ )a_  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipModel

>>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```Nrw  )r   r  r   r   r-  r.  r   rl   r   T)pr   keepdimr   )r   r   )r0   rS   rT   rU   r1   rV   rW   )rf   r   r-  r4  r]  r\  r^  r_  normra  expr   r   r   r!   r   r)   r,   rQ   )r:   r   r   r  r   r  r   r-  r.  r   rr  rm  r1   rU   ra  rT   rS   r0   r   s                      r$   r   BlipModel.forward;  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5#%= + 
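
When `return_loss=True` is passed, the same call also returns the contrastive loss (a symmetric cross-entropy over
`logits_per_image` and `logits_per_text`) that can be used for fine-tuning; a minimal sketch:

```python
>>> outputs = model(**inputs, return_loss=True)
>>> outputs.loss.backward()
```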
 )%/!5# ' 
 &a(--l;"1o**;7 $&7&7!T&7&RR!$4$4qb$$4$OO &&**,//{7I7I/J#k.@.@HYHYZ,,{NN4DES*,,._-D&lbpqF)-)9TGf$EvE-+#%* .
 	
r&   )ra  rY  rZ  r\  r_  r[  r]  r^  r?  NNFrO  )	NNNNNNNNF)r>   r?   r@   rA   r   r  rn   rM  rh  r   r   r!   r   r   rC   ro  rt  r   r  r   r   rQ   r   rF   r   r   s   @r$   rS  rS    s`    L"z "H64  -115/3&*#ELL)# !.# u||,	#
 d^# 
		# #J  59&*).	'u001' d^' #'	'
 
		' 'R  154815&*).4#E,,-4# u0014# !.	4#
 d^4# #'4# 
		4# 4#l  15481537&*,0/3&*).]
E,,-]
 u001]
 !.	]

 u//0]
 d^]
 $D>]
 'tn]
 d^]
 #']
 
uj 	!]
 ]
r&   rS  a  
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
    the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption
    from the text input. If no text input is provided, the decoder will start with the [BOS] token only.
    c                     ^  \ rS rSr\rS/rSrS\4U 4S jjrS r	S r
\       SS\R                  S\\R                     S	\\R                     S
\\   S\\   S\\R                     S\\   S\S\\\4   4S jj5       r\R*                  " 5          SS\R                  S\\R                     S	\\R                     S\S\R                  4
S jj5       rSrU =r$ )BlipForConditionalGenerationi  )text_decoder.cls.predictions.decoder.biasr   rf   c                   > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        UR                  R                  U l	        UR                  R                  U l        U R                  5         g r   )rm   rn   rA  r  r]  r   rV  text_decoderbos_token_iddecoder_input_idspad_token_iddecoder_pad_token_idrE  r{   s     r$   rn   %BlipForConditionalGeneration.__init__  sj     +F,@,@A/0B0BC!'!3!3!@!@$*$6$6$C$C! 	r&   c                 6    U R                   R                  5       $ r   )r  rM  r9   s    r$   rM  1BlipForConditionalGeneration.get_input_embeddings        5577r&   c                 :    U R                   R                  U5        g r   )r  rh  ri  s     r$   rh  1BlipForConditionalGeneration.set_input_embeddings      ..u5r&   r   r  r   r-  labelsr.  r   r   c	           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUS9n	U	S   n
U R                  UUU
UUSS9nU(       d4  Ub
  US   US   4OUS   4nXU	S   4U	SS -   -  n[        S U 5       5      $ [        UR                  UR                  U
U	R                  U	R                  U	R                  S	9$ )
aH  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
>>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A picture of"

>>> inputs = processor(images=image, text=text, return_tensors="pt")

>>> outputs = model(**inputs)
```Nrw  r   r  )r   r  ry  r  r.  	reductionr   rl   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r=   r\   r   s     r$   r^   7BlipForConditionalGeneration.forward.<locals>.<genexpr>       LgFgr3  )r0   r   r1   r2   r3   r4   )rf   r4  r   r-  r]  r  r`   r.   r0   r   r2   r3   r4   )r:   r   r   r  r   r-  r  r.  r   rr  r1   r   s               r$   r   $BlipForConditionalGeneration.forward  s6   @ &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 **%/!5#%= + 
 &a(##)".# $ 
 282Dwqz71:.7ST:-GnQ&78>!";MMMGLgLLL6>>%,>>(66%00
 	
r&   c           
      <   UR                   S   nU R                  UUS9nUS   n[        R                  " UR	                  5       SS [        R
                  UR                  S9n	[        U[        5      (       a  [        R                  " U5      nOmUcj  [        R                  " U R                  U R                  R                  R                  //5      R                  US5      R                  UR                  5      nU R                  R                  R                   USS2S4'   Ub  USS2SS24   OSnU R"                  R$                  " SUSS2SS24   U R                  R                  R&                  U R                  R                  R(                  UUU	S.UD6n
U
$ )	a  
Overrides *generate* function to be able to use the model as a conditional generator

Parameters:
    pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
        Input image to be processed
    input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForConditionalGeneration

>>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
two cats sleeping on a couch
```
r   r   r   Nr   r   r   r   )r   eos_token_idr  r  ry  rz  r=   )r   r]  r!   r{  r   r|  r   r  listr   r  rf   rV  r  repeatr   r  r  generatesep_token_idr  )r:   r   r   r  r   generate_kwargsr   rr  r1   image_attention_maskr   s              r$   r  %BlipForConditionalGeneration.generate  s~   R "''*
**%%= + 

 &a($zz,*;*;*=cr*B%**]i]p]pqi&&((3I  4#9#94;;;R;R;_;_"`!ab
A&L''(  ++11>>	!Q$3A3M3B3/SW##,, 
3B3'00==00==)".#7
 
 r&   )r  r  r  r]  )NNNNNNFr  )r>   r?   r@   rA   r   r  _tied_weights_keysrP  rn   rM  rh  r   r!   rC   r   r   r   r   r   r.   r   no_gradr  rF   r   r   s   @r$   r  r    sy    LEF$Oz 86  1559,0/3-1&*).D
''D
 E,,-D
 !!1!12	D

 $D>D
 'tnD
 ))*D
 d^D
 #'D
 
u==	>D
 D
L ]]_ 1559).H''H E,,-H !!1!12	H
 #'H 
		H Hr&   r  aS  
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
    c                     ^  \ rS rSr\rS/rS\4U 4S jjrS rS r	\
        SS\R                  S\R                  S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\R                     S\\   S\S\\\4   4S jj5       r\R(                  " 5         SS\R                  S\R                  S\\R                     S\S\R                  4
S jj5       rSrU =r$ )BlipForQuestionAnsweringiO  r  rf   c                 J  > [         TU ]  U5        [        UR                  5      U l        [        UR                  SS9U l        [        UR                  5      U l	        UR                  R                  U l        UR                  R                  U l        U R                  5         g )NFadd_pooling_layer)rm   rn   rA  r  r]  r   rV  text_encoderr   r  r  r  r  decoder_start_token_idrE  r{   s     r$   rn   !BlipForQuestionAnswering.__init__Z  s     +F,@,@A)&*<*<PUV/0B0BC$*$6$6$C$C!&,&8&8&E&E# 	r&   c                 :    U R                   R                  U5        g r   r  rh  ri  s     r$   rh  -BlipForQuestionAnswering.set_input_embeddingsi  r  r&   c                 6    U R                   R                  5       $ r   r  rM  r9   s    r$   rM  -BlipForQuestionAnswering.get_input_embeddingsl  s      5577r&   r   r   r  decoder_attention_maskr  r   r-  r  r.  r   r   c           
         Uc  Uc  [        S5      eU	b  U	OU R                  R                  n	Ub  UOU R                  R                  nUb  UOU R                  R                  nU R                  UUUU	U
S9nUS   n[        R                  " UR                  5       SS [        R                  S9nU R                  UUUUU	S9nUb  Uc  UnU	(       d  US   OUR                  nU R                  UUUUUU	SS	9nUb5  U	(       a  UR                  R                  5       OUS   R                  5       nOSnU	(       d  UXS   4US
S -   n[        S U 5       5      $ [!        UUUR                  UR"                  UR$                  S9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForQuestionAnswering

>>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # training
>>> text = "How many cats are in the picture?"
>>> label = "2"
>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> labels = processor(text=label, return_tensors="pt").input_ids

>>> inputs["labels"] = labels
>>> outputs = model(**inputs)
>>> loss = outputs.loss
>>> loss.backward()

>>> # inference
>>> text = "How many cats are in the picture?"
>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
2
```Na  Either `decoder_input_ids` or `labels` should be passed when calling `forward` with `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`rw  r   r   r   rx  r  )r   r  ry  rz  r  r.  r  rl   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r=   r  s     r$   r^   3BlipForQuestionAnswering.forward.<locals>.<genexpr>  r  r3  )r0   r1   r2   r3   r4   )r   rf   r4  r   r-  r]  r!   r{  r   r|  r  r2   r  r0   r  r`   rH   r3   r4   )r:   r   r   r  r  r  r   r-  r  r.  r   rr  r1   r  rO   answer_outputdecoder_lossr   s                     r$   r    BlipForQuestionAnswering.forwardp  s   \ >/7u  &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 **%/!5#%= + 
 &a($zz,*;*;*=cr*B%**U++)".#7# , 
 "3"; &4?/!,_EfEf))'1"1#1# * 
 8C=--224WXIYI^I^I`LL#\!3DEWXWYHZZGLgLLL(%,>>(66%00
 	
r&   c           	         U R                  UUS9nUS   n[        R                  " UR                  5       SS [        R                  UR
                  S9n[        U[        5      (       a  [        R                  " U5      nU R                  UUUUSS9n	U	S   n
[        R                  " U
R                  5       SS [        R                  U
R
                  S9n[        R                  " U
R                  S5      S4U R                  U
R
                  S	9nU R                  R                  " SUU R                  R                  R                   U R                  R                  R"                  U
US
.UD6nU$ )al  
Overrides *generate* function to be able to use the model as a conditional generator

Parameters:
    input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
        The sequence used as a prompt for the generation.
    pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
        Input image to be processed
    attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
        tokens that are NOT MASKED, `0` for MASKED tokens.
    **generate_kwargs:
        Additional arguments passed to the *generate* function of the decoder


Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForQuestionAnswering

>>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "How many cats are in the picture?"

>>> inputs = processor(images=image, text=text, return_tensors="pt")

>>> outputs = model.generate(**inputs)
>>> print(processor.decode(outputs[0], skip_special_tokens=True))
2
```
r  r   Nr   r  Frx  r   )
fill_valuer   )r   r  r  ry  rz  r=   )r]  r!   r{  r   r|  r   r  r  r   r  fullr  r  r  rf   rV  r  r  )r:   r   r   r  r   r  rr  r1   r  question_outputsrO   question_attention_maskbos_idsr   s                 r$   r  !BlipForQuestionAnswering.generate  si   X **%%= + 

 &a($zz,*;*;*=cr*B%**]i]p]pqi&&((3I,,)".#7 - 
 +1-"'**  "3B'uzz/BXBX#
 **!!!$a(T5P5PYhYoYo
 ##,, 
00==00=="1#:
 
 r&   )r  r  r  r  r]  )NNNNNNNFr   )r>   r?   r@   rA   r   r  r  rn   rh  rM  r   r!   r   rC   r   r   r   r   rH   r   r  r  rF   r   r   s   @r$   r  r  O  s    LEFz 68 
 9==A59,0/3-1&*).l
##l
 ''l
 $E$4$45	l

 !))9)9 :l
 !!1!12l
 $D>l
 'tnl
 ))*l
 d^l
 #'l
 
u//	0l
 l
\ ]]_
 6:).R##R ''R !!1!12	R
 #'R 
		R Rr&   r  a   
    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
    the image.
    c                      ^  \ rS rSr\rS\4U 4S jjrS rS r\	      SS\
R                  S\
R                  S\\   S	\\
R                     S
\\   S\\   S\\   S\S\\\4   4S jj5       rSrU =r$ )BlipForImageTextRetrievali5  rf   c                   > [         TU ]  U5        [        UR                  5      U l        [        UR                  SS9U l        [        R                  " UR                  R                  UR                  5      U l        [        R                  " UR                  R                  UR                  5      U l        [        R                  " UR                  R                  S5      U l        [        US5      (       d  UR                  R                   OUR"                  U l        [        US5      (       d  UR                  R$                  OUR&                  U l        U R)                  5         g )NFr  rl   r  r  )rm   rn   rA  r  r]  r   rV  r  r   r   ro   image_text_hidden_sizevision_proj	text_projitm_headr  r  r  r  r  rE  r{   s     r$   rn   "BlipForImageTextRetrieval.__init__?  s    +F,@,@A)&*<*<PUV 99V%9%9%E%EvGdGde 6#5#5#A#A6C`C`a 		&"4"4"@"@!D 6#9:: ++,, 	! 6#;<< ++.. 	# 	r&   c                 6    U R                   R                  5       $ r   r  r9   s    r$   rM  .BlipForImageTextRetrieval.get_input_embeddings]  r  r&   c                 :    U R                   R                  U5        g r   r  ri  s     r$   rh  .BlipForImageTextRetrieval.set_input_embeddings`  r  r&   r   r   use_itm_headr  r   r-  r.  r   r   c	           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUS9n	U	S   n
[
        R                  " U
R                  5       SS [
        R                  S9nU(       aG  U R                  UUU
UUS9nU(       d  US   OUR                  nU R                  USS2SSS24   5      nOU R                  UUUS9nU(       d  US   OUR                  n[        U R                  U
SS2SSS24   5      SS9n[        U R                  USS2SSS24   5      SS9nXR                  5       -  nU(       d"  XS   4U	S	S -   U4-   n[!        S
 U 5       5      $ [#        UU	R                  U	R$                  U	R&                  US9$ )a  
use_itm_head (`bool`, *optional*, defaults to `True`):
    Whether or not to use the image-text matching head.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, BlipForImageTextRetrieval

>>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
>>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "an image of a cat"

>>> inputs = processor(images=image, text=text, return_tensors="pt")
>>> outputs = model(**inputs)
```
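
The `itm_score` field holds the raw two-way image-text matching logits (or, when `use_itm_head=False`, the image-text
cosine similarity instead). As a sketch, assuming index 1 is the "match" class as in the original BLIP release:

```python
>>> match_prob = outputs.itm_score.softmax(dim=1)[:, 1]
>>> cosine_score = model(**inputs, use_itm_head=False).itm_score
```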
Nrw  r   r   r   rx  )r   r  r.  r   rl   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r=   r  s     r$   r^   4BlipForImageTextRetrieval.forward.<locals>.<genexpr>  r  r3  )rM   r2   r3   r4   rO   )rf   r4  r   r-  r]  r!   r{  r   r|  r  r2   r  r	   r  r  r)   r`   rK   r3   r4   )r:   r   r   r  r  r   r-  r.  r   rr  r1   r}  rO   r   
image_feat	text_featr   s                    r$   r   !BlipForImageTextRetrieval.forwardc  s   D &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 **%/!5#%= + 
 &a(ZZ 1 1 3CR 8

K
"//#-&2'1' 0 O 9Doa0IjIjO]]?1a7#;<F"//#-' 0 O
 9Doa0IjIjO"4#3#3LAq4I#JPRSJ!$..Aq1I"JPRSI++-/Fa01N124FF/I[[GLgLLL/,>>(66%00+
 	
r&   )r  r  r  r  r  r]  r  )TNNNNF)r>   r?   r@   rA   r   r  rn   rM  rh  r   r!   r   rC   r   r   r   r   rH   r   rF   r   r   s   @r$   r  r  5  s     Lz <86 
 (,59,0/3&*).T
##T
 ''T
 tn	T

 !!1!12T
 $D>T
 'tnT
 d^T
 #'T
 
u//	0T
 T
r&   r  )rS  r
  r  r  rA  r   r  );rB   r6   dataclassesr   typingr   r   r   r   r!   torch.utils.checkpointr   torch.nn.functionalr	   activationsr   
generationr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   configuration_blipr   r   r   modeling_blip_textr   r   
get_loggerr>   rb  r   r%   r,   r.   rH   rK   rQ   Modulerd   r   r   r   r   r
  r%  rA  rS  r  r  r  __all__r=   r&   r$   <module>r     s@     ! . .    ) ! ) K - D D L L B 
		H	%`U\\ `ell `
-%,, -5<< - )k ) )X ? ? ?@ '?{ '? '?T !
 !
 !
HG299 GV% %PBBII BLbii -ryy -` "%/ "% "%JS
")) S
l;) ;| 
R
# R

R
j h#6 hhV \2O \\~ |
 3 |
|
~r&   