
"""PyTorch FLAVA model."""

import collections
import math
from collections import OrderedDict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_flava import (
    FlavaConfig,
    FlavaImageCodebookConfig,
    FlavaImageConfig,
    FlavaMultimodalConfig,
    FlavaTextConfig,
)


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
LOGIT_SCALE_CLAMP_MIN = 0
LOGIT_SCALE_CLAMP_MAX = 4.6052

FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig]


@dataclass
class FlavaModelOutput(ModelOutput):
    """
Output from FlavaModel containing embeddings and outputs from individual encoders.

Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
`text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

Args:
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
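
Example (a minimal sketch, assuming `model` is a loaded [`FlavaModel`] and `inputs` comes from a
`FlavaProcessor` carrying both text and images):

```python
>>> outputs = model(input_ids=inputs.input_ids, pixel_values=inputs.pixel_values)
>>> image_embeddings = outputs.image_embeddings
>>> multimodal_embeddings = outputs.multimodal_embeddings
>>> plain_tuple = outputs.to_tuple()  # nested encoder outputs are converted to tuples as well
```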
Nimage_embeddingsimage_outputtext_embeddingstext_outputmultimodal_embeddingsmultimodal_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r$   r"   r&   Ngetattrto_tuple).0kselfs     `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/flava/modeling_flava.py	<genexpr>,FlavaModelOutput.to_tuple.<locals>.<genexpr>R   s<      
   TTDGZabfhiZjZsZsZuu s   25tuplekeysr/   s   `r0   r,   FlavaModelOutput.to_tupleQ   s#     
YY[
 
 	
     )__name__
__module____qualname____firstlineno____doc__r!   r   torchFloatTensor__annotations__r"   r   r#   r$   r%   r&   r
   r   r,   __static_attributes__r9   r8   r0   r   r   2   s    , 59hu00189=L(56=37OXe//078<K45<9=8E$5$56=>Bx :;B
%* 
r8   r   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
\4S jrSrg)FlavaLossesX   a  Class representing pretraining losses from FLAVA model

Args:
    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
        Masked Image Modeling loss as used in BEiT calculated only for unimodal image data.
    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
        Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
        masked pairs in FLAVA.
    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
        data. This is calculated on unmasked images and texts.
    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
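
Example (a minimal sketch; in practice instances are produced by [`FlavaForPreTraining`] rather than built by
hand):

```python
>>> import torch
>>> losses = FlavaLosses(mim=torch.tensor(0.5))
>>> losses.all_none()
False
```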
Nmimmlmitmglobal_contrastive	mmm_imagemmm_textr'   c                 H    SnU R                  5        H  nUc  M  Sn  U$    U$ )NTF)values)r/   all_nonevs      r0   rN   FlavaLosses.all_nonet   s0    A} 	  r8   r9   )r:   r;   r<   r=   r>   rF   r   r?   r@   rA   rG   rH   rI   rJ   rK   boolrN   rB   r9   r8   r0   rD   rD   X   s    & (,C%##	$+'+C%##	$+'+C%##	$+6:!2!23:-1Ix))*1,0Hhu(()0$ r8   rD   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\	S'   Sr\\R                     \	S'   Sr\\   \	S'   Sr\\R                     \	S'   Sr\\   \	S	'   Sr\\R                     \	S
'   Sr\\   \	S'   Sr\\R                     \	S'   Sr\\   \	S'   Sr\\R                     \	S'   Sr\\   \	S'   Sr\\R                     \	S'   Sr\\   \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   S\ \!   4S jr"Sr#g)FlavaForPreTrainingOutput}   a  
Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
`text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

Args:
    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
        Total loss calculated for this model.
    loss_info (`FlavaLosses`):
        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
        the keys.
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].

    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
        to create masked images.
    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
        The output of the [`FlavaTextModel`].
    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The output of the [`FlavaMultimodalModel`].

    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
            The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
            returned when `bool_masked_pos` has some of the patches masked.
    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
            The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
            the tokens masked.
    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
            The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
            output is returned when `bool_masked_pos` has some of the patches masked.
    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
            The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
            some of the tokens masked.
    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
        scores. This is calculated on unmasked images and texts.
    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
        texts.
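
Example (a minimal sketch — the contrastive logits follow the same convention as CLIP, so image-to-text match
probabilities can be read off with a softmax over an `outputs` instance of this class):

```python
>>> probs_per_image = outputs.contrastive_logits_per_image.softmax(dim=1)
```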
Nloss	loss_infor!   r"   r#   r$   r%   r&   image_masked_embeddingsimage_masked_outputtext_masked_embeddingstext_masked_outputmultimodal_masked_embeddingsmultimodal_masked_output
mim_logits
mlm_logits
itm_logitscontrastive_logits_per_imagecontrastive_logits_per_textmmm_image_logitsmmm_text_logitsr'   c                 V   ^ ^ / SQm[        U U4S jT R                  5        5       5      $ )N)r$   r"   r&   rZ   rX   r\   c              3   l   >#    U  H)  oT;  a  TU   O[        TU5      R                  5       v   M+     g 7fNr*   )r-   r.   r/   transformer_outputss     r0   r1   5FlavaForPreTrainingOutput.to_tuple.<locals>.<genexpr>   s4     sgrbc)< <T!W'$PQBRB[B[B]]grs   14r3   )r/   rg   s   `@r0   r,   "FlavaForPreTrainingOutput.to_tuple   s(    
 sgkgpgpgrsssr8   r9   )$r:   r;   r<   r=   r>   rU   r   r?   r@   rA   rV   rD   r!   r"   r   r#   r$   r%   r&   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   r
   r   r,   rB   r9   r8   r0   rS   rS   }   s   >@ )-D(5$$
%,!I{!48hu00189=L(56=37OXe//078<K45<9=8E$5$56=>Bx :;B;?Xe&7&78?@D"<=D:>HU%6%67>?C!;<C@D (5+<+<"=DEIh'ABI.2J**+2.2J**+2.2J**+2@D (5+<+<"=D?C%*;*;!<C48hu001837OXe//07	t%* 	tr8   rS   c            	          ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  S	\
S
\
S\R                  4S jr  SS\R                  S\\R                     S\S\R                  4S jjrSrU =r$ )FlavaImageEmbeddings   zZ
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        use_mask_token = use_mask_token or config.mask_token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = PatchEmbeddings(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.hidden_size,
        )
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.
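
For example, with `patch_size=16`, growing the input from 224x224 (a 14x14 grid of pre-trained positions)
to 384x384 resizes the position grid bicubically to 24x24 (384 // 16 = 24).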

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the resolution check is not baked into the traced graph
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        batch_size, seq_len, _ = embeddings.size()
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # B X H X W = B X HW
            if bool_masked_pos.dim() == 3:
                bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class PatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        if not isinstance(image_size, collections.abc.Iterable):
            image_size = (image_size, image_size)
        if not isinstance(patch_size, collections.abc.Iterable):
            patch_size = (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x


class FlavaTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Use the registered all-zeros buffer when token_type_ids is not passed; this helps users
        # tracing the model without providing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class FlavaSelfAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class FlavaSelfOutput(nn.Module):
    """
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    """

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class FlavaAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.attention = FlavaSelfAttention(config)
        self.output = FlavaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class FlavaIntermediate(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class FlavaOutput(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class FlavaLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = FlavaAttention(config)
        self.intermediate = FlavaIntermediate(config)
        self.output = FlavaOutput(config)

        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class FlavaEncoder(nn.Module):
    def __init__(self, config: FlavaConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )


class FlavaPooler(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class FlavaPreTrainedModel(PreTrainedModel):
    config_class = FlavaConfig
    base_model_prefix = "flava"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, FlavaMaskedPredictionHead):
            module.bias.data.zero_()
        elif isinstance(module, FlavaImageEmbeddings):
            module.cls_token.data.zero_()
            module.position_embeddings.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
        elif isinstance(module, FlavaMultimodalModel):
            if module.use_cls_token:
                module.cls_token.data.zero_()
        elif isinstance(module, FlavaModel):
            module.logit_scale.data.fill_(self.config.logit_scale_init_value)


@auto_docstring
class FlavaImageModel(FlavaPreTrainedModel):
    config_class = FlavaImageConfig
    # This override allows loading FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints.
    base_model_prefix = "flava.image_model"
    main_input_name = "pixel_values"

    def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = FlavaImageEmbeddings(config)
        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embeddings

    def set_input_embeddings(self, value: nn.Module):
        self.embeddings.patch_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
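
Example (a minimal usage sketch, assuming the `facebook/flava-full` checkpoint — whose image tower this class
can load directly thanks to the `flava.image_model` base model prefix):

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlavaImageModel

>>> model = FlavaImageModel.from_pretrained("facebook/flava-full")
>>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```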
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class FlavaTextModel(FlavaPreTrainedModel):
    config_class = FlavaTextConfig
    base_model_prefix = "flava.text_model"

    def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = FlavaTextEmbeddings(config)
        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Module):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:
    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    [What are token type IDs?](../glossary#token-type-ids)
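
Example (a minimal usage sketch, assuming the `facebook/flava-full` checkpoint — whose text tower this class
can load directly thanks to the `flava.text_model` base model prefix):

```python
>>> from transformers import AutoTokenizer, FlavaTextModel

>>> model = FlavaTextModel.from_pretrained("facebook/flava-full")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")

>>> inputs = tokenizer(["a photo of a cat"], return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```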
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=input_ids.device)

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, input_shape, input_ids.device
        )

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class FlavaMultimodalModel(FlavaPreTrainedModel):
    config_class = FlavaMultimodalConfig
    base_model_prefix = "flava.multimodal_model"
    main_input_name = "hidden_states"

    def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.use_cls_token = self.config.use_cls_token
        if self.use_cls_token:
            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
    The concatenated hidden states of unimodal encoders.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length, _ = hidden_states.size()

        if self.use_cls_token:
            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
            seq_length += 1

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, (batch_size, seq_length), hidden_states.device
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class FlavaModel(FlavaPreTrainedModel):
    config_class = FlavaConfig

    def __init__(self, config: FlavaConfig):
        super().__init__(config)

        if not isinstance(config.text_config, FlavaTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type FlavaTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.image_config, FlavaImageConfig):
            raise TypeError(
                "config.image_config is expected to be of type FlavaImageConfig but is of type"
                f" {type(config.image_config)}."
            )

        if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
            raise TypeError(
                "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
                f"is of type {type(config.multimodal_config)}."
            )

        text_config = config.text_config
        image_config = config.image_config
        multimodal_config = config.multimodal_config

        self.projection_dim = config.projection_dim
        self.text_hidden_size = text_config.hidden_size
        self.image_hidden_size = image_config.hidden_size
        self.mm_hidden_size = multimodal_config.hidden_size

        self.text_model = FlavaTextModel(text_config)
        self.image_model = FlavaImageModel(image_config)
        self.multimodal_model = FlavaMultimodalModel(multimodal_config)

        self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
        self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
        self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)

        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:
    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    [What are token type IDs?](../glossary#token-type-ids)

Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`FlavaTextModel`].

Examples:

```python
>>> from transformers import AutoProcessor, FlavaModel

>>> model = FlavaModel.from_pretrained("{0}")
>>> processor = AutoProcessor.from_pretrained("{0}")

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
... )
>>> text_features = model.get_text_features(**inputs)
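>>> # Illustrative follow-up (not part of the original example): `text_features`
>>> # is a per-token projection, so slice the first (CLS) position and
>>> # L2-normalize it for retrieval-style scoring
>>> import torch
>>> text_embedding = torch.nn.functional.normalize(text_features[:, 0, :], dim=-1)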
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[0]  # last_hidden_state
        text_features = self.text_projection(pooled_output)

        return text_features

    def get_image_features(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`FlavaImageModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlavaModel

>>> model = FlavaModel.from_pretrained("{0}")
>>> processor = AutoProcessor.from_pretrained("{0}")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
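>>> # Illustrative follow-up (not part of the original example): slice the first
>>> # (CLS) position and L2-normalize it, mirroring how the contrastive head
>>> # consumes these features
>>> import torch
>>> image_embedding = torch.nn.functional.normalize(image_features[:, 0, :], dim=-1)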
        ```"""
        image_outputs = self.image_model(
            pixel_values=pixel_values,
            bool_masked_pos=bool_masked_pos,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = image_outputs[0]  # last_hidden_state
        image_features = self.image_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_multimodal_encoder: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FlavaModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:
    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    [What are token type IDs?](../glossary#token-type-ids)
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
skip_multimodal_encoder (*bool*, *optional*):
    Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
    Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
    - 1 for pixel values that are real (i.e., **not masked**),
    - 0 for pixel values that are padding (i.e., **masked**).

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlavaModel

>>> model = FlavaModel.from_pretrained("facebook/flava-full")
>>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

>>> outputs = model(**inputs)

>>> image_embeddings = outputs.image_embeddings
>>> text_embeddings = outputs.text_embeddings
>>> multimodal_embeddings = outputs.multimodal_embeddings

>>> outputs.image_embeddings.shape
torch.Size([1, 197, 768])

>>> text_embeddings.shape
torch.Size([1, 7, 768])

>>> multimodal_embeddings.shape
torch.Size([1, 205, 768])
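>>> # Illustrative follow-up (not part of the original example): project the
>>> # pooled (CLS) states the same way the pretraining contrastive head does
>>> image_vec = model.image_projection(image_embeddings[:, 0, :])
>>> text_vec = model.text_projection(text_embeddings[:, 0, :])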
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        if not output_hidden_states:
            raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`")

        image_embeddings = None
        image_states = None
        image_mm_projection = None
        image_output = None
        if pixel_values is not None:
            image_output = self.image_model(
                pixel_values=pixel_values,
                bool_masked_pos=bool_masked_pos,
                attention_mask=image_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            image_embeddings, image_states = image_output[0], image_output[1]
            # Note that these states don't use final layernorm in the transformer model
            image_mm_projection = self.image_to_mm_projection(image_states[-1])

        text_embeddings = None
        text_states = None
        text_mm_projection = None
        text_output = None
        if input_ids is not None:
            text_output = self.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                token_type_ids=token_type_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            text_embeddings, text_states = text_output[0], text_output[1]
            # Note that these states don't use final layernorm in the transformer model
            text_mm_projection = self.text_to_mm_projection(text_states[-1])

        multimodal_embeddings = None
        multimodal_output = None
        if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder:
            if attention_mask is not None:
                batch_size, seq_len, _ = image_mm_projection.shape
                if self.multimodal_model.use_cls_token:
                    seq_len += 1
                attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device)
                attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1)
            else:
                attention_multimodal = None
            multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1)
            multimodal_output = self.multimodal_model(
                multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict
            )
            multimodal_embeddings = multimodal_output[0]

        if not return_dict:
            return (
                image_embeddings,
                image_output,
                text_embeddings,
                text_output,
                multimodal_embeddings,
                multimodal_output,
            )

        return FlavaModelOutput(
            image_embeddings=image_embeddings,
            image_output=image_output,
            text_embeddings=text_embeddings,
            text_output=text_output,
            multimodal_embeddings=multimodal_embeddings,
            multimodal_output=multimodal_output,
        )
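# A minimal sketch (illustrative, with hypothetical input sizes) of how the DALL-E
# style codebook classes below tokenize an image. Three of the four layer groups
# end in a 2x2 max-pool, so spatial resolution drops by 8x overall:
#
#   codebook = FlavaImageCodebook.from_pretrained(_CHECKPOINT_FOR_CODEBOOK_DOC)
#   pixels = torch.randn(1, 3, 112, 112)         # (batch, channels, H, W)
#   ids = codebook.get_codebook_indices(pixels)  # -> 14x14 grid, i.e. 196 token ids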
class FlavaImageCodebookResPath(nn.Module):
    def __init__(self, in_size: int, out_size: int, **kwargs):
        super().__init__()
        hid_size = out_size // 4

        path = OrderedDict()
        path["relu_1"] = nn.ReLU()
        path["conv_1"] = nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1)
        path["relu_2"] = nn.ReLU()
        path["conv_2"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_3"] = nn.ReLU()
        path["conv_3"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_4"] = nn.ReLU()
        path["conv_4"] = nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0)

        self.path = nn.Sequential(path)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.path(x)
$ )
FlavaImageCodebookBlockiJ  r  r	  
num_layersc                    > [         TU ]  5         SUS-  -  U l        X:w  a  [        R                  " XSSS9U l        O[        R                  " 5       U l        [        X5      U l        g )Nr   r   r   r  )	rt   ru   	post_gainr   r   id_pathIdentityr  res_path)r/   r  r	  r#  r  r   s        r0   ru    FlavaImageCodebookBlock.__init__K  sQ    j!m,99WAqQDL;;=DL1'Dr8   r   r'   c                 b    U R                  U5      U R                  U R                  U5      -  -   $ rf   r&  r%  r(  r  s     r0   r   FlavaImageCodebookBlock.forwardW  s'    ||A$--2B!BBBr8   r+  r   r   s   @r0   r"  r"  J  sE    
E 
Es 
E 
EC C%,, C Cr8   r"  c                   ~   ^  \ rS rSrSS\S\S\S\S\4
U 4S jjjrS\R                  S	\R                  4S
 jr	Sr
U =r$ )FlavaImageCodebookLayerGroupi[  
num_blocksr#  r  r	  use_poolc                 0  > [         TU ]  5         [        5       n[        U5       H5  nUS:X  a  [	        X4U5      USUS-    3'   M   [	        XDU5      USUS-    3'   M7     U(       a  [
        R                  " SS9US'   [
        R                  " U5      U l        g )Nr   block_r   r   )r   pool)	rt   ru   r   r]  r"  r   	MaxPool2dr  group)	r/   r/  r#  r  r	  r0  blocksrp  r   s	           r0   ru   %FlavaImageCodebookLayerGroup.__init__\  s    z"AAv+B7V`+aAw'(+B8Wa+bAw'(	 # \\a8F6N]]6*
r8   r   r'   c                 $    U R                  U5      $ rf   r5  r  s     r0   r   $FlavaImageCodebookLayerGroup.forwardj  s    zz!}r8   r9  r  )r:   r;   r<   r=   r   rQ   ru   r?   r   r   rB   r   r   s   @r0   r.  r.  [  sR    +3 +C +# +QT +`d + + %,,  r8   r.  a"  
    The FLAVA's image codebook model, inspired by DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    """
)
class FlavaImageCodebook(FlavaPreTrainedModel):
    base_model_prefix = ""
    config_class = FlavaImageCodebookConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False

    def __init__(self, config: FlavaImageCodebookConfig, **kwargs: Any):
        super().__init__(config)

        self.config = config
        self.num_groups = config.num_groups
        self.input_channels = config.input_channels
        self.num_blocks_per_group = config.num_blocks_per_group
        self.hidden_size = config.hidden_size
        self.vocab_size = config.vocab_size

        num_layers = self.num_groups * self.num_blocks_per_group

        output_blocks = OrderedDict()
        output_blocks["relu"] = nn.ReLU()
        output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)

        blocks = OrderedDict()
        blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
        blocks["group_1"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
        )
        blocks["group_2"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
        )
        blocks["group_3"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
        )
        blocks["group_4"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
        )
        blocks["output"] = nn.Sequential(output_blocks)

        self.blocks = nn.Sequential(blocks)

        self.post_init()

        if self.config.freeze:
            for param in self.parameters():
                param.requires_grad = False

    def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```
        r   )axis)format_CHECKPOINT_FOR_CODEBOOK_DOCr6  r?   argmaxr/   r   z_logitss      r0   get_codebook_indices'FlavaImageCodebook.get_codebook_indices  s3    	. F/0;;|,||H1--r8   c                 X    U R                  U5      n[        R                  " SS9" U5      $ )Nr   r   )r6  r   SoftmaxrW  s      r0   get_codebook_probs%FlavaImageCodebook.get_codebook_probs  s$    ;;|,zza **r8   c                 <   SR                  [        5        [        UR                  5      S:w  a  [	        SUR                   S35      eUR                  S   U R
                  :w  a(  [	        SUR                  S    SU R
                   35      eU R                  U5      $ )Na  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```
        r  zinput shape z
 is not 4dr   z
input has z channels but model built for )rT  rU  r/  r   r   rJ  r6  )r/   r   s     r0   r   FlavaImageCodebook.forward  s    	4 F/0|!!"a'|L,>,>+?zJKKa D$7$77z,*<*<Q*?)@@^_c_r_r^stuu{{<((r8   )r6  rm   ry   rJ  rK  rI  r   )r:   r;   r<   r=   r  r   r  r  r  r   ru   r?   r   rY  r]  r@   r   rB   r   r   s   @r0   r=  r=  o  s     +L$O&+#*,(*, *,X. .%,, .8+u|| + + )E$5$5  )%,,  )  )r8   r=  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FlavaPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g )Nr   )rt   ru   r   r   ry   r  r   r=  r>  r   transform_act_fnr   r   r   s     r0   ru   %FlavaPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr8   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rf   )r  rd  r   rC  s     r0   r   $FlavaPredictionHeadTransform.forward  s4    

=1--m<}5r8   )r   r  rd  r:   r;   r<   r=   ru   r   rB   r   r   s   @r0   rb  rb    s    U r8   rb  c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )r  i  c                 z  > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  SS9U l	        [
        R                  " [        R                  " UR                  5      5      U l        Ub  X R                  l        U R                  U R                  l        g )NFr   )rt   ru   rm   rb  	transformr   r   ry   r   decoderrw   r?   rx   r   r  )r/   rm   r  r   s      r0   ru   "FlavaMaskedPredictionHead.__init__  s    5f=yy!3!3V5F5FUSLLV->->!?@	"(LL !IIr8   c                 :    U R                   U R                  l         g rf   )r   rl  r6   s    r0   _tie_weights&FlavaMaskedPredictionHead._tie_weights	  s     IIr8   c                 J    U R                  U5      nU R                  U5      nU$ rf   )rk  rl  r  s     r0   r   !FlavaMaskedPredictionHead.forward  s"    NN1LLOr8   )r   rm   rl  rk  rf   )	r:   r;   r<   r=   ru   ro  r   rB   r   r   s   @r0   r  r    s    
&& r8   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FlavaITMHeadi  c                    > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  S5      U l        g )Nr   )	rt   ru   rm   rv  r  r   r   ry   seq_relationshipr   s     r0   ru   FlavaITMHead.__init__  s8    !&) "		&*<*<a @r8   c                 J    U R                  U5      nU R                  U5      nU$ rf   )r  rv  r  s     r0   r   FlavaITMHead.forward  s$    KKN!!!$r8   )rm   r  rv  rh  r   s   @r0   rt  rt    s    A r8   rt  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FlavaGlobalContrastiveHeadi  c                 P   > [         TU ]  5         Xl        UR                  U l        g rf   )rt   ru   rm   global_backprop_contrastiver   s     r0   ru   #FlavaGlobalContrastiveHead.__init__   s!    +1+M+M(r8   c                 D   [         R                  " U5      n[         R                  R                  5       (       a#  [         R                  R	                  5       (       d6  [         R
                  " UR                  S5      UR                  S9nU/nU/nGOUR                  S5      n[         R                  R                  5       n	U R                  (       ag  [         R                  R                  R                  R                  U5      n[         R                  R                  R                  R                  U5      nO[        U	5       V
s/ s H  n
[         R                  " U5      PM     nn
[        U	5       V
s/ s H  n
[         R                  " U5      PM     nn
[         R                  R                  Xa5        [         R                  R                  Xr5        U[         R                  R                  5       -  [         R
                  " XR                  S9-   n[         R                   " U5      n[         R                   " U5      n[         R"                  " XR%                  SS5      5      U-  n[         R"                  " X&R%                  SS5      5      U-  nXU4$ s  sn
f s  sn
f )Nr   r  r   )r?   expdistributedis_availableis_initializedr   r   r   get_world_sizer}  r   r   
all_gatherr]  
zeros_likeget_rankr   r
  r   )r/   r!   r#   r  temperaturelabelsimage_embeddings_alltext_embeddings_alllocal_batch_size
world_sizer   logits_per_imagelogits_per_texts                r0   r   "FlavaGlobalContrastiveHead.forward%  s   ii,  --//u7H7H7W7W7Y7Y\\"2"7"7":CSCZCZ[F$4#5 #2"3/44Q7**99;J// (-'8'8';';'F'F'Q'QRb'c$&+&7&7&:&:&E&E&P&PQ`&a#SXYcSd'eSda(8(8(ISd$'eSXYcSd&eSdau'7'78H'ISd#&e!!,,-AT!!,,-@R%(9(9(B(B(DDu|| )@)@H F  %yy)=>#ii(;< <<(8:W:WXY[\:]^all,,8V8VWXZ[8\]`kk&88 (f&es    J6 J)rm   r}  rh  r   s   @r0   r{  r{    s    N
9 9r8   r{  zk
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    """
)
class FlavaForPreTraining(FlavaPreTrainedModel):
    # Those are linked to the decoder biases of the masked heads
    _tied_weights_keys = [
        "mmm_text_head.decoder.bias",
        "mmm_image_head.decoder.bias",
        "mlm_head.decoder.bias",
        "mim_head.decoder.bias",
    ]

    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
        r"""
        image_codebook ([`nn.Module`]):
            If passed, the image codebook will be set to this. Otherwise, it will be initialized using the
            image_codebook_config defined in the config first as the first parameter.
        """
        super().__init__(config)
        self.flava = FlavaModel(config)

        self.image_codebook = image_codebook
        if self.image_codebook is None and config.init_codebook:
            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)

        # Leverage the text and image encoder configs to create the masked heads,
        # since they carry the right vocab sizes
        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
        self.itm_head = FlavaITMHead(config)
        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)

        self.image_vocab_size = config.image_config.vocab_size
        self.text_vocab_size = config.text_config.vocab_size
        self.mlm_weight = config.mlm_weight
        self.mim_weight = config.mim_weight
        self.global_contrastive_weight = config.global_contrastive_weight
        self.ce_ignore_index = config.ce_ignore_index
        self.itm_weight = config.itm_weight
        self.mmm_image_weight = config.mmm_image_weight
        self.mmm_text_weight = config.mmm_text_weight
        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder

        self.post_init()

    def _resize_to_2d(self, x: torch.Tensor):
        if x.dim() > 2:
            x = x.view(x.size(0), -1)
        return x

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_ids_masked: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        codebook_pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_unmasked_multimodal_encoder: Optional[bool] = None,
        mlm_labels: Optional[torch.Tensor] = None,
        mim_labels: Optional[torch.Tensor] = None,
        itm_labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], FlavaForPreTrainingOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)
input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
    Indices of input sequence tokens in the vocabulary. These are the masked version of `input_ids`, to be
    used for the MLM task. Indices can be obtained using [`AutoTokenizer`] along with
    [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:
    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    [What are token type IDs?](../glossary#token-type-ids)
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*):
    Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
    in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
    [What are attention masks?](../glossary#attention-mask)
skip_unmasked_multimodal_encoder (`bool`, *optional*):
    Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
    multimodal embeddings or outputs as of now.
mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
    Labels for computing the masked language modeling (MLM) and multimodal masked modeling text loss.
    Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
    indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
    ..., text_config.vocab_size - 1]`.
mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
    Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
    image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
    computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
    generated automatically using the image codebook assigned to the model. By default, it uses
    [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
    Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
    The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
return_loss (`bool`, *optional*, defaults to `None`):
    Whether to return calculated loss or not.
codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
    Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.

Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import FlavaForPreTraining, AutoProcessor

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
>>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

>>> text = ["a photo of a cat"]

>>> inputs = processor(
...     images=[image],
...     text=text,
...     return_masks=True,
...     return_codebook_pixels=True,
...     padding=True,
...     max_length=77,
...     return_tensors="pt",
... )


>>> output = model(**inputs)
```
Nz`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if you are doing inference on unmasked text...T)
r   r   r  r   r   r  r  r  rb  rc  )	r   r   r  r   r  r   r  rb  rc  z`return_loss` is set to True but the image codebook is not initialized and no `mim_labels`  have been passed. Reinstantiate the model with `init_codebook` set to True or pass in your custom `mim_labels`z`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. Call `AutoProcessor` with `return_codebook_pixels` set to Truer   r   r   r   r   )rF   rG   rH   rI   rJ   rK   c              3   .   #    U  H  ob  UOSv   M     g 7fr|  r9   )r-   rU   s     r0   r1   .FlavaForPreTraining.forward.<locals>.<genexpr>  s     _I^%5T1<I^s   c              3   .   #    U  H  ob  M  Uv   M     g 7frf   r9   )r-   r   s     r0   r1   r    s     8FqFrg  rU   rV   r!   r"   r#   r$   r%   r&   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   r9   ):rm   r  r  r  loggerwarningr  r!   r#   r%   r  RuntimeErrorr   rY  r  r  r  ner   r  r   r   cross_entropyr   r  r  r  r  r  r  r?   whereanynewr  r  r  r  r  r  	normalizer  r  r  clamp_LOGIT_SCALE_CLAMP_MINLOGIT_SCALE_CLAMP_MAXr  rD   rN   sumrM   r"   r,   r$   r&   r4   rS   )6r/   r   r  r   r  r  r   r   r   r  r  r  r  r  r  rb  rc  r  flava_outputflava_masked_outputpos_maskr!   r#   rW   rY   r[   
total_lossmim_lossmlm_lossmmm_text_lossmmm_image_lossgc_lossitm_lossr]   r^   rc   rb   r_   r  r  sequence_for_imagemasked_tokensmim_labels_filteredsequence_for_textmlm_labels_filtered	pos_pairs	end_indextext_embeddingimage_embedding	gc_labelsgc_loss_imagegc_loss_textflava_lossesr)  s6                                                         r0   r   FlavaForPreTraining.forward}  s
   ~ &1%<k$++B]B]%0%<k$++BYBY 0; -66 	) #	(=NN?
  )zz%))%!5 %E/!5 " 
  #jj&%))!5+/!5 ) 

 '88&66"5"F"F!4!D!D':'P'P$aee
eXee=e>eGV^GKK
KZK/4D:>>
>% #.2N2Z!k&&.&; 
 )0$Y  "00EEF[\
 ??Q#:#FKgKo!8%!//
;
"&"4"4_"E7;7K7K
--d34%7JOOA<N;N;PRS8S%T" *d.B.B C&0&?#%7q8H%I"!]]+=>
!}}::"D,A,ABDWD\D\]_D` H /H!]]+=>
 ??Q#9#EJfJn 6%!//
;
$5a*//!:L9L9NPQ6Q$R! *d.B.B C&0&?#$5mQ6F$G!!]]+<=
!}}::"D,@,@ACVC[C[\^C_ H /H!]]+<=
 ??Q#?#K'CDJ%&MM!,	 ;;y}}	9==RVQWCXY!}}:::zRH/H/;3OPX3Y0)!+H!5J)!+H!5J&5h&?O (38M8MPQ8Q!=/44Q7!;I!3Aq1y=7H!4K!L%!//
;
"&"4"4_"E7;7K7K
--d34 *d.B.B C&0&?#%7q8H%I"#'#6#67I#J %']]%@%@(--b$2G2GHJ]JbJbceJf&N #d&;&;;N#'#6#67I#J  (38L8Lq8P < 1!6L6Q6QRS6T5T5VXY2Y Z%!//
;
 *d.B.B C&0&?#$5mQ6F$G!"&"4"45F"G$&MM$?$?',,R1E1EFH[H`H`acHd%M "T%9%99M"&"4"45F"G 'O,GDLjLjmnLn!ZZ771a8PQN]]44^4LN"jj99:J1aQR7:STO mm55o25NOJJ""''../DF[\;?;W;W1G1G<8oy
 ##3H#= "1(";%h/	 " ; ;<Li X!}}::?IV(<71<4999"&$"
 |4466_I\I\I^__J 8D8Q8Q8]))224cg7C7O7O7[((113ae22=I=[=[=g..779mq'?R?_?_?k#0099;qu&>Q>]>]>i#//88:os,&88D $55>>@   +F. <#8#8#:#:   8F888( 

"
 .
 &22	

 ,
 %00
 #/"D"D
 +<<
 %<
 !4 @ @
 $:
  3>>
 *F
 &9%J%J
 "
  "!
" "#
$ *:%
& )8'
( .)
* ,+
 	
r8   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rf   )NNNNNNNNNNNNNNTNN)r:   r;   r<   r=   _tied_weights_keysr   r   r   r  ru   r?   r   r  r   r  r@   rQ   r   r
   rS   r   rB   r   r   s   @r0   r  r  G  s   !{ !HRYY<O ! !Fu|| 
  157;48=A151526377;;?-1-1-1,0%)&*&*%k
E,,-k
 #5#3#34k
 u001	k

  ((9(9:k
 !.k
 !.k
 "%,,/k
 u//0k
 'u||4k
 +34.k
 U\\*k
 U\\*k
 U\\*k
 $D>k
  #!k
" d^#k
$ d^%k
& 
uU\\"$==	>'k
 k
r8   r  )r  r=  r  r  r  r  r  )Jr>   r   r  r   dataclassesr   typingr   r   r   r   r	   r
   r   r?   torch.utils.checkpointr   activationsr   modeling_outputsr   r   modeling_utilsr   r   r   utilsr   r   r   r   configuration_flavar   r   r   r   r   
get_loggerr:   r  rU  r  r  r  r   rD   rS   r  rk   r{   r   r   r  r&  r:  rG  rM  rZ  rv  r  r  r  r  r  r  r"  r.  r=  rb  r  rt  r{  r  __all__r9   r8   r0   <module>r     s      # ! ? ? ?    ! K c c D D  
		H	%>   _.>@UUV  "
{ "
 "
J !+ ! !H `t `t `tJ_299 _H!bii !H6")) 6r@ @Fbii $'RYY 'T		 ""))  + +\0
299 0
f"))  N? N ND ]
* ]
 ]
@ m
) m
 m
` \
/ \
 \
~ h
% h
 h
V			 *Cbii C"299 ( r)- r)r)j299 "		 ,
299 
%9 %9P 
]
. ]

]
@r8   
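# A minimal training-step sketch (illustrative only; assumes the processor inputs
# from the `FlavaForPreTraining.forward` docstring above and a configured optimizer):
#
#   inputs = processor(images=[image], text=["a photo of a cat"], return_masks=True,
#                      return_codebook_pixels=True, padding="max_length",
#                      max_length=77, return_tensors="pt")
#   outputs = model(**inputs, return_loss=True)
#   outputs.loss.backward()   # `loss` is the weighted sum of the individual losses
#   optimizer.step()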