
    fTh                        S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	J
r
  SSKrSSKJr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJrJrJr  SSKJr  \R6                  " \5      r " S S\R<                  5      r\ " S S\5      5       r \ " S S\5      5       r!\ " S S\5      5       r"S r# " S S\R<                  5      r$ " S S\R<                  5      r% " S S\R<                  5      r& " S S\R<                  5      r' " S S\R<                  5      r( " S  S!\R<                  5      r) " S" S#\R<                  5      r* " S$ S%\R<                  5      r+ " S& S'\R<                  5      r, " S( S)\R<                  5      r- " S* S+\R<                  5      r. " S, S-\R<                  5      r/ " S. S/\R<                  5      r0 " S0 S1\R<                  5      r1 " S2 S3\R<                  5      r2 " S4 S5\R<                  5      r3 " S6 S7\R<                  5      r4\ " S8 S9\5      5       r5\ " S: S;\55      5       r6\ " S< S=\55      5       r7\" S>S?9 " S@ SA\55      5       r8/ SBQr9g)CzPyTorch LXMERT model.    N)	dataclass)DictOptionalTupleUnion)nn)CrossEntropyLossSmoothL1Loss   )ACT2FNgelu)PreTrainedModel)ModelOutputauto_docstringlogging   )LxmertConfigc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )GeLU$   c                 "   > [         TU ]  5         g N)super__init__)self	__class__s    b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/lxmert/modeling_lxmert.pyr   GeLU.__init__%   s        c                     [        U5      $ r   )r   )r   xs     r   forwardGeLU.forward(   s    Awr    __name__
__module____qualname____firstlineno__r   r"   __static_attributes____classcell__r   s   @r   r   r   $   s     r   r   c                   x   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Srg)LxmertModelOutput,   a
  
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder")


Args:
    language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the language encoder.
    vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the visual encoder.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
        by a Linear layer and a Tanh activation function. The Linear
    language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
Nlanguage_outputvision_outputpooled_outputlanguage_hidden_statesvision_hidden_stateslanguage_attentionsvision_attentionscross_encoder_attentionsr$   )r&   r'   r(   r)   __doc__r0   r   torchFloatTensor__annotations__r1   r2   r3   r   r4   r5   r6   r7   r*   r$   r   r   r.   r.   ,   s     D 48OXe//0715M8E--.515M8E--.5AEHU5+<+<%=>E?C(5):):#;<C>B%(9(9":;B<@xe&7&7 89@CGhuU->->'?@Gr   r.   c                   P   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Srg) LxmertForQuestionAnsweringOutputZ   aQ	  
Output type of [`LxmertForQuestionAnswering`].

Args:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.k.
    question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
        Prediction scores of question answering objective (classification).
    language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
Nlossquestion_answering_scorer3   r4   r5   r6   r7   r$   )r&   r'   r(   r)   r8   r?   r   r9   r:   r;   r@   r3   r   r4   r5   r6   r7   r*   r$   r   r   r=   r=   Z   s    : )-D(5$$
%,<@hu'8'89@AEHU5+<+<%=>E?C(5):):#;<C>B%(9(9":;B<@xe&7&7 89@CGhuU->->'?@Gr   r=   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)LxmertForPreTrainingOutput   a
  
Output type of [`LxmertForPreTraining`].

Args:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the textual matching objective (classification) head (scores of True/False
        continuation before SoftMax).
    question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
        Prediction scores of question answering objective (classification).
    language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
    language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.

Nr?   prediction_logitscross_relationship_scorer@   r3   r4   r5   r6   r7   r$   )r&   r'   r(   r)   r8   r?   r   r9   r:   r;   rD   rE   r@   r3   r   r4   r5   r6   r7   r*   r$   r   r   rB   rB      s    !F )-D(5$$
%,59x 1 129<@hu'8'89@<@hu'8'89@AEHU5+<+<%=>E?C(5):):#;<C>B%(9(9":;B<@xe&7&7 89@CGhuU->->'?@Gr   rB   c           	      <    SSK nSSKnSSKn[        R                  R                  U5      n[        R                  SU 35        UR                  R                  U5      n/ n/ n	U H]  u  p[        R                  SU
 SU 35        UR                  R                  Xj5      nUR                  U
5        U	R                  U5        M_     [        X5       GH  u  pU
R                  S5      n
[!        S U
 5       5      (       a)  [        R                  S	SR#                  U
5       35        MW  U nU
 H  nUR%                  S
U5      (       a  UR                  SU5      nOU/nUS   S:X  d	  US   S:X  a  ['        US5      nOZUS   S:X  d	  US   S:X  a  ['        US5      nO;US   S:X  a  ['        US5      nO%US   S:X  a  ['        US5      nO ['        XS   5      n[+        U5      S:  d  M  [-        US   5      nUU   nM     WSS S:X  a  ['        US5      nOUS:X  a  UR/                  U5      n UR0                  UR0                  :X  d   e [        R                  SU
 35        [6        R8                  " U5      Ul        GM     U $ ! [         a    [        R                  S5        e f = f! [(         a,    [        R                  S	SR#                  U
5       35         GM  f = f! [2         a1  nU=R4                  UR0                  UR0                  4-  sl        e SnAff = f)z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c              3   ,   #    U  H
  nUS ;   v   M     g7f))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepNr$   ).0ns     r   	<genexpr>,load_tf_weights_in_lxmert.<locals>.<genexpr>   s%      

   s   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r   i_embeddingszInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargsr9   
from_numpydata)modelconfigtf_checkpoint_pathr]   nptftf_path	init_varsnamesarraysnameru   arraypointerm_namescope_namesnumes                     r   load_tf_weights_in_lxmertr      s   
 ggoo01G
KK8	BC''0IEF (l5'BC&&w5Te	 ! 5)zz#  

 

 

 

 KK)CHHTN#345F||,f55 hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84Q7*!'<8%g1~>G ;1$+a.)!#,+ , #$<=(gx0GxLL'E	==EKK/// 	078''.e *f LK  Q	
 	h & KK)CHHTN+; <=  	FFw}}ekk22F	s5   J J',K !J$'1KK 
L*,LLc                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )LxmertEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l
        [        R                  " UR
                  SS9U l        [        R                  " UR                  5      U l        g )Nr   )padding_idx-q=eps)r   r   r   	Embedding
vocab_sizehidden_sizeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormDropouthidden_dropout_probdropoutr   r{   r   s     r   r   LxmertEmbeddings.__init__  s    !||F,=,=v?Q?Q_`a#%<<0N0NPVPbPbpq#r %'\\&2H2H&J\J\jk%l" f&8&8eDzz&"<"<=r   c                 J   Ub  UR                  5       nUR                  nOUR                  5       S S nUR                  nUS   n[        R                  " U[        R                  US9nUR                  S5      R                  U5      nUc8  [        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      nU R                  U5      n	X8-   U	-   n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr   dtypedevicer   )sizer   r9   arangelong	unsqueezeexpandzerosposition_idsr   r   r   r   r   )r   	input_idstoken_type_idsinputs_embedsinput_shaper   
seq_lengthr   r   r   
embeddingss              r   r"   LxmertEmbeddings.forward  s	    #..*K%%F',,.s3K"))F ^
||JejjP#--a077D!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
r   )r   r   r   r   r   NN)	r&   r'   r(   r)   r8   r   r"   r*   r+   r,   s   @r   r   r     s    Q	> r   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )LxmertAttentioni)  c                   > [         TU ]  5         UR                  UR                  -  S:w  a&  [	        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        Uc  UR                  n[        R                  " UR                  U R                  5      U l
        [        R                  " X R                  5      U l        [        R                  " X R                  5      U l        [        R                  " UR                  5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())r   r   r   num_attention_heads
ValueErrorrs   attention_head_size	head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   )r   r{   ctx_dimr   s      r   r   LxmertAttention.__init__*  s    : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W 11D4L4LL ?((GYYv114>>B
99Wnn5YYw7
zz&"E"EFr   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr   r   r[   r   r   )r   r   r   viewpermute)r   r!   new_x_shapes      r   transpose_for_scores$LxmertAttention.transpose_for_scores>  sT    ffhsm$$$$'
 
 FF;yyAq!$$r   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      n	U R                  U5      n
[        R
                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUb  X-   n[        R                  R                  USS9nU R                  U5      n[        R
                  " X5      nUR                  SSSS5      R                  5       nUR!                  5       S S U R"                  4-   nUR%                  U5      nU(       a  X4nU$ U4nU$ )Nr   )dimr   r[   r   r   )r   r   r   r   r9   matmulrt   mathsqrtr   r   
functionalsoftmaxr   r   
contiguousr   r   r   )r   hidden_statescontextattention_maskoutput_attentionsmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r   r"   LxmertAttention.forwardF  sS    JJ}5((7+ JJw///0AB--o>	//0AB !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t~~>O"O%**+BC6G=2 O\M]r   )r   r   r   r   r   r   r   r   NF)	r&   r'   r(   r)   r   r   r"   r*   r+   r,   s   @r   r   r   )  s    G(% r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertAttentionOutputif  c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  SS9U l        [        R                  " UR                  5      U l	        g Nr   r   )
r   r   r   r   r   denser   r   r   r   r   s     r   r   LxmertAttentionOutput.__init__g  sZ    YYv1163E3EF
f&8&8eDzz&"<"<=r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   r   r   r   input_tensors      r   r"   LxmertAttentionOutput.forwardm  5    

=1]3}'CDr   r   r   r   r%   r,   s   @r   r   r   f      > r   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )LxmertCrossAttentionLayerit  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r   r   r   attr   outputr   s     r   r   "LxmertCrossAttentionLayer.__init__u  s&    "6*+F3r   c                     U R                  XX4S9nU(       a  US   nU R                  US   U5      nU(       a  UW4nU$ U4nU$ Nr   r   r   r   r   )	r   r   
ctx_tensorctx_att_maskr   r   r   attention_outputr   s	            r   r"   !LxmertCrossAttentionLayer.forwardz  sX    ,Lf$QiO;;vay,?9J#_5 RbPcr   r   r   r%   r,   s   @r   r   r   t  s    4
 r   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )LxmertSelfAttentionLayeri  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r   r   r   r   r   r   r   s     r   r   !LxmertSelfAttentionLayer.__init__  s&    #F+	+F3r   c                     U R                  UUUUS9nU(       a  US   nU R                  US   U5      nU(       a  UW4nU$ U4nU$ r   )r   r   )r   r   r   r   r   r   r   r   s           r   r"    LxmertSelfAttentionLayer.forward  sg    /	  
 $QiO;;vay,?9J#_5 RbPcr   )r   r   Fr%   r,   s   @r   r   r     s    4
 r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertIntermediatei  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r   r   r   r   r   intermediate_sizer   r   
hidden_actintermediate_act_fnr   s     r   r   LxmertIntermediate.__init__  s?    YYv1163K3KL
#)&*;*;#< r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r  r   r   s     r   r"   LxmertIntermediate.forward  s&    

=100?r   r  r%   r,   s   @r   r  r    s    =
 r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertOutputi  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  SS9U l        [        R                  " UR                  5      U l
        g r   )r   r   r   r   r	  r   r   r   r   r   r   r   s     r   r   LxmertOutput.__init__  sZ    YYv779K9KL
f&8&8eDzz&"<"<=r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r   r"   LxmertOutput.forward  r   r   r   r%   r,   s   @r   r  r    r   r   r  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )LxmertLayeri  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        U5      U l        g r   )r   r   r   	attentionr  intermediater  r   r   s     r   r   LxmertLayer.__init__  s3    1&9.v6"6*r   c                     U R                  XUS9nUS   nU R                  U5      nU R                  Xe5      nU4USS  -   nU$ )Nr   r   r   r  r  r   )r   r   r   r   r   r   intermediate_outputlayer_outputs           r   r"   LxmertLayer.forward  sW    ..Rc.d"1:"//0@A{{#6I/GABK/r   r  r   r%   r,   s   @r   r  r    s    + r   r  c                   L   ^  \ rS rSrU 4S jr SS jrS rS r SS jrSr	U =r
$ )	LxmertXLayeri  c                   > [         TU ]  5         [        U5      U l        [	        U5      U l        [	        U5      U l        [        U5      U l        [        U5      U l
        [        U5      U l        [        U5      U l        g r   )r   r   r   visual_attentionr   lang_self_attvisn_self_attr  
lang_interr  lang_output
visn_intervisn_outputr   s     r   r   LxmertXLayer.__init__  sk     9& A 6f=5f= -V4'/,V4'/r   c                 P    U R                  UUUUS9nU R                  UUUSS9nXg4$ )N)r   r   F)r%  )r   
lang_inputlang_attention_maskvisual_inputvisual_attention_maskoutput_x_attentionslang_att_outputvisual_att_outputs           r   	cross_attLxmertXLayer.cross_att  sT     //.1	 0 
 !11,#	 2 
 11r   c                 V    U R                  XSS9nU R                  X4SS9nUS   US   4$ )NFr   r   )r&  r'  )r   r.  r/  r0  r1  r3  r4  s          r   self_attLxmertXLayer.self_att  sE    ,,Z`e,f ..|fk.lq!#4Q#777r   c                     U R                  U5      nU R                  U5      nU R                  X15      nU R                  XB5      nXV4$ r   )r(  r*  r)  r+  )r   r.  r0  lang_inter_outputvisual_inter_outputr)  visual_outputs          r   	output_fcLxmertXLayer.output_fc  sM     OOJ7"ool; &&'8E(()<K))r   c                     U R                  UUUUUS9u  pgUSS  nU R                  US   UUS   U5      u  pgU R                  Xg5      u  pU(       a  U	U
US   4$ X4$ )N)r.  r/  r0  r1  r2  r   r   )r5  r8  r>  )r   
lang_featsr/  visual_featsr1  r   r3  r4  r   r)  r=  s              r   r"   LxmertXLayer.forward  s     .2^^! 3%"7 1 .< .
* *!"--1]]Aa !	.
* &*^^O%W" !	 "	
 -	
r   )r(  r)  r&  r*  r+  r'  r%  r  )r&   r'   r(   r)   r   r5  r8  r>  r"   r*   r+   r,   s   @r   r#  r#    s+    0* "2.8	*"   
  
r   r#  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertVisualFeatureEncoderi  c                   > [         TU ]  5         UR                  nUR                  n[        R
                  " X!R                  5      U l        [        R                  " UR                  SS9U l	        [        R
                  " X1R                  5      U l
        [        R                  " UR                  SS9U l        [        R                  " UR                  5      U l        g r   )r   r   visual_feat_dimvisual_pos_dimr   r   r   visn_fcr   visn_layer_normbox_fcbox_layer_normr   r   r   )r   r{   feat_dimpos_dimr   s       r   r   #LxmertVisualFeatureEncoder.__init__  s    ))'' yy+=+=>!||F,>,>EJ ii););< ll6+=+=5Izz&"<"<=r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX4-   S-  nU R	                  U5      nU$ Nr[   )rI  rJ  rK  rL  r   )r   rB  
visual_posr!   yr   s         r   r"   "LxmertVisualFeatureEncoder.forward-  s\    LL&  #KK
#"%1f%r   )rK  rL  r   rI  rJ  r%   r,   s   @r   rE  rE    s    > r   rE  c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )LxmertEncoderi8  c                   > [         TU ]  5         [        U5      U l        Xl        UR
                  U l        UR                  U l        UR                  U l
        [        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        R                  " [        U R                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l	        g s  snf s  snf s  snf r   )r   r   rE  rI  r{   l_layersnum_l_layersx_layersnum_x_layersr_layersnum_r_layersr   
ModuleListranger  layerr#  )r   r{   _r   s      r   r   LxmertEncoder.__init__9  s     2&9 #OO"OO"OO ]]tGXGXAY#ZAYAK$7AY#Z[
U4K\K\E]&^E]|F';E]&^_E$J[J[D\&]D\q{6':D\&]^ $[&^&]s    D2D7D<c           	         SnSnU(       d  U R                   R                  (       a  SOS n	U(       d  U R                   R                  (       a  SOS n
U(       d  U R                   R                  (       a  SOS nU R                  X45      nU R                   H!  nU" XUS9nUS   nX4-   nU
c  M  XS   4-   n
M#     U R                   H!  nU" X5US9nUS   nXs4-   nU	c  M  XS   4-   n	M#     U R
                   H+  nU" UUUUUS9nUS S u  pXs4-   nX4-   nUc  M#  XS   4-   nM-     UU(       a  U	OS 4nUU(       a  U
OS 4nUUU(       a  U4$ S 4$ )Nr$   r   r   r   r[   )r{   r   rI  r`  r\  rZ  )r   rA  r/  rB  rR  r1  r   r4   r3   r6   r5   r7   layer_module	l_outputs	v_outputs	x_outputsvisual_encoder_outputslang_encoder_outputss                     r   r"   LxmertEncoder.forwardK  s     "!#"3t{{7T7TBZ^$59V9Vb\`):dkk>[>[2ae ||L= !JJL$ZXijI"1J%;m%K"".&9q\O&K# ' !MML$\\mnI$Q<L#7/#I  ,$51$G! * !MML$#%"3I (1!}$J#7/#I %;m%K"'3+CQR|o+U( * !!2"

 ##4$ 

 # (9$
 	
 @D
 	
r   )r{   r`  rY  r]  r[  r\  rI  rZ  r   r%   r,   s   @r   rV  rV  8  s    _0 #;
 ;
r   rV  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertPooleri  c                    > [         [        U ]  5         [        R                  " UR
                  UR
                  5      U l        [        R                  " 5       U l        g r   )	r   rl  r   r   r   r   r   Tanh
activationr   s     r   r   LxmertPooler.__init__  s;    lD*,YYv1163E3EF
'')r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   ro  )r   r   first_token_tensorr2   s       r   r"   LxmertPooler.forward  s6     +1a40

#566r   )ro  r   r%   r,   s   @r   rl  rl    s    $
 r   rl  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertPredictionHeadTransformi  c                   > [         [        U ]  5         [        R                  " UR
                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  SS9U l
        g r   )r   ru  r   r   r   r   r   r   r
  transform_act_fnr   r   s     r   r   &LxmertPredictionHeadTransform.__init__  sZ    +T;=YYv1163E3EF
 &v'8'8 9f&8&8eDr   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rw  r   r  s     r   r"   %LxmertPredictionHeadTransform.forward  s4    

=1--m<}5r   )r   r   rw  r%   r,   s   @r   ru  ru    s    E r   ru  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertLMPredictionHeadi  c                 X  > [         [        U ]  5         [        U5      U l        [
        R                  " UR                  S5      UR                  S5      SS9U l        X R                  l	        [
        R                  " [        R                  " UR                  S5      5      5      U l        g )Nr   r   FrW   )r   r|  r   ru  	transformr   r   r   decoderrT   	Parameterr9   r   rW   r   r{   lxmert_model_embedding_weightsr   s      r   r   LxmertLMPredictionHead.__init__  s    $d466v> yy*//2*//2

 =LL-K-P-PQR-S!TU	r   c                 d    U R                  U5      nU R                  U5      U R                  -   nU$ r   )r  r  rW   r  s     r   r"   LxmertLMPredictionHead.forward  s-    }5]3dii?r   )rW   r  r  r%   r,   s   @r   r|  r|    s    V r   r|  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertVisualAnswerHeadi  c           	        > [         TU ]  5         UR                  n[        R                  " [        R
                  " X3S-  5      [        5       [        R                  " US-  SS9[        R
                  " US-  U5      5      U l        g )Nr[   r   r   )	r   r   r   r   
Sequentialr   r   r   logit_fc)r   r{   
num_labelshid_dimr   s       r   r   LxmertVisualAnswerHead.__init__  sb    $$IIg{+FLL1%0IIgk:.	
r   c                 $    U R                  U5      $ r   r  r  s     r   r"   LxmertVisualAnswerHead.forward  s    }}]++r   r  r%   r,   s   @r   r  r    s    
, ,r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertVisualObjHeadi  c                   > [         TU ]  5         [        U5      U l        0 nUR                  (       a  SUR
                  S.US'   UR                  (       a  SUR                  S.US'   UR                  (       a  SUR                  4UR                  S.US'   X l
        [        R                  " U R                   Vs0 s H4  o3[        R                  " UR                  U R                  U   S   5      _M6     sn5      U l        g s  snf )Nr   )ru   r   objattrr   featr   )r   r   ru  r  visual_obj_lossnum_object_labelsvisual_attr_lossnum_attr_labelsvisual_feat_lossrG  visual_lossesr   
ModuleDictr   r   decoder_dict)r   r{   r  r   r   s       r   r   LxmertVisualObjHead.__init__  s    6v>!!-26;S;S#TM% "".3F<R<R$SM&!""f445--%M&! + MM[_[m[mn[mTW"))F..0B0B30G0NOO[mn
ns   ;;Dc                     U R                  U5      n0 nU R                   H  nU R                  U   " U5      X#'   M     U$ r   )r  r  r  )r   r   r   r   s       r   r"   LxmertVisualObjHead.forward  sA    }5%%C++C0?FK &r   )r  r  r  r%   r,   s   @r   r  r    s    
, r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )LxmertPreTrainingHeadsi  c                    > [         [        U ]  5         [        X5      U l        [
        R                  " UR                  S5      U l        g rQ  )	r   r  r   r|  predictionsr   r   r   seq_relationshipr  s      r   r   LxmertPreTrainingHeads.__init__  s7    $d461&Y "		&*<*<a @r   c                 L    U R                  U5      nU R                  U5      nX44$ r   r  r  )r   sequence_outputr2   prediction_scoresseq_relationship_scores        r   r"   LxmertPreTrainingHeads.forward  s-     ,,_=!%!6!6}!E 88r   r  r%   r,   s   @r   r  r    s    A
9 9r   r  c                   *    \ rS rSr\r\rSrSr	S r
Srg)LxmertPreTrainedModeli  lxmertFc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weights        )meanstdN      ?)
isinstancer   r   rT   ry   normal_r{   initializer_rangerW   zero_r   r   r   fill_r|  )r   modules     r   _init_weights#LxmertPreTrainedModel._init_weights  s3   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) 677KK""$ 8r   r$   N)r&   r'   r(   r)   r   config_classr   load_tf_weightsbase_model_prefix!_supports_param_buffer_assignmentr  r*   r$   r   r   r  r    s    L/O (-%%r   r  c                   x  ^  \ rS rSrU 4S jrS rS r\          SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\   S\\   S\\   S\\\\	R                     4   4S jj5       rSrU =r$ )LxmertModeli  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  5         g r   )	r   r   r   r   rV  encoderrl  pooler	post_initr   s     r   r   LxmertModel.__init__  s>     *62$V,"6*r   c                 .    U R                   R                  $ r   r   r   r   s    r   get_input_embeddings LxmertModel.get_input_embeddings  s    ...r   c                 $    XR                   l        g r   r  )r   new_embeddingss     r   set_input_embeddings LxmertModel.set_input_embeddings  s    *8'r   r   rB  rR  r   r1  r   r   r   output_hidden_statesreturn_dictreturnc                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[	        S5      eUc  [	        S5      eUc  [	        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U[        R                  US9nUR                  S	5      R                  S
5      nUR                  U R                  S9nSU-
  [        R                  " U R                  5      R                   -  nUbj  UR                  S	5      R                  S
5      nUR                  U R                  S9nSU-
  [        R                  " U R                  5      R                   -  nOSnU R#                  XU5      nU R%                  UUUUUUS9nUSS
 u  nnUS   nUS   nSnU(       a  US	   nUS	   nUS
   nUUU4nU	(       a  UU4OSnUS   nUS   nU R'                  U5      nU
(       d  UUU4U-   U-   $ [)        UUUU	(       a  UOSU	(       a  UOSU(       a  WOSU(       a  WOSU(       a  WS9$ SS9$ )a  
visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
    This input represents visual features. They ROI pooled object features from bounding boxes using a
    faster-RCNN model)

    These are currently not provided by the transformers library.
visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
    This input represents spatial features corresponding to their relative (via index) visual features. The
    pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
    1.

    These are currently not provided by the transformers library.
visual_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsz`visual_feats` cannot be `None`z`visual_pos` cannot be `None`r   r   r   r[   )r   r  )rB  rR  r1  r   r   r$   )r2   r0   r1   r3   r4   r5   r6   r7   )r{   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr   r   r9   onesr   r   r   tor   finfominr   r  r  r.   )r   r   rB  rR  r   r1  r   r   r   r  r  r   r   extended_attention_maskextended_visual_attention_maskembedding_outputencoder_outputsrh  ri  r4   r3   all_attentionsr5   r6   r7   r   r=  r)  r2   s                                r   r"   LxmertModel.forward  s,   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU>??<==%.%:!!@T@T!"ZZCN!"[[EJJvVN #1":":1"="G"G"J #:"<"<4::"<"N#&)@#@EKKPTPZPZD[D_D_"_ !,-B-L-LQ-O-Y-YZ[-\*-K-N-NUYU_U_-N-`*.14R.RV[VaVabfblblVmVqVq-q*-1*  ??9mT ,,#%!"@/ ' 
 8Gr7J4 45a8!5a!8"6q"9 6q 9'6q'9$#!(N K_/1EFdf,R0,R0K0>NQ___ '''=Q#9W[9M!5SW7H 3d3D/$AR%=	
 		
 Y]	
 		
r   )r   r  r  )
NNNNNNNNNN)r&   r'   r(   r)   r   r  r  r   r   r9   
LongTensorr:   boolr   r.   r   r"   r*   r+   r,   s   @r   r  r    s.   /9  1548266:=A5959,0/3&*B
E,,-B
 u001B
 U../	B

 !!2!23B
  ((9(9:B
 !!1!12B
   1 12B
 $D>B
 'tnB
 d^B
 
 %(9(9"::	;B
 B
r   r  c            #         ^  \ rS rSrS/rU 4S jrS r S S\S\\   S\	S\
R                  4U 4S	 jjjrS\4S
 jrS rS rS\
R                   4S jrS rS r\              S!S\\R,                     S\\R.                     S\\R.                     S\\R.                     S\\R.                     S\\R,                     S\\R.                     S\\R,                     S\\\\\R.                  \R.                  4   4      S\\R,                     S\\R6                     S\\	   S\\	   S\\	   S\\\\R.                     4   4S jj5       rSrU =r $ )"LxmertForPreTrainingi  zcls.predictions.decoder.weightc                   > [         TU ]  U5        Xl        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l        [        U5      U l
        [        XR                  R                  R                  R                  5      U l        U R                  (       a  [!        U5      U l        U R                  (       a  [%        XR                  5      U l        U R)                  5         [+        SS9[-        SS9[-        5       S.U l        0 nUR0                  (       a  SUR2                  SS.US'   UR4                  (       a  SUR6                  SS.US'   UR8                  (       a  S	UR:                  4UR:                  S
S.US'   X l        g )Nnone)	reduction)l2	visual_cecer  r  )ru   r   r?   r  r  r   r  r  )r   r   r{   num_qa_labelsvisual_loss_normalizertask_mask_lmtask_obj_predicttask_matchedtask_qar  r  r  r   r   rT   clsr  obj_predict_headr  answer_headr  r
   r	   	loss_fctsr  r  r  r  r  rG  r  )r   r{   r  r   s      r   r   LxmertForPreTraining.__init__  s    #11&,&C&C# #// & 7 7"//~~ "&) *&++2H2H2X2X2_2_`  $7$?D!<<5f>P>PQD 	 0)F;"$
 !!//#$M% 
 ""--#%M&!
 ""f445--%M&!
 +r   c                     U R                   R                  R                  R                  U R                  R
                  R                  l        g r   )r  r   r   rT   r  r  r  r  s    r   _tie_weights!LxmertForPreTraining._tie_weights  s0    .2kk.D.D.T.T.[.[$$+r   new_num_tokenspad_to_multiple_ofmean_resizingr  c                    > [         TU ]  XU5      nU R                  U R                  R                  R
                  U5      U R                  R                  l        U$ r   )r   resize_token_embeddings_resize_biasr  r  rW   )r   r  r  r  r  r   s        r   r  ,LxmertForPreTraining.resize_token_embeddings  sM     8]jk$($5$5dhh6J6J6O6OQ_$`!r   c                     UR                   S   nX#::  a  US U nO8[        R                  " X#-
  UR                  S9n[        R                  " X/5      n[
        R                  " U5      nU$ )Nr   r  )ru   r9   r   r   catr   r  )r   rW   r  old_num_tokensnew_bias
extra_biass         r   r  !LxmertForPreTraining._resize_bias  s\    A+O^,H^%DT[[YJyy$!34H<<)r   c                     U R                  5       nUb  Uc  gU R                  U5      nXR                  l        Xl        U$ a  
Build a resized question answering linear layer Module from a provided new linear layer. Increasing the size
will add newly initialized weights. Reducing the size will remove weights from the end

Args:
    num_labels (`int`, *optional*):
        New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
        weights at the end. Reducing the size will remove weights from the end. If not provided or `None`, just
        returns a pointer to the qa labels ``torch.nn.Linear``` module of the model without doing anything.

Return:
    `torch.nn.Linear`: Pointer to the resized Linear layer or the old Linear layer
Nget_qa_logit_layer_resize_qa_labelsr{   r  r   r  cur_qa_logit_layernew_qa_logit_layers       r   resize_num_qa_labels)LxmertForPreTraining.resize_num_qa_labels  I     "446!3!;!33J?$.!'!!r   c                     U R                  5       nU R                  X!5      nU R                  U5        U R                  5       $ r   r  _get_resized_qa_labels_set_qa_logit_layerr  s       r   r  &LxmertForPreTraining._resize_qa_labels  A    !446!889KX  !34&&((r   c                 X    [        U S5      (       a  U R                  R                  S   $ g)z
Returns the linear layer that produces question answering logits.

Returns:
    `nn.Module`: A torch module mapping the question answering prediction hidden states or `None` if LXMERT
    does not have a visual answering head.
r  r   Nhasattrr  r  r  s    r   r  'LxmertForPreTraining.get_qa_logit_layer  s-     4''##,,R00 (r   c                 4    XR                   R                  S'   g Nr   r  r  r   qa_logit_layers     r   r  (LxmertForPreTraining._set_qa_logit_layer      (6!!"%r   c                 @   Uc  U$ UR                   R                  5       u  p4X2:X  a  U$ [        USS 5      b  [        R                  " XB5      nO[        R                  " XBSS9nUR                  UR                   R                  5        U R                  U5        [        X25      nUR                   R                  S U2S S 24   UR                   R                  S U2S S 24'   [        USS 5      b0  UR                  R                  S U UR                  R                  S U& U$ NrW   Fr~  rT   r   rp   r   r   r  r   r  r  ry   rW   r   r  r  cur_qa_labels
hidden_dimr  num_labels_to_copys          r   r  +LxmertForPreTraining._get_resized_qa_labels!  #   %%$6$=$=$B$B$D!&%% %vt4@!#:!B!#:!N077>>? 	-. !;ASAZAZA_A_`sas`suv`vAw!!&&':(:':A'=>%vt4@@R@W@W@\@\]p^p@q##(()<*<=!!r   r   rB  rR  r   r1  r   r   labels
obj_labelsmatched_labelansr   r  r  c                    SU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOU R                  R
                  nUb  UR                  OUR                  nU R                  UUUUUUUUUUS9
nUS   US   US   nnnU R                  UU5      u  nnU R                  (       a  U R                  U5      nOUS   S   nUc  U
c  U	c  Uc  SO[        R                  " SUS	9nUb_  U R                  (       aN  U R                  S
   " UR                  SU R                  R                   5      UR                  S5      5      nUU-  nU
bK  U R"                  (       a:  U R                  S
   " UR                  SS5      U
R                  S5      5      nUU-  nU	Gb  U R$                  (       Ga  [        R                  " SUR                  S	9nU R'                  U5      nU R(                  R+                  5        H  u  nnU	U   u  nn US   n!US   n"US   n#U R,                  n$U R                  U"   n%UU   n&U%" U&R                  SU!5      UR                  U#5      5      n'U'R/                  5       S:  a  U'R1                  S5      n'U'U R                  S5      -  R1                  5       U$-  n'UU'-  nM     UU-  nUbU  U R                  (       aD  U R                  S
   " UR                  SU R2                  5      UR                  S5      5      n(UU(-  nU(       d  UUU4USS -   n)Ub  U4U)-   $ U)$ [5        UUUUUR6                  UR8                  UR:                  UR<                  UR>                  S9	$ )a  
visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
    This input represents visual features. They ROI pooled object features from bounding boxes using a
    faster-RCNN model)

    These are currently not provided by the transformers library.
visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
    This input represents spatial features corresponding to their relative (via index) visual features. The
    pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
    1.

    These are currently not provided by the transformers library.
visual_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
obj_labels (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*):
    each key is named after each one of the visual losses and each element of the tuple is of the shape
    `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and
    the label score respectively
matched_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the whether or not the text input matches the image (classification) loss. Input
    should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

    - 0 indicates that the sentence does not match the image,
    - 1 indicates that the sentence does match the image.
ans (`Torch.Tensor` of shape `(batch_size)`, *optional*):
    a one hot representation hof the correct answer *optional*
masked_lm_labelszlThe `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.N
r   rB  rR  r   r   r1  r   r  r   r  r   r   r[   r  r  r  r   r   r?   ru   r   )	r?   rD   rE   r@   r3   r4   r5   r6   r7   ) warningswarnFutureWarningpopr{   r  r   r  r  r  r  r9   tensorr  r  r   r   r  r  r  r  itemsr  r   r  r  rB   r3   r4   r5   r6   r7   )*r   r   rB  rR  r   r1  r   r   r/  r0  r1  r2  r   r  r  kwargsr   lxmert_outputr)  r=  r2   lang_prediction_scoresrE   answer_score
total_lossmasked_lm_lossmatched_losstotal_visual_lossvisual_prediction_scores_dictr   key_infolabel	mask_conf
output_dimloss_fct_namelabel_shaperT   visual_loss_fctvisual_prediction_scoresvisual_lossanswer_lossr   s*                                             r   r"   LxmertForPreTraining.forward<  s   p 'MM
 ZZ 23F%0%<k$++B]B]%.%:!!@T@T%!))"7'!5/# $ 
 !!! %2]
 <@88KQ^;_8 8<<++M:L(+A.L =#8Z=OTWT_ c&1 	
 $"3"3!^^D1&++B0F0FGBN .(J$):):>>$/0H0M0MbRS0TVcVhVhikVlmL,&J!d&;&;&; %S9I9I J,0,A,A-,P)!%!3!3!9!9!;X#-c? y%e_
 ( 0&w/44"&.."?+H+M(-,11"jAJJ{+ ??$q("-"2"21"5K*Y^^B-??EEG&P![0! "<  ++J?t||...|/@/@TEWEW/XZ]ZbZbceZfgK+%J&( ab!	"F
 0:/EZMF*Q6Q)4%=%1#0#G#G!.!C!C - A A+==%2%K%K

 
	
r   )r  r  r{   r  r  r  r  r  r  r  r  r  r  )NT)NNNNNNNNNNNNNN)!r&   r'   r(   r)   _tied_weights_keysr   r  rs   r   r  r   r   r  r  r  r  Moduler  r  r  r   r9   r  r:   r   strr   Tensorr   rB   r"   r*   r+   r,   s   @r   r  r    s   :;5+n\ dh!7?}\`	  "0)	1BII 	17"6  1548266:=A5959-1W[48&*,0/3&*R
E,,-R
 u001R
 U../	R

 !!2!23R
  ((9(9:R
 !!1!12R
   1 12R
 ))*R
 T#uU->->@Q@Q-Q'R"RSTR
   0 01R
 ell#R
 $D>R
 'tnR
 d^R
" 
)51B1B+CC	D#R
 R
r   r  zR
    Lxmert Model with a visual-answering head on top for downstream QA tasks
    )custom_introc                     ^  \ rS rSrU 4S jrS rS rS\R                  4S jr	S r
S r\           SS	\\R                     S
\\R                      S\\R                      S\\R                      S\\R                      S\\R                     S\\R                      S\\R"                     S\\   S\\   S\\   S\\\\R                      4   4S jj5       rSrU =r$ )LxmertForQuestionAnsweringi  c                   > [         TU ]  U5        Xl        UR                  U l        UR                  U l        [        U5      U l        [        XR                  5      U l        U R                  5         [        5       U l        g r   )r   r   r{   r  r  r  r  r  r  r  r	   r?   r   s     r   r   #LxmertForQuestionAnswering.__init__  sj     #11&,&C&C# "&)1&:L:LM 	 %&	r   c                     U R                  5       nUb  Uc  gU R                  U5      nXR                  l        Xl        U$ r  r  r  s       r   r  /LxmertForQuestionAnswering.resize_num_qa_labels  r  r   c                     U R                  5       nU R                  X!5      nU R                  U5        U R                  5       $ r   r  r  s       r   r  ,LxmertForQuestionAnswering._resize_qa_labels  r  r   r  c                 X    [        U S5      (       a  U R                  R                  S   $ g)z
Returns the linear layer that produces question answering logits

Returns:
    `nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A NoneType
    object if Lxmert does not have the visual answering head.
r  r   Nr  r  s    r   r  -LxmertForQuestionAnswering.get_qa_logit_layer	  s-     4''##,,R00 (r   c                 4    XR                   R                  S'   g r   r!  r"  s     r   r  .LxmertForQuestionAnswering._set_qa_logit_layer  r%  r   c                 @   Uc  U$ UR                   R                  5       u  p4X2:X  a  U$ [        USS 5      b  [        R                  " XB5      nO[        R                  " XBSS9nUR                  UR                   R                  5        U R                  U5        [        X25      nUR                   R                  S U2S S 24   UR                   R                  S U2S S 24'   [        USS 5      b0  UR                  R                  S U UR                  R                  S U& U$ r'  r(  r)  s          r   r  1LxmertForQuestionAnswering._get_resized_qa_labels  r.  r   r   rB  rR  r   r1  r   r   r/  r   r  r  c                    Ub  UOU R                   R                  nU R                  UUUUUUUU
U	US9
nUS   nU R                  U5      nSnUb;  U R	                  UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  S9$ )ao  
visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
    This input represents visual features. They ROI pooled object features from bounding boxes using a
    faster-RCNN model)

    These are currently not provided by the transformers library.
visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
    This input represents spatial features corresponding to their relative (via index) visual features. The
    pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
    1.

    These are currently not provided by the transformers library.
visual_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
labels (`Torch.Tensor` of shape `(batch_size)`, *optional*):
    A one-hot representation of the correct answer
Nr5  r[   r   r   )r?   r@   r3   r4   r5   r6   r7   )r{   r  r  r  r?   r   r  r=   r3   r4   r5   r6   r7   )r   r   rB  rR  r   r1  r   r   r/  r   r  r  r=  r2   r?  r?   r   s                    r   r"   "LxmertForQuestionAnswering.forward3  s   J &1%<k$++B]B]%!))"7'!5/# $ 
 &a(''699\..r43E3EFTVXD"_}QR'88F'+'7D7V#CVC/%1#0#G#G!.!C!C - A A+==%2%K%K
 	
r   )r  r{   r?   r  r  r  )NNNNNNNNNNN)r&   r'   r(   r)   r   r  r  r   rQ  r  r  r  r   r   r9   r  r:   rS  r  r   r=   r   r"   r*   r+   r,   s   @r   rV  rV    s_   '&"0)
1BII 
17"6  1548266:=A5959)-,0/3&*E
E,,-E
 u001E
 U../	E

 !!2!23E
  ((9(9:E
 !!1!12E
   1 12E
 &E
 $D>E
 'tnE
 d^E
 
/u7H7H1II	JE
 E
r   rV  )rV  r  rV  r  r  rE  r#  ):r8   r   rc   r6  dataclassesr   typingr   r   r   r   r9   r   torch.nnr	   r
   activationsr   r   modeling_utilsr   utilsr   r   r   configuration_lxmertr   
get_loggerr&   ra   rQ  r   r.   r=   rB   r   r   r   r   r   r   r  r  r  r#  rE  rV  rl  ru  r|  r  r  r  r  r  r  rV  __all__r$   r   r   <module>rn     s^     	  ! / /   3 ' - 9 9 . 
		H	%299  *H *H *HZ $H{ $H $HN ,H ,H ,H^L^%ryy %P:bii :zBII 		 ryy *	 	299 "))  X
299 X
v 6N
BII N
b299 BII RYY *,RYY ,")) >	9RYY 	9 %O % %2 R
' R
 R
j i
0 i
 i
X	 
b
!6 b

b
Jr   