
"""PyTorch LUKE model."""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_luke import LukeConfig


logger = logging.get_logger(__name__)


@dataclass
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
Base class for outputs of the LUKE model.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) further processed by a
        Linear layer and a Tanh activation function.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length +
        entity_length, sequence_length + entity_length)`. Attentions weights after the attention softmax, used to
        compute the weighted average in the self-attention heads.
Nentity_last_hidden_state.entity_hidden_states __name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   __static_attributes__r       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/luke/modeling_luke.pyr   r   %   @    2 =Ahu'8'89@DH(5):):C)?#@AHr'   r   c                   t    \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Srg)BaseLukeModelOutputD   a  
Base class for model's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    entity_last_hidden_state: Optional[torch.FloatTensor] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LukeMaskedLMOutput(ModelOutput):
    r"""
Base class for model's outputs, with potential hidden states and attentions.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        The sum of masked language modeling (MLM) loss and entity prediction loss.
    mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked language modeling (MLM) loss.
    mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked entity prediction (MEP) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentionsr   )r   r   r    r!   r"   r0   r   r#   r$   r%   r1   r2   r3   r4   r5   r   r   r6   r&   r   r'   r(   r.   r.   c   s    > )-D(5$$
%,,0Hhu(()0,0Hhu(()0*.FHU&&'.15M8E--.58<M8E%"3"345<DH(5):):C)?#@AH:>Ju00#567>r'   r.   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)EntityClassificationOutput   a-  
Outputs of entity classification models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class EntityPairClassificationOutput(ModelOutput):
    r"""
Outputs of entity pair classification models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class EntitySpanClassificationOutput(ModelOutput):
    r"""
Outputs of entity span classification models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
        Classification scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LukeSequenceClassifierOutput(ModelOutput):
    r"""
Outputs of sentence classification models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LukeTokenClassifierOutput(ModelOutput):
    r"""
Base class for outputs of token classification models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LukeQuestionAnsweringModelOutput(ModelOutput):
    r"""
Outputs of question answering models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Span-end scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: Optional[torch.FloatTensor] = None
    end_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class LukeMultipleChoiceModelOutput(ModelOutput):
    r"""
Outputs of multiple choice models.

Args:
    loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

        Classification scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class LukeEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
class LukeEntityEmbeddings(nn.Module):
    def __init__(self, config: LukeConfig):
        super().__init__()
        self.config = config

        self.entity_embeddings = nn.Embedding(config.entity_vocab_size, config.entity_emb_size, padding_idx=0)
        if config.entity_emb_size != config.hidden_size:
            self.entity_embedding_dense = nn.Linear(config.entity_emb_size, config.hidden_size, bias=False)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        entity_ids: torch.LongTensor,
        position_ids: torch.LongTensor,
        token_type_ids: Optional[torch.LongTensor] = None,
    ):
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(entity_ids)

        entity_embeddings = self.entity_embeddings(entity_ids)
        if self.config.entity_emb_size != self.config.hidden_size:
            entity_embeddings = self.entity_embedding_dense(entity_embeddings)

        position_embeddings = self.position_embeddings(position_ids.clamp(min=0))
        position_embedding_mask = (position_ids != -1).type_as(position_embeddings).unsqueeze(-1)
        position_embeddings = position_embeddings * position_embedding_mask
        position_embeddings = torch.sum(position_embeddings, dim=-2)
        position_embeddings = position_embeddings / position_embedding_mask.sum(dim=-2).clamp(min=1e-7)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = entity_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


class LukeSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads"
                f" {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.use_entity_aware_attention = config.use_entity_aware_attention

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        if self.use_entity_aware_attention:
            self.w2e_query = nn.Linear(config.hidden_size, self.all_head_size)
            self.e2w_query = nn.Linear(config.hidden_size, self.all_head_size)
            self.e2e_query = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        word_hidden_states,
        entity_hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        word_size = word_hidden_states.size(1)

        if entity_hidden_states is None:
            concat_hidden_states = word_hidden_states
        else:
            concat_hidden_states = torch.cat([word_hidden_states, entity_hidden_states], dim=1)

        key_layer = self.transpose_for_scores(self.key(concat_hidden_states))
        value_layer = self.transpose_for_scores(self.value(concat_hidden_states))

        if self.use_entity_aware_attention and entity_hidden_states is not None:
            # compute word-to-word (w2w), word-to-entity (w2e), entity-to-word (e2w) and entity-to-entity (e2e)
            # query states
            w2w_query_layer = self.transpose_for_scores(self.query(word_hidden_states))
            w2e_query_layer = self.transpose_for_scores(self.w2e_query(word_hidden_states))
            e2w_query_layer = self.transpose_for_scores(self.e2w_query(entity_hidden_states))
            e2e_query_layer = self.transpose_for_scores(self.e2e_query(entity_hidden_states))

            # compute the corresponding key states
            w2w_key_layer = key_layer[:, :, :word_size, :]
            e2w_key_layer = key_layer[:, :, :word_size, :]
            w2e_key_layer = key_layer[:, :, word_size:, :]
            e2e_key_layer = key_layer[:, :, word_size:, :]

            # compute attention scores based on the dot product between the query and key vectors
            w2w_attention_scores = torch.matmul(w2w_query_layer, w2w_key_layer.transpose(-1, -2))
            w2e_attention_scores = torch.matmul(w2e_query_layer, w2e_key_layer.transpose(-1, -2))
            e2w_attention_scores = torch.matmul(e2w_query_layer, e2w_key_layer.transpose(-1, -2))
            e2e_attention_scores = torch.matmul(e2e_query_layer, e2e_key_layer.transpose(-1, -2))

            # combine the attention scores into the final attention score matrix
            word_attention_scores = torch.cat([w2w_attention_scores, w2e_attention_scores], dim=3)
            entity_attention_scores = torch.cat([e2w_attention_scores, e2e_attention_scores], dim=3)
            attention_scores = torch.cat([word_attention_scores, entity_attention_scores], dim=2)
        else:
            query_layer = self.transpose_for_scores(self.query(concat_hidden_states))
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the LukeModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        output_word_hidden_states = context_layer[:, :word_size, :]
        if entity_hidden_states is None:
            output_entity_hidden_states = None
        else:
            output_entity_hidden_states = context_layer[:, word_size:, :]

        if output_attentions:
            outputs = (output_word_hidden_states, output_entity_hidden_states, attention_probs)
        else:
            outputs = (output_word_hidden_states, output_entity_hidden_states)

        return outputs


class LukeSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class LukeAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = LukeSelfAttention(config)
        self.output = LukeSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        raise NotImplementedError("LUKE does not support the pruning of attention heads")

    def forward(
        self,
        word_hidden_states,
        entity_hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        word_size = word_hidden_states.size(1)
        self_outputs = self.self(
            word_hidden_states,
            entity_hidden_states,
            attention_mask,
            head_mask,
            output_attentions,
        )
        if entity_hidden_states is None:
            concat_self_outputs = self_outputs[0]
            concat_hidden_states = word_hidden_states
        else:
            concat_self_outputs = torch.cat(self_outputs[:2], dim=1)
            concat_hidden_states = torch.cat([word_hidden_states, entity_hidden_states], dim=1)

        attention_output = self.output(concat_self_outputs, concat_hidden_states)

        word_attention_output = attention_output[:, :word_size, :]
        if entity_hidden_states is None:
            entity_attention_output = None
        else:
            entity_attention_output = attention_output[:, word_size:, :]

        # add attentions if we output them
        outputs = (word_attention_output, entity_attention_output) + self_outputs[2:]

        return outputs


class LukeIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class LukeOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class LukeLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = LukeAttention(config)
        self.intermediate = LukeIntermediate(config)
        self.output = LukeOutput(config)

    def forward(
        self,
        word_hidden_states,
        entity_hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        word_size = word_hidden_states.size(1)

        # self attention
        self_attention_outputs = self.attention(
            word_hidden_states,
            entity_hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        if entity_hidden_states is None:
            concat_attention_output = self_attention_outputs[0]
        else:
            concat_attention_output = torch.cat(self_attention_outputs[:2], dim=1)

        outputs = self_attention_outputs[2:]  # add self attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, concat_attention_output
        )
        word_layer_output = layer_output[:, :word_size, :]
        if entity_hidden_states is None:
            entity_layer_output = None
        else:
            entity_layer_output = layer_output[:, word_size:, :]

        outputs = (word_layer_output, entity_layer_output) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class LukeEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([LukeLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        word_hidden_states,
        entity_hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_word_hidden_states = () if output_hidden_states else None
        all_entity_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
                all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    word_hidden_states,
                    entity_hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    word_hidden_states,
                    entity_hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )

            word_hidden_states = layer_outputs[0]

            if entity_hidden_states is not None:
                entity_hidden_states = layer_outputs[1]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_word_hidden_states = all_word_hidden_states + (word_hidden_states,)
            all_entity_hidden_states = all_entity_hidden_states + (entity_hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    word_hidden_states,
                    all_word_hidden_states,
                    all_self_attentions,
                    entity_hidden_states,
                    all_entity_hidden_states,
                ]
                if v is not None
            )
        return BaseLukeModelOutput(
            last_hidden_state=word_hidden_states,
            hidden_states=all_word_hidden_states,
            attentions=all_self_attentions,
            entity_last_hidden_state=entity_hidden_states,
            entity_hidden_states=all_entity_hidden_states,
        )


class LukePooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class EntityPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.entity_emb_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.entity_emb_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class EntityPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transform = EntityPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.entity_emb_size, config.entity_vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.entity_vocab_size))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias

        return hidden_states


@auto_docstring
class LukePreTrainedModel(PreTrainedModel):
    config_class = LukeConfig
    base_model_prefix = "luke"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LukeAttention", "LukeEntityEmbeddings"]

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            if module.embedding_dim == 1:  # embedding for bias parameters
                module.weight.data.zero_()
            else:
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring(
    custom_intro="""
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    specific head on top.
    """
)
class LukeModel(LukePreTrainedModel):
    def __init__(self, config: LukeConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = LukeEmbeddings(config)
        self.entity_embeddings = LukeEntityEmbeddings(config)
        self.encoder = LukeEncoder(config)

        self.pooler = LukePooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def get_entity_embeddings(self):
        return self.entity_embeddings.entity_embeddings

    def set_entity_embeddings(self, value):
        self.entity_embeddings.entity_embeddings = value

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError("LUKE does not support the pruning of attention heads")

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        entity_ids: Optional[torch.LongTensor] = None,
        entity_attention_mask: Optional[torch.FloatTensor] = None,
        entity_token_type_ids: Optional[torch.LongTensor] = None,
        entity_position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseLukeModelOutputWithPooling]:
        r"""
  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeModel

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
# Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

>>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
>>> outputs = model(**encoding)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
# Input Wikipedia entities to obtain enriched contextualized representations of word tokens

>>> text = "Beyoncé lives in Los Angeles."
>>> entities = [
...     "Beyoncé",
...     "Los Angeles",
... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
>>> entity_spans = [
...     (0, 7),
...     (17, 28),
... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

>>> encoding = tokenizer(
...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
... )
>>> outputs = model(**encoding)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timerk   z5You have to specify either input_ids or inputs_embeds)rn   rl   r   )rv   ru   rw   rx   )r   r   r   rJ  rK  r   )rD  pooler_outputr5   r6   r   r   )rg   r   rJ  use_return_dictr   %warn_if_padding_and_no_attention_maskrr   rn   r#   onesrs   rt   get_head_maskr6  rz   get_extended_attention_maskr   r  r  r   r5   r6   r   r   )rf   rv   r   rw   ru   r   r  r  r  r   rx   r   rJ  rK  ry   
batch_size
seq_lengthrn   entity_seq_lengthword_embedding_outputextended_attention_maskentity_embedding_outputencoder_outputssequence_outputr]  s                            r(   r{   LukeModel.forward  sz   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZZ(@PN!"[[EJJvVN! * 2$,(-

J@Q3R[a(b%$,(-ZAR4S[`[e[ent(u% &&y++2O2OP	 !%%)'	 !0 !
 #'"B"B>"i &*#&*&<&<Z^s&t# ,,!#2/!5# ' 
 *!, 9=8OO4UY#]3oab6III--')77&11%4%M%M!0!E!E
 	
r'   word_attention_maskc                    UnUb  [         R                  " X2/SS9nUR                  5       S:X  a  USS2SSS2SS24   nO;UR                  5       S:X  a  USS2SSSS24   nO[        SUR                   S35      eUR                  U R                  S9nS	U-
  [         R                  " U R                  5      R                  -  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    word_attention_mask (`torch.LongTensor`):
        Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
    entity_attention_mask (`torch.LongTensor`, *optional*):
        Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
Nrk   r   r   r   z&Wrong shape for attention_mask (shape ))rm   rv  )	r#   r   r   r   shaperp   rm   finfor   )rf   r  r  r   r  s        r(   r  %LukeModel.get_extended_attention_maskG  s     - ,"YY'NTVWN1$&4Qa]&C#!Q&&4QdA5E&F#EnFZFZE[[\]^^"9"<"<4::"<"N#&)@#@EKKPTPZPZD[D_D_"_&&r'   )rg   rz   r  r   r  )T)NNNNNNNNNNNNN)r   r   r    r!   r   boolrV   r  r  r  r  r  r   r   r#   r   r$   r   r   r   r{   r  r&   r   r   s   @r(   r  r    s   z d  "/089Z  156:593715=A<@:>1559,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 U--.Y
  ((9(9:Y
  ((8(89Y
 &e&6&67Y
 E--.Y
   1 12Y
 $D>Y
 'tnY
 d^Y
 
u44	5Y
 Y
v'#(#3#3'LTUZUeUeLf' 'r'   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    input_ids (`torch.Tensor`): Input token IDs.
    padding_idx (`int`): Index of the padding symbol.

Returns: torch.Tensor
r   r   )ner   r#   cumsumr   rt   )rv   rR   maskincremental_indicess       r(   ro   ro   f  sP     <<$((*D <<!4<<TBdJ##%33r'   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )
LukeLMHeadiw  z*Roberta Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g r   )rU   rV   r   r   rY   r   r`   ra   
layer_normrX   rj  rk  r#   rs   r   re   s     r(   rV   LukeLMHead.__init__z  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIr'   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  rj  )rf   featureskwargsr   s       r(   r{   LukeLMHead.forward  s;    JJx GOOA LLOr'   c                     U R                   R                  R                  R                  S:X  a  U R                  U R                   l        g U R                   R                  U l        g )Nmeta)rj  r   rn   typer  s    r(   _tie_weightsLukeLMHead._tie_weights  sC     <<##((F2 $		DLL))DIr'   )r   rj  r   r  )
r   r   r    r!   r"   rV   r{   r  r&   r   r   s   @r(   r  r  w  s    4&* *r'   r  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c            $         ^  \ rS rSr/ SQrU 4S jrU 4S jrS rS r\	               SS\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\\\4   4 S jj5       rSrU =r$ )LukeForMaskedLMi  )zlm_head.decoder.weightzlm_head.decoder.biasz!entity_predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        [        R                  " 5       U l
        U R                  5         g r   )rU   rV   r  rq  r  lm_headrg  entity_predictionsr   r	   loss_fnr  re   s     r(   rV   LukeForMaskedLM.__init__  sQ     f%	!&)"6v">**, 	r'   c                    > [         TU ]  5         U R                  U R                  R                  U R
                  R                  R                  5        g r   )rU   tie_weights_tie_or_clone_weightsr  rj  rq  r   )rf   rh   s    r(   r  LukeForMaskedLM.tie_weights  s:    ""4#:#:#B#BDIID_D_DqDqrr'   c                 .    U R                   R                  $ r   r  rj  r  s    r(   get_output_embeddings%LukeForMaskedLM.get_output_embeddings  s    ||###r'   c                 $    XR                   l        g r   r  )rf   new_embeddingss     r(   set_output_embeddings%LukeForMaskedLM.set_output_embeddings  s    -r'   rv   r   rw   ru   r   r  r  r  labelsentity_labelsr   rx   r   rJ  rK  r   c                 T   Ub  UOU R                   R                  nU R                  UUUUUUUUUUUUSS9nSnSnU R                  UR                  5      nU	be  U	R                  UR                  5      n	U R                  UR                  SU R                   R                  5      U	R                  S5      5      nUc  UnSnSnUR                  bn  U R                  UR                  5      nU
bP  U R                  UR                  SU R                   R                  5      U
R                  S5      5      nUc  UnOUU-   nU(       d8  [        S UUUUUUR                  UR                  UR                   4 5       5      $ [#        UUUUUUR                  UR                  UR                   S9$ )a{  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Labels for computing the masked entity prediction loss. Indices should be in `[-100, 0, ...,
    config.entity_vocab_size]` (see `entity_ids` docstring). Entities with indices set to `-100` are ignored
    (masked); the loss is only computed for the entities with labels in `[0, ..., config.entity_vocab_size]`.
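
Examples (a minimal usage sketch, assuming the `studio-ousia/luke-base` checkpoint and its `<mask>` token):

```python
>>> from transformers import AutoTokenizer, LukeForMaskedLM

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeForMaskedLM.from_pretrained("studio-ousia/luke-base")

>>> text = "Beyoncé lives in <mask>."
>>> inputs = tokenizer(text, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits  # word-level scores of shape (batch_size, sequence_length, vocab_size)

>>> mask_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
>>> predicted_token = tokenizer.decode(logits[0, mask_index].argmax(dim=-1))
```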
NTrv   r   rw   ru   r   r  r  r  r   rx   r   rJ  rK  rk   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  *LukeForMaskedLM.forward.<locals>.<genexpr>  s"      	A  	rC  )r0   r1   r2   r3   r4   r5   r   r6   )rg   r  rq  r  rD  rp   rn   r  r   rX   r   r  r   rI  r5   r   r6   r.   )rf   rv   r   rw   ru   r   r  r  r  r  r  r   rx   r   rJ  rK  r   r0   r1   r3   r2   r4   s                         r(   r{   LukeForMaskedLM.forward  s   b &1%<k$++B]B]))))%!"7"7 3'/!5  
  g778YYv}}-F||FKKDKK4J4J$KV[[Y[_]H|++7 33G4T4TUM(<<(:(:2t{{?\?\(]_l_q_qrt_uv<#D(?D  !))00&&	   "'!//!(!=!=))	
 		
r'   )r  r  r  rq  NNNNNNNNNNNNNNN)r   r   r    r!   _tied_weights_keysrV   r  r  r  r   r   r#   r   r$   r  r   r   r.   r{   r&   r   r   s   @r(   r  r    s    qs$.  156:593715<@<@:>-1481559,0/3&*!q
E,,-q
 !!2!23q
 !!1!12	q

 u//0q
 U--.q
  ((8(89q
  ((8(89q
 &e&6&67q
 ))*q
   0 01q
 E--.q
   1 12q
 $D>q
 'tnq
  d^!q
" 
u((	)#q
 q
r'   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c            "         ^  \ rS rSrU 4S jr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )LukeForEntityClassificationi+  c                 0  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rU   rV   r  rq  
num_labelsr   rb   rc   rd   r   rY   
classifierr  re   s     r(   rV   $LukeForEntityClassification.__init__2  si     f%	 ++zz&"<"<=))F$6$68I8IJ 	r'   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
UUSS9nUR                  SS2SSS24   nU R	                  U5      nU R                  U5      nSnUb  UR                  UR                  5      nUR                  S:X  a!  [        R                  R                  UU5      nOM[        R                  R                  UR                  S5      UR                  S5      R                  U5      5      nU(       d5  [        S UUUR                   UR"                  UR$                  4 5       5      $ ['        UUUR                   UR"                  UR$                  S9$ )	u	  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
    used for the single-label classification. In this case, labels should contain the indices that should be in
    `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
    loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
    and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntityClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
>>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: person
```NTr  r   r   rk   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  6LukeForEntityClassification.forward.<locals>.<genexpr>        pA prC  r0   r3   r5   r   r6   )rg   r  rq  r   rd   r  rp   rn   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   r   rI  r5   r   r6   r8   rf   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   feature_vectorr3   r0   s                      r(   r{   #LukeForEntityClassification.forward>  sm   | &1%<k$++B]B]))))%!"7"7 3'/!5  
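The single- vs. multi-label switch described for `labels` can be sketched with plain PyTorch (shapes and
values below are illustrative only):

```python
>>> import torch
>>> from torch import nn

>>> logits = torch.randn(2, 5)  # (batch_size, num_labels)

>>> single_labels = torch.tensor([3, 1])  # shape (batch_size,) -> cross entropy
>>> ce_loss = nn.functional.cross_entropy(logits, single_labels)

>>> multi_labels = torch.tensor([[0.0, 1.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]])  # (batch_size, num_labels) -> BCE
>>> bce_loss = nn.functional.binary_cross_entropy_with_logits(logits.view(-1), multi_labels.view(-1))
```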
  !99!Q'Bn50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   *!//!(!=!=))
 	
r'   r  rd   rq  r  NNNNNNNNNNNNNN)r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   r8   r{   r&   r   r   s   @r(   r  r  +  s}   
  156:593715=A<@:>1559.2,0/3&*k
E,,-k
 !!2!23k
 !!1!12	k

 u//0k
 U--.k
  ((9(9:k
  ((8(89k
 &e&6&67k
 E--.k
   1 12k
 **+k
 $D>k
 'tnk
 d^k
  
u00	1!k
 k
r'   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c            "         ^  \ rS rSrU 4S jr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )LukeForEntityPairClassificationi  c                 8  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  S-  UR                  S5      U l        U R                  5         g )Nr   Fr  re   s     r(   rV   (LukeForEntityPairClassification.__init__  sp     f%	 ++zz&"<"<=))F$6$6$:F<M<MuU 	r'   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   c                 :   Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
UUSS9n[        R                  " UR
                  SS2SSS24   UR
                  SS2SSS24   /SS9nU R                  U5      nU R                  U5      nSnUb  UR                  UR                  5      nUR                  S:X  a!  [        R                  R                  UU5      nOM[        R                  R                  UR                  S5      UR                  S5      R!                  U5      5      nU(       d5  [#        S UUUR$                  UR&                  UR(                  4 5       5      $ [+        UUUR$                  UR&                  UR(                  S	9$ )
u	  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
    used for the single-label classification. In this case, labels should contain the indices that should be in
    `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
    loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
    and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntityPairClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
>>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [
...     (0, 7),
...     (17, 28),
... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: per:cities_of_residence
```NTr  r   r   r   rk   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  :LukeForEntityPairClassification.forward.<locals>.<genexpr>%  r  rC  r  )rg   r  rq  r#   r   r   rd   r  rp   rn   r  r   r   r  r  r   r   rI  r5   r   r6   r=   r  s                      r(   r{   'LukeForEntityPairClassification.forward  s   B &1%<k$++B]B]))))%!"7"7 3'/!5  
  --aAg68X8XYZ\]_`Y`8abhi
 n50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r'   r  r  )r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   r=   r{   r&   r   r   s   @r(   r   r     s}   
  156:593715=A<@:>1559-1,0/3&*p
E,,-p
 !!2!23p
 !!1!12	p

 u//0p
 U--.p
  ((9(9:p
  ((8(89p
 &e&6&67p
 E--.p
   1 12p
 ))*p
 $D>p
 'tnp
 d^p
  
u44	5!p
 p
r'   r   z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c            &         ^  \ rS rSrU 4S jr\                SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4"S jj5       rSrU =r$ )LukeForEntitySpanClassificationi4  c                 6  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  S-  UR                  5      U l        U R                  5         g )Nr   r  re   s     r(   rV   (LukeForEntitySpanClassification.__init__;  sn     f%	 ++zz&"<"<=))F$6$6$:F<M<MN 	r'   rv   r   rw   ru   r   r  r  r  entity_start_positionsentity_end_positionsr   rx   r  r   rJ  rK  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUUUUUSS9nUR                  R	                  S5      nU	R                  S5      R                  SSU5      n	U	R                  UR                  R                  :w  a%  U	R                  UR                  R                  5      n	[        R                  " UR                  SU	5      nU
R                  S5      R                  SSU5      n
U
R                  UR                  R                  :w  a%  U
R                  UR                  R                  5      n
[        R                  " UR                  SU
5      n[        R                  " UUUR                  /SS9nU R                  U5      nU R                  U5      nSnUb  UR                  UR                  5      nUR                  S:X  aJ  [         R"                  R%                  UR'                  SU R(                  5      UR'                  S5      5      nOM[         R"                  R+                  UR'                  S5      UR'                  S5      R-                  U5      5      nU(       d5  [/        S UUUR0                  UR2                  UR4                  4 5       5      $ [7        UUUR0                  UR2                  UR4                  S	9$ )
u  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
entity_start_positions (`torch.LongTensor`):
    The start positions of entities in the word token sequence.
entity_end_positions (`torch.LongTensor`):
    The end positions of entities in the word token sequence.
labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
    entropy loss is used for the single-label classification. In this case, labels should contain the indices
    that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
    num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
    labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
>>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

>>> text = "Beyoncé lives in Los Angeles"
# List all possible entity spans in the text

>>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
>>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
>>> entity_spans = []
>>> for i, start_pos in enumerate(word_start_positions):
...     for end_pos in word_end_positions[i:]:
...         entity_spans.append((start_pos, end_pos))

>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
>>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
...     if predicted_class_idx != 0:
...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
Beyoncé PER
Los Angeles LOC
```NTr  rk   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  :LukeForEntitySpanClassification.forward.<locals>.<genexpr>  r  rC  r  )rg   r  rq  rD  rr   r   r   rn   rp   r#   gatherr   r   rd   r  r  r   r   r  r   r  r  r   rI  r5   r   r6   r@   )rf   rv   r   rw   ru   r   r  r  r  r  r  r   rx   r  r   rJ  rK  r   rY   start_states
end_statesr  r3   r0   s                           r(   r{   'LukeForEntitySpanClassification.forwardG  s   ^ &1%<k$++B]B]))))%!"7"7 3'/!5  
 //44R8!7!A!A"!E!L!LRQSU`!a!((G,E,E,L,LL%;%>%>w?X?X?_?_%`"||G$=$=rCYZ3==bAHHRQ\]&&'*C*C*J*JJ#7#:#:7;T;T;[;[#\ \\'";";RAUV
L*g>^>^#_efgn50YYv}}-F {{a}}226;;r4??3SU[U`U`acUde}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r'   r  )NNNNNNNNNNNNNNNN)r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   r@   r{   r&   r   r   s   @r(   r  r  4  s   
  156:593715<@<@:>=A;?1559-1,0/3&*#H
E,,-H
 !!2!23H
 !!1!12	H

 u//0H
 U--.H
  ((8(89H
  ((8(89H
 &e&6&67H
 !))9)9 :H
 'u'7'78H
 E--.H
   1 12H
 ))*H
 $D>H
  'tn!H
" d^#H
$ 
u44	5%H
 H
r'   r  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c            "         ^  \ rS rSrU 4S jr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )LukeForSequenceClassificationi  c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  b  UR                  OUR                  5      U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rU   rV   r  r  rq  r   rb   classifier_dropoutrc   rd   r   rY   r  r  re   s     r(   rV   &LukeForSequenceClassification.__init__  s      ++f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r'   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
UUSS9nUR                  nU R	                  U5      nU R                  U5      nSnUGb  UR                  UR                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aJ  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" UU5      nOU R                   R                  S:X  a=  [!        5       nU" UR#                  SU R                  5      UR#                  S5      5      nO-U R                   R                  S:X  a  [%        5       nU" UU5      nU(       d5  ['        S	 UUUR(                  UR*                  UR,                  4 5       5      $ [/        UUUR(                  UR*                  UR,                  S
9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r   
regressionsingle_label_classificationmulti_label_classificationrk   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  8LukeForSequenceClassification.forward.<locals>.<genexpr>B  r  rC  r  )rg   r  rq  r  rd   r  rp   rn   problem_typer  rm   r#   rt   r   r
   squeezer	   r   r   rI  r5   r   r6   rC   )rf   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   r]  r3   r0   loss_fcts                       r(   r{   %LukeForSequenceClassification.forward  s#   V &1%<k$++B]B]))))%!"7"7 3'/!5  
   --]3/YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./ (=(=w?[?[]d]o]op   ,!//!(!=!=))
 	
r'   r  r  )r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   rC   r{   r&   r   r   s   @r(   r  r    s}   
  156:593715=A<@:>1559.2,0/3&*g
E,,-g
 !!2!23g
 !!1!12	g

 u//0g
 U--.g
  ((9(9:g
  ((8(89g
 &e&6&67g
 E--.g
   1 12g
 **+g
 $D>g
 'tng
 d^g
  
u22	3!g
 g
r'   r  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c            "         ^  \ rS rSrU 4S jr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )LukeForTokenClassificationiQ  c                 `  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  b  UR                  OUR                  5      U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g NF)r  r  re   s     r(   rV   #LukeForTokenClassification.__init__Y  s      ++f>	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r'   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   c                 P   Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
UUSS9nUR                  nU R	                  U5      nU R                  U5      nSnUbW  UR                  UR                  5      n[        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d5  [        S UUUR                  UR                  UR                  4 5       5      $ [        UUUR                  UR                  UR                  S9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
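
Examples (a minimal usage sketch; the base checkpoint has no fine-tuned token-classification head, so the
predictions are illustrative only):

```python
>>> from transformers import AutoTokenizer, LukeForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=9)

>>> inputs = tokenizer("Beyoncé lives in Los Angeles.", return_tensors="pt")
>>> outputs = model(**inputs)
>>> predicted_token_class_ids = outputs.logits.argmax(-1)  # shape (batch_size, sequence_length)
```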
NTr  rk   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  5LukeForTokenClassification.forward.<locals>.<genexpr>  r  rC  r  )rg   r  rq  rD  rd   r  rp   rn   r	   r   r  rI  r5   r   r6   rG   )rf   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   r  r3   r0   r"  s                       r(   r{   "LukeForTokenClassification.forwardf  s1   V &1%<k$++B]B]))))%!"7"7 3'/!5  
  "33,,71YYv}}-F')HFKKDOO<fkk"oND (=(=w?[?[]d]o]op   )!//!(!=!=))
 	
r'   r  r  )r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   rG   r{   r&   r   r   s   @r(   r%  r%  Q  s}     156:593715=A<@:>1559.2,0/3&*U
E,,-U
 !!2!23U
 !!1!12	U

 u//0U
 U--.U
  ((9(9:U
  ((8(89U
 &e&6&67U
 E--.U
   1 12U
 **+U
 $D>U
 'tnU
 d^U
  
u//	0!U
 U
r'   r%  c            $         ^  \ rS rSrU 4S jr\               SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4 S jj5       rSrU =r$ )LukeForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r'  )
rU   rV   r  r  rq  r   r   rY   
qa_outputsr  re   s     r(   rV   !LukeForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r'   rv   r   rw   ru   r   r  r  r  r   rx   start_positionsend_positionsr   rJ  rK  r   c                 \   Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
UUSS9nUR                  nU R	                  U5      nUR                  SSS9u  nnUR                  S5      nUR                  S5      nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5        UR                  SU5        [        US9nU" UU5      nU" UU5      nUU-   S	-  nU(       d6  [        S
 UUUUR                  UR                  UR                  4 5       5      $ [        UUUUR                  UR                  UR                  S9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
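
Examples (a minimal usage sketch; the base checkpoint has no fine-tuned QA head, so the extracted span is
illustrative only):

```python
>>> import torch
>>> from transformers import AutoTokenizer, LukeForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeForQuestionAnswering.from_pretrained("studio-ousia/luke-base")

>>> question, context = "Where does Beyoncé live?", "Beyoncé lives in Los Angeles."
>>> inputs = tokenizer(question, context, return_tensors="pt")
>>> outputs = model(**inputs)

>>> start_idx = int(outputs.start_logits.argmax())
>>> end_idx = int(outputs.end_logits.argmax())
>>> answer = tokenizer.decode(inputs["input_ids"][0, start_idx : end_idx + 1])
```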
NTr  r   rk   r   r   )ignore_indexr   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  3LukeForQuestionAnswering.forward.<locals>.<genexpr>  s"      A  rC  )r0   rJ   rK   r5   r   r6   )rg   r  rq  rD  r0  splitr!  lenrr   clamp_r	   rI  r5   r   r6   rI   )rf   rv   r   rw   ru   r   r  r  r  r   rx   r2  r3  r   rJ  rK  r   r  r3   rJ   rK   
total_lossignored_indexr"  
start_lossend_losss                             r(   r{    LukeForQuestionAnswering.forward  s   P &1%<k$++B]B]))))%!"7"7 3'/!5  
  "331#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J   ))00&&   0%!!//!(!=!=))
 	
r'   )rq  r  r0  r  )r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   rI   r{   r&   r   r   s   @r(   r.  r.    s   	  156:594815=A<@:>15596:48,0/3&*!f
E,,-f
 !!2!23f
 !!1!12	f

 u001f
 U--.f
  ((9(9:f
  ((8(89f
 &e&6&67f
 E--.f
   1 12f
 "%"2"23f
   0 01f
 $D>f
 'tnf
  d^!f
" 
u66	7#f
 f
r'   r.  c            "         ^  \ rS rSrU 4S jr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )LukeForMultipleChoicei6  c                 ,  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  b  UR                  OUR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g r   )rU   rV   r  rq  r   rb   r  rc   rd   r   rY   r  r  re   s     r(   rV   LukeForMultipleChoice.__init__8  so     f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$6: 	r'   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  r   c                 P   Ub  UOU R                   R                  nUb  UR                  S   OU
R                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnU
b1  U
R                  SU
R	                  S5      U
R	                  S5      5      OSn
Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUUU	U
UUSS9nUR                  nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb.  UR                  UR                  5      n[        5       nU" UU5      nU(       d5  [        S UUUR                  UR                  UR                  4 5       5      $ [!        UUUR                  UR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
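
Examples (a minimal usage sketch; the extra `num_choices` dimension is added manually, and the base
checkpoint's choice-scoring head is freshly initialized, so the outputs are illustrative only):

```python
>>> import torch
>>> from transformers import AutoTokenizer, LukeForMultipleChoice

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeForMultipleChoice.from_pretrained("studio-ousia/luke-base")

>>> prompt = "Beyoncé lives in"
>>> choices = ["Los Angeles.", "a submarine."]
>>> encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)

>>> # Reshape to (batch_size, num_choices, sequence_length) as expected by the model.
>>> inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}
>>> outputs = model(**inputs, labels=torch.tensor([0]))
>>> loss, logits = outputs.loss, outputs.logits  # logits has shape (batch_size, num_choices)
```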
Nr   rk   r   Tr  c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r=  s     r(   r@  0LukeForMultipleChoice.forward.<locals>.<genexpr>  rB  rC  r  )rg   r  r  r   rr   rq  r  rd   r  rp   rn   r	   rI  r5   r   r6   rM   )rf   rv   r   rw   ru   r   r  r  r  r   rx   r  r   rJ  rK  num_choicesr   r]  r3   reshaped_logitsr0   r"  s                         r(   r{   LukeForMultipleChoice.forwardD  s   F &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 BLAWZ__R)<=]a
 %0 "&&r+@+E+Eb+IJ 	 %0 "&&r+@+E+Eb+IJ 	 #.  $$R)<)A)A")EGZG_G_`bGcd 	 ))))%!"7"7 3'/!5  
   --]3/ ++b+6YY556F')HOV4D 
 #))00&&
 
 
 -"!//!(!=!=))
 	
r'   )r  rd   rq  r  )r   r   r    r!   rV   r   r   r#   r   r$   r  r   r   rM   r{   r&   r   r   s   @r(   rA  rA  6  s}   
  156:593715=A<@:>1559.2,0/3&*P
E,,-P
 !!2!23P
 !!1!12	P

 u//0P
 U--.P
  ((9(9:P
  ((8(89P
 &e&6&67P
 E--.P
   1 12P
 **+P
 $D>P
 'tnP
 d^P
  
u33	4!P
 P
r'   rA  )
r  r   r  rA  r.  r  r%  r  r  rp  )Fr"   r   dataclassesr   typingr   r   r   r#   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r+   r.   r8   r=   r@   rC   rG   rI   rM   r  rO   r   r   r   r   r  r  r  r2  rV  r`  rg  rp  r  ro   r  r  r  r   r  r  r%  r.  rA  __all__r   r'   r(   <module>rW     s     ! ) )    A A ' K - 6 9 9 * 
		H	% I%? I I< I/ I I< '? '? '?T ? ? ?< ?[ ? ?< ?[ ? ?< ?; ? ?B ? ? ?B !?{ !? !?H  ?K  ?  ?FF=RYY F=R(299 (Vn		 ndRYY ,BII ,`ryy   1		 1hK
")) K
^ BII "299  */ * *0 
Y'# Y'
Y'x4"* *> L
) L
L
^ y
"5 y
y
x ~
&9 ~
~
B V
&9 V
V
r u
$7 u
u
p d
!4 d
d
N s
2 s
 s
l ^
/ ^
 ^
Br'   