
"""PyTorch Llava model."""

from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    logging,
)
from ..auto import AutoModel
from .configuration_llava import LlavaConfig


logger = logging.get_logger(__name__)


@dataclass
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
    """
Base class for Llava outputs, with hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        Image hidden states of the model, produced by the vision encoder and then projected by the multi-modal projector.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations____static_attributes__r       `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/llava/modeling_llava.pyr   r   &   s    8 8<%"3"34;r(   r   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)LlavaCausalLMOutputWithPastG   a=  
Base class for Llava causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        Image hidden states of the model, produced by the vision encoder and then projected by the multi-modal projector.
Nlosslogitspast_key_valueshidden_states
attentionsr   r   )r   r    r!   r"   r#   r-   r   r$   r%   r&   r.   r/   r   r0   r   r1   r   r'   r   r(   r)   r+   r+   G   s    < )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r(   r+   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LlavaMultiModalProjectoro   configc                   > [         TU ]  5         [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " UR                  R                  U-  UR                  R                  UR                  S9U l        [        UR                     U l        [        R                  " UR                  R                  UR                  R                  UR                  S9U l        g )Nr   bias)super__init__
isinstancevision_feature_layerintlenr   Linearvision_confighidden_sizetext_configmultimodal_projector_biaslinear_1r
   projector_hidden_actactlinear_2)selfr5   num_feature_layers	__class__s      r)   r:   !LlavaMultiModalProjector.__init__p   s    ",V-H-H#"N"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r(   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)rD   rF   rG   )rH   image_featuresr0   s      r)   forward LlavaMultiModalProjector.forward~   s2    n5/m4r(   )rF   rD   rG   )	r   r    r!   r"   r   r:   rO   r'   __classcell__rJ   s   @r)   r3   r3   o   s    
{ 
 r(   r3   c                   B    \ rS rSr\rSrSrSrSr	Sr
SrSrSrSrS rSrg)LlavaPreTrainedModel    Tr/   c                 4   [        U R                  SU R                  R                  5       R                  5      n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g g )Ninitializer_rangeg        )meanstdg      ?)getattrr5   get_text_configrX   r;   r   r?   weightdatanormal_r8   zero_	LayerNormfill_)rH   modulerZ   s      r)   _init_weights"LlavaPreTrainedModel._init_weights   s     dkk#68S8S8U8g8ghfbii((MM&&CS&9{{&  &&( '--MM$$S)KK""$ .r(   r   N)r   r    r!   r"   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendrd   r'   r   r(   r)   rT   rT      sA    L&*#"3 !N $!"&%r(   rT   zu
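# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module and not used anywhere in
# the library: it only shows the shape contract of `LlavaMultiModalProjector`
# defined above. The default `LlavaConfig()` sub-configs and the patch count
# (576) are placeholder assumptions chosen for the example.
def _example_projector_shapes():
    config = LlavaConfig()  # default vision/text sub-configs
    projector = LlavaMultiModalProjector(config)
    # One image worth of patch features coming out of the vision tower.
    vision_features = torch.randn(1, 576, config.vision_config.hidden_size)
    projected = projector(vision_features)
    # The projector maps the vision hidden size onto the language model hidden size.
    return projected.shape  # (1, 576, config.text_config.hidden_size)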
@auto_docstring(
    custom_intro="""
    The Llava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class LlavaModel(LlavaPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = LlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, List[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`, *optional*):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")

        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs)

        # If a single layer index is given, select that hidden state; otherwise gather the
        # requested layers and concatenate them along the feature dimension.
        if isinstance(vision_feature_layer, int):
            selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
            if vision_feature_select_strategy == "default":
                selected_image_feature = selected_image_feature[:, 1:]
        else:
            hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
            # For the "default" strategy, crop the CLS token from each selected hidden state.
            if vision_feature_select_strategy == "default":
                hs_pool = [hs[:, 1:] for hs in hs_pool]
            selected_image_feature = torch.cat(hs_pool, dim=-1)

        image_features = self.multi_modal_projector(selected_image_feature)
        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        image_sizes: torch.Tensor = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, LlavaModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )

            # Locate the placeholder image tokens so the projected image features can be
            # scattered into the corresponding positions of the text embedding sequence.
            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = special_image_mask.sum(dim=1).sum(dim=0)[0]
            else:
                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
                special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
                n_image_tokens = (input_ids == self.config.image_token_id).sum()

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )

            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return LlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The LLAVA model which consists of a vision backbone and a language model.
    """
)
class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.model = LlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Expose the inner modules as properties for backward compatibility.
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, LlavaForConditionalGeneration

>>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed; `logits_to_keep` trims the sequence dimension.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return LlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model.

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # Pixel values are only needed on the first (prefill) forward pass; in the cached
            # decoding stage the input ids no longer contain the special image tokens.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in an inverted 4D form; nothing to do.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = ["LlavaForConditionalGeneration", "LlavaModel", "LlavaPreTrainedModel"]