"""PyTorch Fuyu model."""

from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)


@auto_docstring
class FuyuPreTrainedModel(PreTrainedModel):
    config_class = FuyuConfig
    base_model_prefix = "fuyu"
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _no_split_modules = []
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FuyuModel(FuyuPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModel.from_config(config.text_config)

        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: List[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
ay  This function places the continuous_embeddings into the word_embeddings at the locations
indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
embeddings.

Args:
    word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Tensor of word embeddings.
    continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
        [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
        indices in image_patch_input_indices for that batch element.
    image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor of indices of the image patches in the input_ids tensor.
r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r'   r^   r_   r`   output_embeddings	batch_idxdst_indicessrc_indicess           r)   gather_continuous_embeddings&FuyuModel.gather_continuous_embeddingsY   sI   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78I  --(A(LPQ(Q\`abcdK 4>{KK  #&;&F&L&LQ&OO ^7L7W7]7]6_ `I6A6G6G5II[\e[ffgi  9N8XYd8e8h8h!((945 9  ! r,   pixel_valuesc                     U Vs/ s HP  nU R                  UR                  U R                   R                  R                  5      5      R	                  S5      PMR     nnU$ s  snf )z
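
    # Worked example of the index convention above (values are illustrative, not from
    # the library): with image_patch_input_indices = [[0, 1, -1, -1]], sequence
    # positions 0 and 1 receive continuous_embeddings[b][0] and continuous_embeddings[b][1],
    # while positions marked -1 keep their original word embeddings.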
    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        # Project each flattened patch (patch_size * patch_size * num_channels values) to the
        # text model's hidden size; squeeze(0) drops a leading singleton dimension from each
        # per-image patch tensor.
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
            for patch in pixel_values
        ]
        return patch_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            if image_patches is not None and past_key_values is None:
                patch_embeddings = self.get_image_features(image_patches)
                inputs_embeds = self.gather_continuous_embeddings(
                    word_embeddings=inputs_embeds,
                    continuous_embeddings=patch_embeddings,
                    image_patch_input_indices=image_patches_indices,
                )

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
            **kwargs,
        )
        return outputs


@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested positions (all of them when logits_to_keep is 0).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            **kwargs,
        )

        if past_key_values is not None:
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
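

# Two more illustrative sketches (not part of the library; all values below are toy
# assumptions). First, `logits_to_keep`: an int n restricts the LM head to the last
# n positions, which is all generation needs (n=1); the default 0 gives
# slice(0, None), i.e. every position is kept.
def _demo_logits_to_keep():
    hidden_states = torch.randn(1, 10, 8)  # (batch, seq, hidden)
    logits_to_keep = 1
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    return hidden_states[:, slice_indices, :].shape  # torch.Size([1, 1, 8])


# Second, `_reorder_cache` during beam search: `beam_idx` names the old beam each new
# hypothesis continues, so every cached key/value tensor is re-indexed along dim 0.
def _demo_reorder_cache():
    cached_keys = torch.arange(3.0).view(3, 1, 1, 1)  # 3 beams with cached keys 0., 1., 2.
    beam_idx = torch.tensor([2, 2, 0])  # beams 0 and 1 both continue old beam 2
    return cached_keys.index_select(0, beam_idx).flatten().tolist()  # [2.0, 2.0, 0.0]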


__all__ = ["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"]