
"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
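Example (illustrative sketch only -- `resampler` stands in for a constructed `IdeficsPerceiverResampler`
and `context` for a `[bsz, seq, embed_dim]` tensor; neither comes from a released checkpoint):

    compressed = resampler(context)  # -> [bsz, n_latents, embed_dim]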

    )OptionalTupleN   )IdeficsConfigc                      ^  \ rS rSrS\S\S\S\S\S\SS	4U 4S
 jjrS\R                  S\R                  4S jr	Sr
U =r$ )IdeficsPerceiverResampler0   config	embed_dimdepthn_headshead_dim	n_latentsreturnNc                 "  > [         TU ]  5         X$XV4u  U l        U l        U l        U l        UR                  R                  U l        [        R                  " [        R                  " U R
                  U R                  5      SS9U l        [        UR                  S5      (       d  U R                  S-  OUR                  R                  S-  U l        [        R"                  " [%        U5       Vs/ s Hc  n[        R"                  " ['        U R                  U R                  U R                  U R                  5      [)        U R                   U5      /5      PMe     sn5      U l        [        R,                  " U R                  5      U l        gs  snf )a  
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape `[bsz, n_latents, embed_dim]`. Here `embed_dim` is the dimensionality of the
        embeddings being fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could
        be, e.g., the ViT embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
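
        Example (illustrative hyperparameter values only, not taken from any released checkpoint):

            resampler = IdeficsPerceiverResampler(config, embed_dim=1024, depth=3, n_heads=16, head_dim=96, n_latents=64)
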
T)requires_gradr      N)super__init__r   r   r   r   perceiver_configqk_layer_norms_perceiverqk_layer_normsnn	Parametertorchrandnlatentshasattrvision_configintermediate_dim
ModuleListrangeIdeficsPerceiverAttention
IdeficsMLPblocks	LayerNorm
layer_norm)	selfr
   r   r   r   r   r   _	__class__s	           ]/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/idefics/perceiver.pyr   "IdeficsPerceiverResampler.__init__1   s;   ( 	FOZbFmCdmT^$55NN ||EKK$O_cd 6//== NNQ%%//!3 	 mm u &A 1$..$,,PTP]P]_c_r_rs"4#8#8&A &

 ,,t~~6s   0A*Fcontextc                     U R                   R                  UR                  S   SS5      nU R                   H  u  p4U" X5      U-   nU" U5      U-   nM     U R	                  U5      $ )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r   repeatshaper%   r'   )r(   r-   r   attnffs        r+   forward!IdeficsPerceiverResampler.forward_   se     ,,%%gmmA&61= HD7,w6GkG+G $ w''    )	r%   r   r   r    r   r'   r   r   r   )__name__


class IdeficsPerceiverAttention(nn.Module):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms

        # Normalization & scaling
        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
        if self.qk_layer_norms:
            self.q_layer_norm = nn.LayerNorm(self.head_dim)
            self.k_layer_norm = nn.LayerNorm(self.head_dim)

        self.qk_scale = self.head_dim**-0.5

        # Q, K, V projections (no bias -- a detail from the Perceiver/Flamingo papers)
        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)

        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)

    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
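        # Note: queries come only from the latents while keys/values span [context; latents], so the attention
        # cost scales as n_latents * (seq + n_latents) rather than (seq + n_latents)^2 -- the resampler stays
        # cheap even for long contexts.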
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = context.shape[:3]

        # Query, Key, Value projections --> note that in Flamingo, latents are *concatenated* with context prior to
        # attention! This results in queries w/ `seq = n_latents`, and keys/values w/ `seq = len(context) + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(torch.cat([context, latents], dim=-2))
        v = self.v_proj(torch.cat([context, latents], dim=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Multiheaded attention w/ stable softmax (subtract per-row max -- `amax` -- before the softmax call)
        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        attn = stabilized_scores.softmax(dim=-1)

        # Attend & project back to output: [bsz, n_heads, n_latents, head_dim] -> [bsz, n_latents, embed_dim]
        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(resampled.transpose(1, 2).flatten(-2))


class IdeficsMLP(nn.Module):
    def __init__(self, intermediate_size, config: IdeficsConfig):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__()
        self.embed_dim = config.vision_config.embed_dim
        self.ln = nn.LayerNorm(self.embed_dim)
        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
        self.act = nn.ReLU()
        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)

        return hidden_states