
    fTh'                     .   S r SSKJrJr  SSKrSSKJr  SSKJ	r	   " S S	\R                  R                  R                  5      r " S
 S\R                  R                  R                  5      r " S S\R                  R                  R                  5      rg)a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
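
Example (an illustrative sketch, assuming a default-constructed `IdeficsConfig`; `depth`, `n_heads`, `head_dim`,
`n_latents`, and the fake feature shapes are arbitrary illustration values -- `embed_dim` is read off the config
because each block's MLP projects back to `config.vision_config.embed_dim`):

    import tensorflow as tf

    config = IdeficsConfig()
    embed_dim = config.vision_config.embed_dim
    resampler = TFIdeficsPerceiverResampler(
        config, embed_dim=embed_dim, depth=2, n_heads=8, head_dim=64, n_latents=32
    )
    image_features = tf.random.normal((2, 50, embed_dim))  # [bsz, seq, embed_dim] from a vision backbone
    compressed = resampler(image_features)                 # [bsz, n_latents, embed_dim] == (2, 32, embed_dim)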

    )OptionalTupleN   )
shape_list   )IdeficsConfigc                      ^  \ rS rSrS\S\S\S\S\S\SS	4U 4S
 jjrU 4S jrS\R                  S\R                  4S jr
SrU =r$ )TFIdeficsPerceiverResampler0   config	embed_dimdepthn_headshead_dim	n_latentsreturnNc                   > [         T	U ]  " S0 UD6  X$XV4u  U l        U l        U l        U l        UR                  R                  U l        [        UR                  S5      (       d  U R                  S-  OUR                  R                  S-  U l        / U l        [        U5       Hn  nU R                  R                  [        U R                  U R                  U R                  U R                  SU S3S9[!        U R                  USU S3S9/5        Mp     ["        R$                  R&                  R)                  SSS	9U l        g
)a  
Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

Args:
    config (`IdeficsConfig`): config object
    embed_dim (`int`): The size of each embedding vector
    depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
    n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
    head_dim (`int`): Dimensionality of each head projection in the Transformer block.
    n_latents (`int`):
        Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

r      zblocks.z.0namez.1h㈵>
layer_normepsilonr   N )super__init__r   r   r   r   perceiver_configqk_layer_norms_perceiverqk_layer_normshasattrvision_configintermediate_dimblocksrangeappendTFIdeficsPerceiverAttentionTFIdeficsMLPtfkeraslayersLayerNormalizationr   )
selfr   r   r   r   r   r   kwargsi	__class__s
            `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/idefics/perceiver_tf.pyr   $TFIdeficsPerceiverResampler.__init__1   s#   ( 	"6"FOZbFmCdmT^$55NN 6//== NNQ%%//!3 	 uAKK/dmmTEXEXahijhkkm_n !!6!6wqcQS_U	  ((//<<TP\<]    c                 ~   > U R                  U R                  U R                  4SSSS9U l        [        TU ]  U5        g )Nrandom_normalTlatents)shapeinitializer	trainabler   )
add_weightr   r   r6   r   build)r-   input_shaper0   s     r1   r;   !TFIdeficsPerceiverResampler.build\   s>    >>4>>2[_fo ' 
 	k"r3   contextc                    [         R                  " U R                  SS9n[         R                  " U[         R                  " U5      S   SS/5      nU R
                   H  u  p4U" X5      U-   nU" U5      U-   nM     U R                  U5      $ )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   axisr   )r)   expand_dimsr6   tiler7   r$   r   )r-   r>   r6   attnffs        r1   call TFIdeficsPerceiverResampler.callc   s{     ..A6'''BHHW$5a$8!Q#?@HD7,w6GkG+G $ w''r3   )	r$   r   r   r#   r6   r   r   r   r    )__name__
__module____qualname____firstlineno__r   intr   r;   r)   TensorrF   __static_attributes____classcell__r0   s   @r1   r
   r
   0   sn    )^#)^03)^<?)^JM)^Y\)^il)^	)^V#	(BII 	(")) 	( 	(r3   r
   c            
          ^  \ rS rSrS\S\S\S\SS4
U 4S jjrS	\R                  S
\R                  S\R                  4S jr	Sr
U =r$ )r'   o   r   r   r   r    r   Nc                   > [         TU ]  " S0 UD6  XUsU l        U l        U l        X@l        [        R                  R                  R                  SSS9U l
        [        R                  R                  R                  SSS9U l        U R
                  (       aZ  [        R                  R                  R                  SSS9U l        [        R                  R                  R                  SSS9U l        U R                  S-  U l        [        R                  R                  R                  U R                  U R                  -  SS	S
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  USSS
9U l        g)ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`r   context_layer_normr   latents_layer_normq_layer_normk_layer_normg      Fq_projuse_biasr   k_projv_projoutput_projNr   )r   r   r   r   r   r    r)   r*   r+   r,   rT   rU   rV   rW   qk_scaleDenserX   r[   r\   r]   )r-   r   r   r   r    r.   r0   s         r1   r   $TFIdeficsPerceiverAttention.__init__p   s   "6"6?(3dm,"$((//"D"DTXl"D"m"$((//"D"DTXl"D"m " B B4Vd B eD " B B4Vd B eDt+ hhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+h88??00UQ^0_r3   r>   r6   c                    U R                  U5      nU R                  U5      n[        U5      u  p4nU R                  U5      nU R	                  [
        R                  " X/SS95      nU R                  [
        R                  " X/SS95      nXgU4 V	s/ s HS  n	[
        R                  " [
        R                  " XU	R                  S   U R                  U R                  45      / SQS9PMU     sn	u  pgnU R                  (       a"  U R                  U5      nU R                  U5      n[
        R                   " SX`R"                  -  U5      n
U
[
        R$                  " U
SSS	9-
  n[
        R&                  R)                  USS9n[
        R                   " S
X5      nU R+                  [
        R                  " [
        R                  " U/ SQS9USU R                  U R                  -  45      5      $ s  sn	f )a  
Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

Args:
    context (`tf.Tensor`):
        Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
    latents (`tf.Tensor`):
        Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

Returns:
    `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
    from context.
r@   r   )r      r   r   )permz... i d, ... j d -> ... i jT)rA   keepdimsz... i j, ... j d -> ... i d)rT   rU   r   rX   r[   r)   concatr\   	transposereshaper7   r   r   r    rV   rW   einsumr^   
reduce_maxnnsoftmaxr]   )r-   r>   r6   
batch_size
seq_lengthr   qkvxscoresstabilized_scoresrD   	resampleds                 r1   rF    TFIdeficsPerceiverAttention.call   s    ))'2))'2,6w,?)
	 KK KK		7"42>?KK		7"42>? AY
 LLAAGGAJdmm'\]dpq
a
 !!!$A!!!$A8!mm:KQO"R]]6T%RRuu}}.R}8 II;TE	JJr||ILAJPRTXT`T`cgcpcpTpCqr
 	

s   AG,)rT   r   r   rW   r[   rU   r   r]   rV   rX   r    r^   r\   )rH   rI   rJ   rK   rL   boolr   r)   rM   rF   rN   rO   rP   s   @r1   r'   r'   o   s^    `# ` `s `TX `gk `*+
BII +
		 +
bii +
 +
r3   r'   c                   v   ^  \ rS rSrS\4U 4S jjrS\\\R                        S\R                  4S jr
SrU =r$ )r(      r   c                   > [         TU ]  " S0 UD6  UR                  R                  U l        [        R
                  R                  R                  SSS9U l        [        R
                  R                  R                  USSS9U l
        [        R
                  R                  R                  SS9U l        [        R
                  R                  R                  U R                  SS	S9U l        g
)z:Simple MLP block with intermediate_size and embedding sizer   lnr   FfcrY   actr   c_projNr   )r   r   r"   r   r)   r*   r+   r,   r|   r_   r}   ReLUr~   r   )r-   intermediate_sizer   r.   r0   s       r1   r   TFIdeficsMLP.__init__   s    "6"--77((//44T4M((//''(9EPT'U88??''U'3hhoo++DNNUQY+Zr3   hidden_statesr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )N)r|   r}   r~   r   )r-   r   s     r1   rF   TFIdeficsMLP.call   s@    ../M2r3   )r~   r   r   r}   r|   )rH   rI   rJ   rK   r   r   r   r   r)   rM   rF   rN   rO   rP   s   @r1   r(   r(      s;    [- [(5+;"<   r3   r(   )__doc__typingr   r   
tensorflowr)   modeling_tf_utilsr   configuration_ideficsr   r*   r+   Layerr
   r'   r(   r   r3   r1   <module>r      sj   4 #  + 0<("((//"7"7 <(~A
"((//"7"7 A
H288??(( r3   