
"""PyTorch CLAP model."""

import collections
import math
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    meshgrid,
    prune_linear_layer,
)
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig


logger = logging.get_logger(__name__)


def interpolate(hidden_states, ratio):
    """
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    """
    (batch_size, time_length, classes_num) = hidden_states.shape
    upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num)
    return upsampled
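
# Illustrative sketch (not part of the original module): a minimal example of what
# `interpolate` does — every time step is repeated `ratio` times, so a (2, 8, 4)
# input with ratio=4 becomes (2, 32, 4). The helper name `_example_interpolate`
# and the tensor values are ours, purely for illustration.
def _example_interpolate():
    hidden_states = torch.randn(2, 8, 4)  # (batch_size, time_length, classes_num)
    upsampled = interpolate(hidden_states, ratio=4)
    assert upsampled.shape == (2, 32, 4)
    # output frames 0..3 are all copies of input frame 0
    assert torch.equal(upsampled[:, 0], upsampled[:, 3])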


def window_partition(hidden_states, window_size):
    """
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    """
    batch_size, height, width, num_channels = hidden_states.shape

    hidden_states = hidden_states.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows
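
# Illustrative sketch (not part of the original module): for an 8x8 feature map
# and window_size=4 there are (8/4) * (8/4) = 4 windows per example, so a batch
# of 2 yields 8 windows. The helper name below is ours, for illustration only.
def _example_window_partition():
    hidden_states = torch.randn(2, 8, 8, 32)  # (batch_size, height, width, num_channels)
    windows = window_partition(hidden_states, window_size=4)
    assert windows.shape == (2 * 4, 4, 4, 32)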


def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
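
# Illustrative sketch (not part of the original module): `window_reverse` is the
# exact inverse of `window_partition` whenever `height` and `width` are multiples
# of `window_size`, which is what the shifted-window attention below relies on.
def _example_window_roundtrip():
    hidden_states = torch.randn(2, 8, 8, 32)
    windows = window_partition(hidden_states, window_size=4)
    restored = window_reverse(windows, window_size=4, height=8, width=8)
    assert torch.equal(restored, hidden_states)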


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`):
            Input token ids.
        padding_idx (`int`):
            The id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0):
            Length of the cached keys/values, used to offset the position ids.

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


# Copied from transformers.models.clip.modeling_clip.contrastive_loss
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    labels = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, labels)
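
# Illustrative sketch (not part of the original module): CLAP applies this loss to
# both axes of the similarity matrix, since matched text/audio pairs sit on the
# diagonal and the labels are simply 0..N-1. The random `logits_per_text` below is
# a stand-in for real similarity scores.
def _example_symmetric_contrastive_loss():
    logits_per_text = torch.randn(4, 4)  # (text_batch_size, audio_batch_size)
    caption_loss = contrastive_loss(logits_per_text)
    audio_loss = contrastive_loss(logits_per_text.t())
    return (caption_loss + audio_loss) / 2.0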


@dataclass
class ClapTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class ClapAudioModelOutput(ModelOutput):
    """
    ClapAudio model output to mimic the output of the original implementation.

    Args:
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            The Audio embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    audio_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class ClapOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for audio-text similarity.
        logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
            The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
            The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapTextModel`].
        audio_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapAudioModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_audio: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    audio_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    audio_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Adapted from transformers.models.swin.modeling_swin.SwinDropPath
class ClapDropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states):
        if self.drop_prob == 0.0 or not self.training:
            return hidden_states

        keep_prob = 1 - self.drop_prob
        # work with tensors of any rank, not just 2D ConvNets
        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)

        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
        random_tensor.floor_()  # binarize
        output = hidden_states.div(keep_prob) * random_tensor
        return output


class ClapAudioAFFBlock(nn.Module):
    """
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        channels = config.patch_embeds_hidden_size
        downsize_ratio = config.aff_block_r
        inter_channels = int(channels // downsize_ratio)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, hidden_states, residual):
        attention_input = hidden_states + residual

        fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input)
        fused_layer_output = self.sigmoid(fused_layer_output)

        output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output)
        return output


class ClapAudioPatchEmbed(nn.Module):
    """
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
        patch_size = (
            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
        )
        patch_stride = (
            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
        )

        self.img_size = img_size
        self.patch_stride = patch_stride

        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.flatten = config.flatten_patch_embeds
        self.enable_fusion = config.enable_fusion

        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)

        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1

        self.proj = nn.Conv2d(
            config.patch_embed_input_channels * scale_factor,
            config.patch_embeds_hidden_size,
            kernel_size=patch_size,
            stride=patch_stride,
            padding=padding,
        )

        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
        if self.enable_fusion:
            self.fusion_model = ClapAudioAFFBlock(config)
            self.mel_conv2d = nn.Conv2d(
                config.patch_embed_input_channels,
                config.patch_embeds_hidden_size,
                kernel_size=(patch_size[0], patch_size[1] * 3),
                stride=(patch_stride[0], patch_stride[1] * 3),
                padding=padding,
            )

    def forward(self, hidden_states, is_longer_idx=None):
        if self.enable_fusion:
            # retrieve the last mel as we have transposed the input
            global_hidden_states = hidden_states[:, 0:1, :, :]

            # global processing
            batch_size, num_channels, height, width = global_hidden_states.shape

            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )

            global_hidden_states = self.proj(global_hidden_states)
            output_width = global_hidden_states.size(-1)

            if len(is_longer_idx) > 0:
                # local processing
                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
                batch_size, num_channels, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)

                local_hidden_states = self.mel_conv2d(local_hidden_states)

                _, features, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)

                local_width = local_hidden_states.size(-1)
                local_hidden_states = torch.nn.functional.pad(
                    local_hidden_states, (0, output_width - local_width), "constant", 0
                )

                global_hidden_states[is_longer_idx] = self.fusion_model(
                    global_hidden_states[is_longer_idx], local_hidden_states
                )
            hidden_states = global_hidden_states
        else:
            _, _, height, width = hidden_states.shape
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )
            hidden_states = self.proj(hidden_states)

        if self.flatten:
            hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.norm(hidden_states)
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->ClapAudio
class ClapAudioSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )

        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the forward() call)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio
class ClapAudioSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio
class ClapAudioAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
        self.output = ClapAudioSelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio
class ClapAudioIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->ClapAudio
class ClapAudioOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio
class ClapAudioLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = ClapAudioIntermediate(config, dim)
        self.output = ClapAudioOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = torch_int(0)
            self.window_size = (
                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
            )

    def get_attn_mask(self, height, width, dtype, device):
        if self.shift_size > 0:
            # calculate attention mask for shifted window multihead self-attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            pass
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)

        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(
            height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
        )

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio
class ClapAudioStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                ClapAudioLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    drop_path_rate=drop_path[i],
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->ClapAudio
class ClapAudioPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # each of the four slices below is (batch_size, height/2, width/2, num_channels)
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # concatenated: (batch_size, height/2, width/2, 4 * num_channels)
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


class ClapAudioEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_layers = len(config.depths)

        self.config = config
        self.patch_embed = ClapAudioPatchEmbed(config)
        self.enable_fusion = config.enable_fusion
        self.patch_stride = self.patch_embed.patch_stride
        self.spec_size = config.spec_size
        self.freq_ratio = config.spec_size // config.num_mel_bins

        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))

        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]

        grid_size = self.patch_embed.grid_size
        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]

        self.layers = nn.ModuleList(
            [
                ClapAudioStage(
                    config=config,
                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
                    input_resolution=self.input_resolutions[i_layer],
                    depth=config.depths[i_layer],
                    num_heads=config.num_attention_heads[i_layer],
                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
        self.norm = nn.LayerNorm(self.num_features)
        self.depths = config.depths
        self.avgpool = nn.AdaptiveAvgPool1d(1)
    def reshape_mel2img(self, normalized_input_features):
        """
        The input is 4 normalized log mel spectrograms. It is reshaped to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        """
        _, _, time_length, freq_length = normalized_input_features.shape

        spec_width = int(self.spec_size * self.freq_ratio)
        spec_height = self.spec_size // self.freq_ratio

        if time_length > spec_width or freq_length > spec_height:
            raise ValueError("the wav size should be less than or equal to the swin input size")

        # to avoid bicubic zero error
        if time_length < spec_width:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
            )
        if freq_length < spec_height:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True
            )

        batch, channels, time, freq = normalized_input_features.shape

        # fold the time axis into the frequency axis, `freq_ratio` crops at a time
        normalized_input_features = normalized_input_features.reshape(
            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
        )
        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
        normalized_input_features = normalized_input_features.reshape(
            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
        )

        return normalized_input_features

    def forward(
        self,
        input_features,
        is_longer: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, ClapAudioModelOutput]:
        input_features = input_features.transpose(1, 3)
        normalized_input_features = self.batch_norm(input_features)
        normalized_input_features = normalized_input_features.transpose(1, 3)

        is_longer_list_idx = None
        if self.enable_fusion:
            is_longer_list = is_longer.to(input_features.device)
            is_longer_list_idx = torch.where(is_longer_list == 1)[0]

        hidden_states = self.reshape_mel2img(normalized_input_features)

        frames_num = hidden_states.shape[2]

        hidden_states = self.patch_embed(hidden_states, is_longer_list_idx)

        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        input_dimensions = self.input_resolutions[0]

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange batch_size (height width) channels -> batch_size channel height width
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            input_dimensions = self.input_resolutions[i]

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
                )

            hidden_states = layer_outputs[0]

            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        last_hidden_state = self.norm(hidden_states)

        batch_size, _, n_channels = last_hidden_state.shape

        freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
        temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]

        last_hidden_state = (
            last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape)
        )

        batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape
        # group 2D CNN
        c_freq_bin = n_frequencies // self.freq_ratio
        last_hidden_state = last_hidden_state.reshape(
            batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp
        )
        last_hidden_state = (
            last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1)
        )
        latent_output = self.avgpool(torch.flatten(last_hidden_state, 2))
        latent_output = torch.flatten(latent_output, 1)

        if not return_dict:
            return tuple(
                v
                for v in [
                    last_hidden_state,
                    latent_output,
                    all_reshaped_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=latent_output,
            hidden_states=all_reshaped_hidden_states,
            attentions=all_self_attentions,
        )


class ClapProjectionLayer(nn.Module):
    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        projection_dim = config.projection_dim

        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = ACT2FN[config.projection_hidden_act]
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, hidden_states):
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.linear2(hidden_states)
        return hidden_states
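
# Illustrative sketch (not part of the original module): both towers project their
# pooled output through this two-layer MLP so text and audio embeddings share one
# `projection_dim`-sized joint space.
def _example_projection(config: ClapTextConfig):
    projection = ClapProjectionLayer(config)
    pooled_output = torch.randn(2, config.hidden_size)
    embeds = projection(pooled_output)
    assert embeds.shape == (2, config.projection_dim)
    return embeds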


class ClapTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in the constructor where it is all zeros, which usually
        # occurs when it is auto-generated; the registered buffer helps users when tracing the model without passing
        # token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr-   r   r   r   )r   r?   rN   rD   rB   rM   r  r   )ru   r&  r'  sequence_lengthr  s        r&   r$  9ClapTextEmbeddings.create_position_ids_from_inputs_embedsM  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r(   )r   r   rD   r  r  r  r  )NNNNr   )
r[   r\   r]   r^   r_   r   r   r$  rb   r   r   s   @r&   r  r    s$    

4 rs&P= =r(   r  c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
class ClapTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values
        # come from an encoder; the attention mask needs to be such that the
        # encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k, v from cross-attention cache
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the ClapTextModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)

        return outputs


class ClapTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


CLAP_TEXT_SELF_ATTENTION_CLASSES = {
    "eager": ClapTextSelfAttention,
}
class ClapTextAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = CLAP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = ClapTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r   r   r	   r   r  intermediate_sizer(  r   rL  rM  r   rN  r!  s     r&   r   ClapTextIntermediate.__init__0  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r(   r    rJ   c                 J    U R                  U5      nU R                  U5      nU$ r   rQ  rR  s     r&   r   ClapTextIntermediate.forward8  rT  r(   rQ  r0  r   s   @r&   rf  rf  /  rU  r(   rf  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ClapTextOutputi?  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g rQ  )r   r   r	   r   rh  r  r(  r   re  r   rY  r   r!  s     r&   r   ClapTextOutput.__init__@  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r(   r    r+  rJ   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rT  r.  s      r&   r   ClapTextOutput.forwardF  rV  r(   rW  r0  r   s   @r&   rm  rm  ?  rX  r(   rm  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )ClapTextLayeriN  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is addedr  r]  )r   r   rb  seq_len_dimr[  rg  r7  add_cross_attentionr   crossattentionrf  rj  rm  r   r!  s     r&   r   ClapTextLayer.__init__O  s    '-'E'E$*62 ++#)#=#= ##?? D6)g!hii"3FT^"_D08$V,r(   r    r  r  r<  r=  r>  r  rJ   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nr*   )r  r>  r   r   r-   rw  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )	rg  r7  r%  r   rw  r   feed_forward_chunkrb  ru  )ru   r    r  r  r<  r=  r>  r  self_attn_past_key_valueself_attention_outputsrE  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputsr  s                    r&   r   ClapTextLayer.forward]  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr(   c                 J    U R                  U5      nU R                  X!5      nU$ r   )rj  r   )ru   rE  intermediate_outputr  s       r&   rz   ClapTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir(   )rv  rg  rb  rw  rj  r7  r   ru  rM  )r[   r\   r]   r^   r   r?   r#  r   r`   r   r$  r   rz  rb   r   r   s   @r&   rs  rs  N  s    -" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r(   rs  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
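
# A hedged equivalence sketch (illustrative only; `_demo_feed_forward_chunking`
# is not part of the original module). `apply_chunking_to_forward` used in
# `ClapTextLayer` splits the sequence axis, applies the position-wise MLP to
# each slice, and concatenates the results — the same output as one full pass,
# with a smaller activation-memory peak.
def _demo_feed_forward_chunking() -> None:
    dense = nn.Linear(16, 16)
    x = torch.randn(2, 6, 16)
    full = dense(x)
    chunked = torch.cat([dense(chunk) for chunk in x.chunk(3, dim=1)], dim=1)
    assert torch.allclose(full, chunked, atol=1e-6)
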
class ClapTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class ClapTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
@auto_docstring
class ClapPreTrainedModel(PreTrainedModel):
    config_class = ClapConfig
    base_model_prefix = "clap"
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor

        if isinstance(module, ClapTextEmbeddings):
            module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, ClapModel):
            nn.init.normal_(module.logit_scale_a, std=factor * 0.02)
            nn.init.normal_(module.logit_scale_t, std=factor * 0.02)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.Conv2d, nn.Linear)):
            in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
            nn.init.normal_(module.weight, std=in_proj_std)
            if module.bias is not None:
                module.bias.data.zero_()
class ClapAudioModel(ClapPreTrainedModel):
    config_class = ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_encoder = ClapAudioEncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_encoder.patch_embed.proj

    @auto_docstring
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        return self.audio_encoder(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """
)
class ClapTextModel(ClapPreTrainedModel):
    config_class = ClapTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ClapTextEmbeddings(config)
        self.encoder = ClapTextEncoder(config)

        self.pooler = ClapTextPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves, in which case
        # we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention, we
        # need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
@auto_docstring
class ClapModel(ClapPreTrainedModel):
    config_class = ClapConfig

    def __init__(self, config: ClapConfig):
        super().__init__(config)

        if not isinstance(config.text_config, ClapTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type ClapTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.audio_config, ClapAudioConfig):
            raise TypeError(
                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
                f" {type(config.audio_config)}."
            )

        text_config = config.text_config
        audio_config = config.audio_config

        self.logit_scale_a = nn.Parameter(torch.log(torch.tensor(config.logit_scale_init_value)))
        self.logit_scale_t = nn.Parameter(torch.log(torch.tensor(config.logit_scale_init_value)))

        self.projection_dim = config.projection_dim

        self.text_model = ClapTextModel(text_config)
        self.text_projection = ClapProjectionLayer(text_config)

        self.audio_model = ClapAudioModel(audio_config)
        self.audio_projection = ClapProjectionLayer(audio_config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)
        text_features = F.normalize(text_features, dim=-1)

        return text_features

    @auto_docstring
    def get_audio_features(
        self,
        input_features: Optional[torch.Tensor] = None,
        is_longer: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            return_dict=return_dict,
        )
        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
        audio_features = self.audio_projection(pooled_output)
        audio_features = F.normalize(audio_features, dim=-1)

        return audio_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapOutput]:
        r"""
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
        audio_embeds = self.audio_projection(audio_embeds)

        text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale_text = self.logit_scale_t.exp()
        logit_scale_audio = self.logit_scale_a.exp()
        logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text
        logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio

        loss = None
        if return_loss:
            caption_loss = contrastive_loss(logits_per_text)
            audio_loss = contrastive_loss(logits_per_audio.t())
            loss = (caption_loss + audio_loss) / 2.0

        if not return_dict:
            output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs)
            return ((loss,) + output) if loss is not None else output

        return ClapOutput(
            loss=loss,
            logits_per_audio=logits_per_audio,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            audio_embeds=audio_embeds,
            text_model_output=text_outputs,
            audio_model_output=audio_outputs,
        )
@auto_docstring
class ClapTextModelWithProjection(ClapPreTrainedModel):
    config_class = ClapTextConfig

    def __init__(self, config: ClapTextConfig):
        super().__init__(config)
        self.text_model = ClapTextModel(config)
        self.text_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.text_model.embeddings.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapTextModelOutput]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_embeds = self.text_projection(pooled_output)

        if not return_dict:
            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return ClapTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )
@auto_docstring
class ClapAudioModelWithProjection(ClapPreTrainedModel):
    config_class = ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_model = ClapAudioModel(config)
        self.audio_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_model.audio_encoder.patch_embed.proj

    @auto_docstring
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapAudioModelOutput]:
        r"""
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_embeds = self.audio_projection(pooled_output)

        if not return_dict:
            outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return ClapAudioModelOutput(
            audio_embeds=audio_embeds,
            last_hidden_state=audio_outputs.last_hidden_state,
            attentions=audio_outputs.attentions,
            hidden_states=audio_outputs.hidden_states,
        )


__all__ = [
    "ClapModel",
    "ClapPreTrainedModel",
    "ClapTextModel",
    "ClapTextModelWithProjection",
    "ClapAudioModel",
    "ClapAudioModelWithProjection",
]