# torch/nested/_internal/sdpa.py
import logging
from typing import Optional

import torch
import torch.nn
import torch.nn.functional as F
from torch.backends.cuda import (
    can_use_cudnn_attention,
    can_use_efficient_attention,
    can_use_flash_attention,
    cudnn_sdp_enabled,
    flash_sdp_enabled,
    math_sdp_enabled,
    mem_efficient_sdp_enabled,
    SDPAParams,
)
from torch.nn.attention import SDPBackend

from .nested_tensor import NestedTensor


log = logging.getLogger(__name__)


def _validate_sdpa_input(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    if (
        not isinstance(query, NestedTensor)
        or not isinstance(key, NestedTensor)
        or not isinstance(value, NestedTensor)
    ):
        raise ValueError(
            f"Expected query, key, and value to be nested tensors, "
            f"but got query.is_nested: {query.is_nested}, key.is_nested: {key.is_nested}, "
            f"and value.is_nested: {value.is_nested} instead."
        )
    if query.dtype != key.dtype or query.dtype != value.dtype:
        raise ValueError(
            f"Expected query, key, and value to have the same dtype, "
            f"but got query.dtype: {query.dtype}, key.dtype: {key.dtype}, "
            f"and value.dtype: {value.dtype} instead."
        )
    if query.device != key.device or query.device != value.device:
        raise ValueError(
            f"Expected query, key, and value to have the same device type, "
            f"but got query.device: {query.device}, key.device: {key.device}, "
            f"and value.device: {value.device} instead."
        )
    if query.dim() < 3 or key.dim() < 3 or value.dim() < 3:
        raise ValueError(
            f"Expected query, key, and value to all be  at least 3 dimensional, but got "
            f"query.dim: {query.dim()}, key.dim: {key.dim()} and value.dim: {value.dim()} instead."
        )
    if query._ragged_idx != key._ragged_idx or query._ragged_idx != value._ragged_idx:
        raise ValueError(
            f"Expected query, key, and value to all be ragged on the same dimension, "
            f"but got ragged dims {query._ragged_idx}, {key._ragged_idx}, and "
            f"{value._ragged_idx}, respectively."
        )
    if attn_mask is not None:
        raise ValueError("Masks are not yet supported!")


def _check_batch_size_nested(params: SDPAParams, debug=False) -> bool:
    # The inputs are 4-dimensional at this point, so size(0) is the batch size.
    q_batch_size = params.query.size(0)
    k_batch_size = params.key.size(0)
    v_batch_size = params.value.size(0)
    return q_batch_size == k_batch_size and q_batch_size == v_batch_size


def _check_head_dim_size_flash_nested(params: SDPAParams, debug=False) -> bool:
    max_size = 256
    query_size_last = params.query.size(-1)
    key_size_last = params.key.size(-1)
    value_size_last = params.value.size(-1)
    same_head_dim_size = (
        query_size_last == key_size_last and query_size_last == value_size_last
    )
    if not (
        same_head_dim_size
        and (query_size_last % 8 == 0)
        and (query_size_last <= max_size)
    ):
        if debug:
            log.warning(
                "For NestedTensor inputs, Flash attention requires q,k,v to have the same "
                "last dimension and to be a multiple of 8 and less than or equal to 256. "
                "Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.",
                query_size_last,
                key_size_last,
                value_size_last,
            )
        return False
    return True


def _check_head_dim_size_cudnn_nested(params: SDPAParams, debug=False) -> bool:
    max_size = 128
    query_size_last = params.query.size(-1)
    key_size_last = params.key.size(-1)
    value_size_last = params.value.size(-1)
    same_head_dim_size = (
        query_size_last == key_size_last and query_size_last == value_size_last
    )
    if not (
        same_head_dim_size
        and (query_size_last % 8 == 0)
        and (query_size_last <= max_size)
    ):
        if debug:
            log.warning(
                "For NestedTensor inputs, cuDNN attention requires q,k,v to have the same "
                "last dimension and to be a multiple of 8 and less than or equal to 128. "
                "Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.",
                query_size_last,
                key_size_last,
                value_size_last,
            )
        return False
    return True


def _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
    param: torch.Tensor, param_name: str, debug=False
) -> bool:
    assert isinstance(param, NestedTensor), "param should be a jagged NT"

    if param._ragged_idx == 1:
        # num_head_dims is ragged
        if debug:
            log.warning(
                "Fused kernels do not support ragged num_head_dims, %s has a ragged num_heads.",
                param_name,
            )
        return False

    if param._get_min_seqlen() == 0:
        if debug:
            log.warning(
                "Fused kernels do not support seq_len == 0, %s has a seq len of 0.",
                param_name,
            )
        return False

    return True


def _try_broadcast_param_size(q_size, k_size, v_size, param_name, debug=False) -> bool:
    max_size = max(q_size, k_size, v_size)
    if (
        (q_size != max_size and q_size != 1)
        or (k_size != max_size and k_size != 1)
        or (v_size != max_size and v_size != 1)
    ):
        if debug:
            log.warning(
                "Both fused kernels require query, key and value to have broadcastable %s, "
                "got Query %s %d, Key %s %d, Value %s %d instead.",
                param_name, param_name, q_size, param_name, k_size, param_name, v_size,
            )
        return False
    return True


def _check_for_seq_len_0_nested(params: SDPAParams, debug=False) -> bool:
    # When this function is called we are assured that the nested tensors are 4-dim.
    q_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.query, "query", debug
        )
        if params.query.is_nested
        else True
    )
    # Short circuit if any input is unsafe.
    if not q_is_safe:
        return False

    k_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.key, "key", debug
        )
        if params.key.is_nested
        else True
    )
    if not k_is_safe:
        return False

    v_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.value, "value", debug
        )
        if params.value.is_nested
        else True
    )
    if not v_is_safe:
        return False

    # None of the inputs have ragged num_heads at this point, so size(1) is safe to read.
    q_num_heads = params.query.size(1)
    k_num_heads = params.key.size(1)
    v_num_heads = params.value.size(1)
    same_num_heads = q_num_heads == k_num_heads and q_num_heads == v_num_heads

    if not same_num_heads:
        if (
            params.query.requires_grad
            or params.key.requires_grad
            or params.value.requires_grad
        ):
            if debug:
                log.warning(
                    "Both fused kernels do not support training with broadcasted NT inputs."
                )
            return False
        return _try_broadcast_param_size(
            q_num_heads, k_num_heads, v_num_heads, "num heads", debug
        )
    return True


def _can_use_flash_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    constraints = (
        _check_batch_size_nested,
        _check_head_dim_size_flash_nested,
        _check_for_seq_len_0_nested,
    )
    for constraint in constraints:
        if not constraint(params, debug):
            return False
    return True


def _can_use_efficient_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    constraints = (
        _check_batch_size_nested,
        _check_for_seq_len_0_nested,
    )
    for constraint in constraints:
        if not constraint(params, debug):
            return False
    return True


def _can_use_math_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    if (
        not params.query.transpose(1, 2).is_contiguous()
        or not params.key.transpose(1, 2).is_contiguous()
        or not params.value.transpose(1, 2).is_contiguous()
    ):
        if debug:
            log.warning(
                "If inputs are nested tensors they must be contiguous after transposing."
            )
        return False
    if params.is_causal:
        if debug:
            log.warning(
                "Nested tensors for query / key are not supported when is_causal=True."
            )
        return False
    return True


def _select_sdp_backend(query, key, value, attn_mask, dropout, is_causal, enable_gqa=False):
    if (
        not flash_sdp_enabled()
        and not mem_efficient_sdp_enabled()
        and not math_sdp_enabled()
        and not cudnn_sdp_enabled()
    ):
        return SDPBackend.ERROR

    ordering = (
        SDPBackend.FLASH_ATTENTION,
        SDPBackend.EFFICIENT_ATTENTION,
        SDPBackend.MATH,
        SDPBackend.CUDNN_ATTENTION,
    )

    params = SDPAParams(query, key, value, attn_mask, dropout, is_causal, enable_gqa)

    for backend in ordering:
        if backend == SDPBackend.CUDNN_ATTENTION:
            if can_use_cudnn_attention(params):
                return SDPBackend.CUDNN_ATTENTION
        if backend == SDPBackend.FLASH_ATTENTION:
            if can_use_flash_attention(params) and _can_use_flash_sdpa_jagged(params):
                return SDPBackend.FLASH_ATTENTION
        if backend == SDPBackend.EFFICIENT_ATTENTION:
            if can_use_efficient_attention(params) and _can_use_efficient_sdpa_jagged(
                params
            ):
                return SDPBackend.EFFICIENT_ATTENTION
        if backend == SDPBackend.MATH:
            if math_sdp_enabled() and _can_use_math_sdpa_jagged(params):
                return SDPBackend.MATH

    # No backend was usable; re-run the checks with debug=True so the reasons are
    # logged before reporting failure.
    log.warning("Memory efficient kernel not used because:")
    can_use_efficient_attention(params, debug=True)
    _can_use_efficient_sdpa_jagged(params, debug=True)
    log.warning("Flash attention kernel not used because:")
    can_use_flash_attention(params, debug=True)
    _can_use_flash_sdpa_jagged(params, debug=True)
    log.warning("Math attention kernel not used because:")
    _can_use_math_sdpa_jagged(params, debug=True)
    log.warning("cuDNN attention kernel not used because:")
    can_use_cudnn_attention(params, debug=True)
    return SDPBackend.ERROR
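

# Illustrative note (not part of the original module): callers can narrow the set
# of candidate kernels consulted above with the public torch.nn.attention.sdpa_kernel
# context manager, which toggles the same *_sdp_enabled() flags that
# _select_sdp_backend checks, e.g.:
#
#     from torch.nn.attention import SDPBackend, sdpa_kernel
#     with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]):
#         out = F.scaled_dot_product_attention(q, k, v)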


def _cumulative_and_max_seq_len_nnz(qkv: torch.Tensor) -> tuple[torch.Tensor, int, int]:
    # Calculates the metadata needed by the flash-attention and efficient-attention
    # kernels: the cumulative sequence lengths over the batch, the maximum sequence
    # length, and the total number of elements (nnz).
    if not isinstance(qkv, NestedTensor):
        raise ValueError("QKV must be nested for flash cumulative_seq_len calculation.")

    if qkv.lengths() is None:
        cumulative_seqlen = qkv.offsets().to(dtype=torch.int32, device=qkv.device)
        max_seqlen = qkv._get_max_seqlen()
        n_elem = qkv.values().shape[0]
    else:
        cumulative_seqlen = (
            qkv.lengths().cumsum(0).to(dtype=torch.int32, device=qkv.device)
        )
        max_seqlen = qkv._get_max_seqlen()
        n_elem = int(cumulative_seqlen[-1].item())
    return cumulative_seqlen, max_seqlen, n_elem


def _is_safe_to_get_storage_as_tensor(tensor: torch.Tensor):
    # Viewing the jagged values buffer directly is only safe when the components
    # are laid out uniformly and densely packed; otherwise callers fall back to
    # .contiguous(). NOTE: the exact stride condition below is a conservative
    # approximation of the original check.
    assert isinstance(tensor, NestedTensor)
    offsets = tensor.offsets()
    strides = tensor._strides

    n_tensors = offsets.size(0) - 1
    if n_tensors <= 1:
        return True

    prev_stride = strides[1]
    for stride in strides[2:]:
        if prev_stride < stride:
            return False
        prev_stride = stride
    return True


def _view_as_dense(
    tensor: torch.Tensor, Nnz: int, num_heads: int, head_dim: int
) -> torch.Tensor:
    if tensor.is_nested:
        return tensor.values()
    return tensor.view(Nnz, num_heads, head_dim)


def _sdpa_nested_preprocessing(query, key, value):
    # Inputs are jagged NTs laid out as (batch, num_heads, {seq_len}, head_dim).
    q_batch_size = query.size(0)
    k_batch_size = key.size(0)
    v_batch_size = value.size(0)

    q_num_heads = query.size(1)
    k_num_heads = key.size(1)
    v_num_heads = value.size(1)

    if not (q_batch_size == k_batch_size and q_batch_size == v_batch_size) or not (
        q_num_heads == k_num_heads and k_num_heads == v_num_heads
    ):
        raise RuntimeError(
            "This path is currently not implemented for jagged layout NT."
        )

    num_heads = query.size(1)
    head_dim_qk = query.size(-1)
    head_dim_v = value.size(-1)
    q_t = query.transpose(1, 2)
    k_t = key.transpose(1, 2)
    v_t = value.transpose(1, 2)

    (
        cumulative_sequence_length_q,
        max_seqlen_batch_q,
        Nnz_q,
    ) = _cumulative_and_max_seq_len_nnz(q_t)
    (
        cumulative_sequence_length_kv,
        max_seqlen_batch_kv,
        Nnz_kv,
    ) = _cumulative_and_max_seq_len_nnz(k_t)

    # If the physical layout of the NestedTensor's storage is not
    # (batch, {seq_len}, num_heads, head_dim) we need to call contiguous.
    if not q_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(q_t):
        q_t = q_t.contiguous()
    if not k_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(k_t):
        k_t = k_t.contiguous()
    if not v_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(v_t):
        v_t = v_t.contiguous()

    query_buffer_reshaped = _view_as_dense(q_t, Nnz_q, num_heads, head_dim_qk)
    key_buffer_reshaped = _view_as_dense(k_t, Nnz_kv, num_heads, head_dim_qk)
    value_buffer_reshaped = _view_as_dense(v_t, Nnz_kv, num_heads, head_dim_v)

    output_nt_info = {
        "offsets": q_t.offsets(),
        "lengths": q_t.lengths(),
        "max_seqlen": q_t._get_max_seqlen(),
        "min_seqlen": q_t._get_min_seqlen(),
    }

    return (
        query_buffer_reshaped,
        key_buffer_reshaped,
        value_buffer_reshaped,
        cumulative_sequence_length_q,
        cumulative_sequence_length_kv,
        max_seqlen_batch_q,
        max_seqlen_batch_kv,
        output_nt_info,
    )


def _pad_last_dim(tensor: torch.Tensor, alignment_size: int, slice: bool) -> torch.Tensor:
    # FlashAttentionV2 requires the head dimension to be a multiple of 8; pad the
    # last dim up to the next multiple and optionally slice back afterwards.
    last_dim_size = tensor.size(-1)
    if last_dim_size % alignment_size == 0:
        return tensor
    pad_count = alignment_size - (last_dim_size % alignment_size)
    tensor = torch.nn.functional.pad(tensor, [0, pad_count])
    if slice:
        return tensor[..., 0:last_dim_size]
    return tensor


def _calculate_scale(query, scale):
    softmax_scale = (
        scale if scale is not None else torch.sym_sqrt(1.0 / query.size(-1))
    )
    return softmax_scale


def _post_process_flash_output(out: torch.Tensor, og_size: int):
    if not out.is_nested and out.size(-1) != og_size:
        out = out[..., 0:og_size]
    return out


def _is_computing_meta_flops(x):
    # There is a use case of running the dispatch-based flop counter with
    # meta-device inputs; detect that so the math backend (which calls .item())
    # can be avoided.
    if not torch.jit.is_scripting() and x.device.type == "meta":
        torch_dispatch_mode_stack = (
            torch.utils._python_dispatch._get_current_dispatch_mode_stack()
        )
        return any(
            type(x) == torch.utils.flop_counter._FlopCounterMode
            for x in torch_dispatch_mode_stack
        )
    return False


def _autocast(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """
    [Autocasting SDPA for NJT]

    Normal autocasting doesn't work for NJT+SDPA right now:
    * NJT intercepts the __torch_function__ call for scaled_dot_product_attention, which happens
      before we get to any aten ops or dispatcher logic; then the torch_function logic calls into
      efficient attention or flash attention. So, autocasting on the scaled_dot_product_attention
      op won't work because we never see that aten op.
    * If we put autocasting on `_flash_attention_forward`, then we'll get autocasting to run, but
      the kernel selection logic in torch_function handling (ie. jagged_scaled_dot_product_attention)
      won't work correctly: the kernel selection logic will run before autocasting, and choose
      a kernel based on the un-autocasted dtypes; but then autocasting will run and the actual
      attention computation will happen in a different dtype.

    An alternative is to just change the backend selection logic for SDPA+NJT to be autocast-aware
    and rely on autocasting to do the actual conversions for flash attention / efficient attention.
    However, by manually doing the actual autocast before the backend selection, we ensure that the
    autocast handling for backend selection doesn't diverge from the autocast handling for the
    actual dtype conversions.
    """
    device_type = query.device.type
    # The meta device is not supported by autocast, so break out early for it.
    if _is_computing_meta_flops(query) or not torch.is_autocast_enabled(device_type):
        return query, key, value, attn_mask

    def cvt(x):
        if x is None:
            return x
        target_dtype = torch.get_autocast_dtype(device_type)
        if (
            not x.dtype.is_floating_point
            or x.dtype == target_dtype
            or x.dtype == torch.float64
        ):
            return x
        return x.to(target_dtype)

    return cvt(query), cvt(key), cvt(value), cvt(attn_mask)


def jagged_scaled_dot_product_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
    enable_gqa=False,
):
    query, key, value, attn_mask = _autocast(query, key, value, attn_mask)
    _validate_sdpa_input(query, key, value, attn_mask, dropout_p, is_causal, scale)
    assert (
        isinstance(query, NestedTensor)
        and isinstance(key, NestedTensor)
        and isinstance(value, NestedTensor)
    )
    from torch.nested._internal.nested_tensor import (
        nested_view_from_values_offsets_lengths,
    )

    # Special path when the ragged dim is dim 1 (non-ragged sequence length):
    # the dense values buffers can be sent through vanilla SDPA directly.
    if query.dim() > 3 and key.dim() > 3 and value.dim() > 3 and query._ragged_idx == 1:
        output = F.scaled_dot_product_attention(
            query.values(),
            key.values(),
            value.values(),
            attn_mask=(
                attn_mask.values() if isinstance(attn_mask, NestedTensor) else attn_mask
            ),
            dropout_p=dropout_p,
            is_causal=is_causal,
            scale=scale,
        )
        return nested_view_from_values_offsets_lengths(
            output,
            query.offsets(),
            query.lengths(),
            min_seqlen=query._maybe_min_seqlen,
            max_seqlen=query._maybe_max_seqlen,
        )

    compute_logsumexp = query.requires_grad or key.requires_grad or value.requires_grad

    backend_choice = _select_sdp_backend(
        query, key, value, attn_mask, dropout_p, is_causal, enable_gqa
    )

    if _is_computing_meta_flops(query):
        # Backend choice is device-aware and will likely be wrong on meta tensors;
        # avoid the math backend (which calls .item()) and pick flash arbitrarily.
        backend_choice = SDPBackend.FLASH_ATTENTION

    if backend_choice == SDPBackend.FLASH_ATTENTION:
        og_size = query.size(-1)
        query_padded = _pad_last_dim(query, 8, False)
        key_padded = _pad_last_dim(key, 8, False)
        value_padded = _pad_last_dim(value, 8, False)
        # The scale must be computed from the original (unpadded) head dim.
        og_scale = _calculate_scale(query, scale)
        (
            query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped,
            cumulative_sequence_length_q, cumulative_sequence_length_kv,
            max_seqlen_batch_q, max_seqlen_batch_kv, output_nt_info,
        ) = _sdpa_nested_preprocessing(query_padded, key_padded, value_padded)

        attention, _logsumexp, _philox_seed, _philox_offset, _debug_attn_mask = (
            torch.ops.aten._flash_attention_forward(
                query_buffer_reshaped,
                key_buffer_reshaped,
                value_buffer_reshaped,
                cumulative_sequence_length_q,
                cumulative_sequence_length_kv,
                max_seqlen_batch_q,
                max_seqlen_batch_kv,
                dropout_p,
                is_causal,
                False,
                scale=og_scale,
            )
        )
        # Rewrap the [nnz, num_heads, head_dim] output as a jagged NT and restore
        # the (batch, num_heads, seq_len, head_dim) layout.
        attention = nested_view_from_values_offsets_lengths(
            attention, **output_nt_info
        ).transpose(1, 2)
        return _post_process_flash_output(attention, og_size)
    elif backend_choice == SDPBackend.EFFICIENT_ATTENTION:
        (
            query_reshaped, key_reshaped, value_reshaped,
            cumulative_sequence_length_q, cumulative_sequence_length_kv,
            max_seqlen_batch_q, max_seqlen_batch_kv, output_nt_info,
        ) = _sdpa_nested_preprocessing(query, key, value)
        attention, log_sumexp, seed, offset, max_seqlen_q, max_seqlen_kv = (
            torch.ops.aten._efficient_attention_forward(
                query_reshaped.unsqueeze(0),
                key_reshaped.unsqueeze(0),
                value_reshaped.unsqueeze(0),
                None,
                cumulative_sequence_length_q,
                cumulative_sequence_length_kv,
                max_seqlen_batch_q,
                max_seqlen_batch_kv,
                dropout_p,
                int(is_causal),
                compute_logsumexp,
                scale=scale,
            )
        )
        # Rewrap the dense output as a jagged NT.
        return nested_view_from_values_offsets_lengths(
            attention.squeeze(0), **output_nt_info
        ).transpose(1, 2)
    elif backend_choice == SDPBackend.CUDNN_ATTENTION:
        (
            query_reshaped, key_reshaped, value_reshaped,
            cumulative_sequence_length_q, cumulative_sequence_length_kv,
            max_seqlen_batch_q, max_seqlen_batch_kv, output_nt_info,
        ) = _sdpa_nested_preprocessing(query, key, value)
        # The dense varlen buffers feed torch.ops.aten._cudnn_attention_forward and
        # the attention output is rewrapped as a jagged NT, mirroring the branches
        # above. NOTE: the exact argument list of the low-level cuDNN op is not
        # reproduced here.
        ...
    elif backend_choice == SDPBackend.MATH:
        # Save the input metadata so the final output can be rewrapped as a jagged NT.
        offsets = query.offsets()
        q_lengths = query.lengths()
        min_seqlen = query._maybe_min_seqlen
        max_seqlen = query._maybe_max_seqlen
        d1 = query._size[1]
        d2 = value._size[-1]

        # Convert jagged-layout nested tensors to strided-layout nested tensors,
        # which the math implementation of SDPA supports.
        def get_strided_layout_nested_tensor(jagged_layout_nt):
            lengths = jagged_layout_nt._offsets[1:] - jagged_layout_nt._offsets[:-1]
            transposed = torch.transpose(jagged_layout_nt, 1, 2)
            tensor_list = transposed.values().split(list(lengths), dim=0)
            strided_nt = torch.nested.as_nested_tensor(list(tensor_list))
            strided_nt = strided_nt.transpose(1, 2).contiguous()
            return strided_nt

        query = get_strided_layout_nested_tensor(query)
        key = get_strided_layout_nested_tensor(key)
        value = get_strided_layout_nested_tensor(value)

        attn_out = torch._scaled_dot_product_attention_math(
            query, key, value, attn_mask, dropout_p, is_causal, scale=scale
        )[0]

        # Convert the strided-layout result back to a jagged-layout nested tensor.
        attn_out = attn_out.transpose(1, 2).contiguous().values()
        attn_out = attn_out.view(-1, d1, d2)
        attn_out = nested_view_from_values_offsets_lengths(
            attn_out, offsets, q_lengths, min_seqlen=min_seqlen, max_seqlen=max_seqlen
        ).transpose(1, 2)
        return attn_out
    else:
        raise RuntimeError(
            "No viable backend for scaled_dot_product_attention was found."
        )