o
    wZhK5                     @   s*  U d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
Z
ddlZddlZddlZej Zeo:ej dkZerBedndZerMedd Znedd Zed	d Zed
d Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Z edd Z!dd Z"dd Z#edd Z$edd Z%dd Z&dd Z'dd  Z(ed!d Z)e*e+d"< ed#d Z,e*e+d$< ed%d Z-e*e+d&< ed'd Z.e*e+d(< eoe Z/e*e+d)< ed*d Z0e*e+d+< d,d- Z1ed.d Z2e*e+d/< ed0d Z3e*e+d1< er+zddl4Z5e5j6 Z7W n e8y*   d2Z7d2ZY nw d2Z7d2a9d3d4 Z:ej;d5d6 Z<ej;dQd8d9Z=ej;d:d; Z>dQd<d=Z?d>d? Z@d@dA ZAdBdC ZBdDdE ZCdFdG ZDdHdI ZEeD ZFeE ZGdJejHjIdfdKdLZJdJejKejHjIdfdMdNZLdOdP ZMesej rJ dS dS )Rz>This file is allowed to initialize CUDA context when imported.    N)LazyVal
TEST_NUMBATEST_WITH_ROCM	TEST_CUDA
IS_WINDOWS   zcuda:0c                   C   s   t S N)r    r	   r	   R/var/www/auris/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py<lambda>   s    r   c                   C   s   t otjjtjdtdS )N      ?device)r   torchbackendscudnnZis_acceptableZtensorCUDA_DEVICEr	   r	   r	   r
   r          c                   C   s   t rtjj S dS )Nr   )
TEST_CUDNNr   r   r   versionr	   r	   r	   r
   r      s    c                   C      t j ot j dkS )N)      r   cudais_availableget_device_capabilityr	   r	   r	   r
   r          c                   C   r   )N)   r   r   r	   r	   r	   r
   r      r   c                   C   r   )N)   r   r   r	   r	   r	   r
   r      r   c                   C   r   )N)r   r   r   r	   r	   r	   r
   r      r   c                   C   r   )N)   r   r   r	   r	   r	   r
   r       r   c                   C   r   Nr    	   r   r	   r	   r	   r
   r   !   r   c                   C   r   )N)r#   r   r   r	   r	   r	   r
   r   "   r   c                   C   r   )N)
   r   r   r	   r	   r	   r
   r   #   r   c                   C   s.   t j ot j d dkot j d dkS )Nr   r$      r   r	   r	   r	   r
   r   %   s    c                   C   s   t j ot j dv ptS )N))r   r   )r    r   )r   r   r   r   IS_THORr	   r	   r	   r
   r   '   r   c                   C   s   t j ot j dkS r!   r   r	   r	   r	   r
   r   (   r   c                      s,   t rtjdj t fdddD S dS )Nr   c                 3   s    | ]}| v V  qd S r   r	   ).0archgcn_arch_namer	   r
   	<genexpr>-       zCDNA2OrLater.<locals>.<genexpr>>   Zgfx90aZgfx942F)r   r   r   get_device_propertiesgcnArchNameanyr	   r	   r)   r
   CDNA2OrLater*   s   r0   c                 C   s2   t j sdS t jdj}tjd|}|| kS )NFr   Z/PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE)r   r   r   r-   r.   osenvironget)Zmatching_archr*   r(   r	   r	   r
   evaluate_gfx_arch_exact0   s
   
r4   c                   C      t dS )Ngfx90a:sramecc+:xnack-r4   r	   r	   r	   r
   r   7       c                   C   r5   )Ngfx942:sramecc+:xnack-r7   r	   r	   r	   r
   r   8   r8   c                   C   s&   t r
tdp	tdS trt otS dS )Nr6   r9   F)r   r4   r   r   SM80OrLaterr	   r	   r	   r
   *evaluate_platform_supports_flash_attention:   s
   
r;   c                   C   s    t r
tdp	tdS trdS dS )Nr6   r9   TF)r   r4   r   r	   r	   r	   r
   .evaluate_platform_supports_efficient_attentionA   s
   r<   c                   C   s   t  ototdkS )Ni_ )r   r:   TEST_CUDNN_VERSIONr	   r	   r	   r
   *evaluate_platform_supports_cudnn_attentionH      r>   c                   C      t  S r   )r;   r	   r	   r	   r
   r   K       !PLATFORM_SUPPORTS_FLASH_ATTENTIONc                   C   r@   r   )r<   r	   r	   r	   r
   r   L   rA   #PLATFORM_SUPPORTS_MEM_EFF_ATTENTIONc                   C   r@   r   )r>   r	   r	   r	   r
   r   M   rA   !PLATFORM_SUPPORTS_CUDNN_ATTENTIONc                   C   s   t ptptS r   )rB   rD   rC   r	   r	   r	   r
   r   O   s    !PLATFORM_SUPPORTS_FUSED_ATTENTIONPLATFORM_SUPPORTS_FUSED_SDPAc                   C      t otS r   )r   r:   r	   r	   r	   r
   r   U   r8   PLATFORM_SUPPORTS_BF16c                  C   s   t j rMt jjrDtdd t jjdd d D } dg}| dkr(|dg | dkr1|d	 |D ]}|t j	d
j
v rA dS q3dS tpLt j dkS dS )Nc                 s       | ]}t |V  qd S r   int)r'   vr	   r	   r
   r+   Z   r,   z1evaluate_platform_supports_fp8.<locals>.<genexpr>.r   Zgfx94)r   r   Zgfx120)r   r   Zgfx95r   Tr"   F)r   r   r   r   hiptuplesplitextendappendr-   r.   SM90OrLaterr   )ZROCM_VERSIONZarchsr(   r	   r	   r
   evaluate_platform_supports_fp8W   s   
$
rT   c                   C   r@   r   )rT   r	   r	   r	   r
   r   g   rA   PLATFORM_SUPPORTS_FP8c                   C   rG   r   )r   SM100OrLaterr	   r	   r	   r
   r   i   r8   PLATFORM_SUPPORTS_MX_GEMMFc                  C   sD   t sJ dts ttj D ]} tjdd|  d qdad S d S )Nz?CUDA must be available when calling initialize_cuda_context_rngr%   zcuda:r   T)r   __cuda_ctx_rng_initializedranger   r   device_countrandn)ir	   r	   r
   initialize_cuda_context_rng{   s   r]   c                  c   s    t jjjj} z6dt jjj_t jjjd d d dd d V  W d    n1 s'w   Y  W | t jjj_d S W | t jjj_d S | t jjj_w )NFenabledZ	benchmarkZdeterministic
allow_tf32r   r   r   matmulr`   r   flagsold_allow_tf32_matmulr	   r	   r
   tf32_off   s   rf   h㈵>c                 c   s    t jjjj}| j}z?dt jjj_|| _t jjjd d d dd d V  W d    n1 s-w   Y  W |t jjj_|| _d S W |t jjj_|| _d S |t jjj_|| _w )NTr^   )r   r   r   rb   r`   	precisionr   rc   )selftf32_precisionre   Zold_precisionr	   r	   r
   tf32_on   s    

rk   c                  c   s    t jjjj} z6dt jjj_t jjjddddd dV  W d   n1 s'w   Y  W | t jjj_dS W | t jjj_dS | t jjj_w )z
    Context manager to temporarily enable TF32 for CUDA operations.
    Restores the previous TF32 state after exiting the context.
    TNr^   ra   rd   r	   r	   r
   tf32_enabled   s   rl   c                    s&   dd  fddfdd}|S )Nc                 S   s2   t   |  W d    d S 1 sw   Y  d S r   rf   ri   Zfunction_callr	   r	   r
   with_tf32_disabled   s   "z+tf32_on_and_off.<locals>.with_tf32_disabledc                    s6   t |   |  W d    d S 1 sw   Y  d S r   )rk   rn   )rj   r	   r
   with_tf32_enabled   s   "z*tf32_on_and_off.<locals>.with_tf32_enabledc                    s8   t j}t|  t fdd}|S )Nc                     s   t | D ]\}}| |< qtj }d v r#|o"t d jdk}d v r3|o2 d tjtjhv }|rO d  fdd  d  fdd d S di   d S )	Nr   r   dtyperi   c                          di S Nr	   r	   r	   fkwargsr	   r
   r          zCtf32_on_and_off.<locals>.wrapper.<locals>.wrapped.<locals>.<lambda>c                      rr   rs   r	   r	   rt   r	   r
   r      rw   r	   )zipr   r   Zis_tf32_supportedr   typeZfloat32Z	complex64)argsrv   krL   Zcond)	arg_namesru   ro   rp   )rv   r
   wrapped   s   

z1tf32_on_and_off.<locals>.wrapper.<locals>.wrapped)inspect	signature
parametersrO   keys	functoolswraps)ru   paramsr}   )ro   rp   )r|   ru   r
   wrapper   s
   z tf32_on_and_off.<locals>.wrapperr	   )rj   r   r	   )rj   ro   rp   r
   tf32_on_and_off   s   r   c                    s   t   fdd}|S )Nc                     s8   t    | i |W  d    S 1 sw   Y  d S r   rm   )rz   rv   ru   r	   r
   r}      s   $zwith_tf32_off.<locals>.wrapped)r   r   )ru   r}   r	   r   r
   with_tf32_off   s   r   c                  C   s^   dt j vr	dS t j d} t j | td d  dd }tdd |dD S )	NZMagmar   r   zMagma 
r   c                 s   rI   r   rJ   r'   xr	   r	   r
   r+      r,   z%_get_magma_version.<locals>.<genexpr>rM   )r   Z
__config__showfindlenrP   rO   )positionversion_strr	   r	   r
   _get_magma_version   s
   $r   c                  C   s4   t jjd u rdS tt jj} tdd | dD S )Nr   c                 s   rI   r   rJ   r   r	   r	   r
   r+     r,   z*_get_torch_cuda_version.<locals>.<genexpr>rM   )r   r   r   strrO   rP   )Zcuda_versionr	   r	   r
   _get_torch_cuda_version   s   r   c                  C   s:   t sdS ttjj} | dd } tdd | dD S )Nr   -r   c                 s   rI   r   rJ   r   r	   r	   r
   r+   
  r,   z*_get_torch_rocm_version.<locals>.<genexpr>rM   )r   r   r   r   rN   rP   rO   )rocm_versionr	   r	   r
   _get_torch_rocm_version  s
   r   c                   C   s   t  S r   )r   r	   r	   r	   r
   !_check_cusparse_generic_available  s   r   c                  C   sX   t sdS tjjs
dS ttjj} | dd } tdd | dD }|d u p*|dk  S )NFr   r   c                 s   rI   r   rJ   r   r	   r	   r
   r+     r,   z5_check_hipsparse_generic_available.<locals>.<genexpr>rM   )r   r%   )r   r   r   rN   r   rP   rO   )r   Zrocm_version_tupler	   r	   r
   "_check_hipsparse_generic_available  s   r   r   c           
      C   s   t jt jddt jddj| d}t jt jddt jddj| d}t   t| | D ]	\}}|| q8W d    n1 sLw   Y  ddi}|d ur^|	| || fi |}|| fi |}	||||	fS )Nr    r   lrr   )
r   nnZ
SequentialZLineartoZno_gradrx   r   Zcopy_update)
r   optimizer_ctoroptimizer_kwargsZmod_controlZmod_scalingcsrv   Zopt_controlZopt_scalingr	   r	   r
   !_create_scaling_models_optimizers  s   **

r   c              	   C   s   t jd|| dt jd|| dft jd|| dt jd|| dft jd|| dt jd|| dft jd|| dt jd|| dfg}t j | }d}t| ||d|||f S )N)r    r    )rq   r   r   )r   r   r   )r   r[   r   ZMSELossr   r   )r   rq   r   r   dataZloss_fnZ	skip_iterr	   r	   r
   _create_scaling_case1  s   r   c                 C   s   t s| S t| S r   )IS_SM89unittestZexpectedFailure)funcr	   r	   r
   xfailIfSM89@  r?   r   )rg   )N__doc__r   r   Z
torch.cudaZ$torch.testing._internal.common_utilsr   r   r   r   r   r~   
contextlibr1   r   r   Zis_initializedZ"CUDA_ALREADY_INITIALIZED_ON_IMPORTrZ   ZTEST_MULTIGPUr   r   r   r=   ZSM53OrLaterZSM60OrLaterZSM70OrLaterZSM75OrLaterr:   ZSM89OrLaterrS   rV   r&   Z	IS_JETSONr   r0   r4   ZGFX90A_ExactZGFX942_Exactr;   r<   r>   rB   bool__annotations__rC   rD   rE   rF   rH   rT   rU   rW   Z
numba.cudaZnumbar   ZTEST_NUMBA_CUDA	ExceptionrX   r]   contextmanagerrf   rk   rl   r   r   r   r   r   r   r   ZTEST_CUSPARSE_GENERICZTEST_HIPSPARSE_GENERICZoptimZSGDr   floatr   r   r	   r	   r	   r
   <module>   s   





,%