
    Phl              
       \   d dl Z d dlZd dlmZmZ d dlZd dlZd dlmc m	Z
 d dlmZmZmZmZmZmZmZ ddlmZ  e j*                  e      Z	 	 	 	 d)dej0                  dej0                  dej0                  d	eej0                     fd
Zd*dedefdZd*dedefdZ	 d*dej0                  dedefdZd*defdZd*dedefdZ d*dedefdZ!d*dedefdZ"d*dedefdZ#d*dedefdZ$d Z%dej0                  deej0                  e&e&f   fdZ'dej0                  fdZ(dej0                  de&de&de&dej0                  f
d Z)d! Z*dej0                  d"e&d#edej0                  fd$Z+d% Z,d&ej0                  fd'Z-	 	 	 	 d)dej0                  dej0                  dej0                  d	eej0                     fd(Z.y)+    N)OptionalTuple)can_use_efficient_attentioncan_use_flash_attentionflash_sdp_enabledmath_sdp_enabledmem_efficient_sdp_enabled
SDPAParams
SDPBackend   )NestedTensorquerykeyvalue	attn_maskc           	      2   t        | t              r t        |t              rt        |t              s3t        d| j                   d|j                   d|j                   d      | j                  |j                  k7  s| j                  |j                  k7  r3t        d| j                   d|j                   d|j                   d      | j
                  |j
                  k7  s| j
                  |j
                  k7  r3t        d| j
                   d	|j
                   d
|j
                   d      | j                         dk  s&|j                         dk  s|j                         dk  r?t        d| j                          d|j                          d|j                          d      | j                  |j                  k7  s| j                  |j                  k7  r3t        d| j                   d|j                   d|j                   d      |t        d      y )NzNExpected query, key, and value to be nested tensors, but got query.is_nested: z, key.is_nested: z, and value.is_nested: z	 instead.zLExpected query, key, and value to have the same dtype, but got query.dtype: z, key.dtype: z, and value.dtype: zSExpected query, key, and value to have the same device type, but got query.device: z, key.device: z, and value.device:    zUExpected query, key, and value to all be  at least 2 dimensional, but got query.dim: z, key.dim: z and value.dim: z[Expected query, key, and value to all be ragged on the same dimension, but got ragged dims z, z, and z, respectively.zMasks are not yet supported!)

isinstancer   
ValueError	is_nesteddtypedevicedim_ragged_idxtorchbool)r   r   r   r   	dropout_p	is_causalscales          fC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/nested/_internal/sdpa.py_validate_sdpa_inputr!      s    ul+#|,%.((-'88I#-- Y$$)OO#4I?
 	

 {{cii5;;%++#=$$)KK=cii[ I  %}I7
 	

 ||szz!U\\U\\%A%%*\\N. M!!&i9
 	

 yy{Q#'')a-599;?cyy{m;swwyk1A%))+iY
 	
 COO+u/@/@EDUDU/U%%&b(9@Q@Q?RRac
 	
 788     paramsreturnc                     | j                   j                  d      }| j                  j                  d      }| j                  j                  d      }||k(  xr ||k(  S )Nr   )r   sizer   r   )r#   debugq_batch_sizek_batch_sizev_batch_sizes        r    _check_batch_size_nestedr+   J   sU     <<$$Q'L::??1%L<<$$Q'L
 <'HLL,HHr"   c                    d}| j                   j                  d      }| j                  j                  d      }| j                  j                  d      }||k(  xr ||k(  }|r|dz  dk(  r||k  s|rt        j                  d|||       yy)N      r   zFor NestedTensor inputs, Flash attention requires q,k,v to have the same last dimension and to be a multiple of 8 and less than or equal to 256. Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.FT)r   r&   r   r   logwarning)r#   r'   max_sizequery_size_lastkey_size_lastvalue_size_lastsame_head_dim_sizes          r    !_check_head_dim_size_flash_nestedr7   W   s    Hll''+OJJOOB'Mll''+O=(O_-O  	q A%(KKX   r"   param
param_namec                     t        | t              sJ d       | j                  dk(  r|rt        j	                  d|       y| j
                  dk(  r|rt        j	                  d|       yy)Nzparam should be a jagged NTr   zMFused kernels do not support ragged num_head_dims, %s has a ragged num_heads.Fr   zAFused kernels do not support seq_len == 0, %s has a seq len of 0.T)r   r   r   r0   r1   _min_seqlen)r8   r9   r'   s      r    :_check_for_seq_len_0_and_consistent_head_dim_nested_helperr<   q   so     e\*I,II*AKK_  AKKS r"   c           
          t        | ||      }| |k7  r| dk7  s||k7  r|dk7  s
||k7  r$|dk7  r|rt        j                  d||| ||||       yy)Nr   zzBoth fused kernels require query, key and value to have broadcastable %s, got Query %s %d, Key %s %d, Value %s %d instead.FT)maxr0   r1   )q_sizek_sizev_sizer9   r'   r2   s         r    _try_broadcast_param_sizerB      sl    666*H	8	!h6Q;h6Q;KKC
 r"   c                    | j                   j                  rt        | j                   d|      nd}|sy| j                  j                  rt        | j                  d|      nd}|sy| j                  j                  rt        | j                  d|      nd}|sy| j                   j                  d      }| j                  j                  d      }| j                  j                  d      }||k(  xr ||k(  }|si| j                   j                  s,| j                  j                  s| j                  j                  r|rt        j                  d       yt        |||d|      S y)	Nr   TFr   r   r   zFBoth fused kernels do not support training with broadcasted NT inputs.z	num heads)
r   r   r<   r   r   r&   requires_gradr0   r1   rB   )	r#   r'   	q_is_safe	k_is_safe	v_is_safeq_num_headsk_num_headsv_num_headssame_num_headss	            r    _check_for_seq_len_0_nestedrL      sU    <<!! 	CLL'5	
    :: 	CJJu	
    <<!! 	CLL'5	
    ,,##A&K**//!$K,,##A&K K/NK;4NNLL&&zz''||))\ (k;
 	
 r"   c                     | j                   j                  s,| j                  j                  s| j                  j                  r|rt        j                  d       yy)NzMMemory efficient attention currently doesn't support training with NT inputs.FT)r   rD   r   r   r0   r1   r#   r'   s     r    _check_requires_grad_nestedrO      sE    ""::##<<%% KK_ r"   c                 J    t         t        t        f}|D ]  } || |      r y yNFT)r+   r7   rL   r#   r'   constraints
constraints       r    _can_use_flash_sdpa_jaggedrU      s0     )#K
 "
&%( " r"   c                 J    t         t        t        f}|D ]  } || |      r y yrQ   )rO   r+   rL   rR   s       r    _can_use_efficient_sdpa_jaggedrW      s0    # #K
 "
&%( " r"   c                 x   | j                   j                  dd      j                         rT| j                  j                  dd      j                         r*| j                  j                  dd      j                         s|rt
        j                  d       y| j                  r|rt
        j                  d       yy)Nr   r   zGIf inputs are nested tensors they must be contiguous after transposing.FzENested tensors for query / key are not supported when is_causal=True.T)r   	transposeis_contiguousr   r   r0   r1   r   rN   s     r    _can_use_math_sdpa_jaggedr[     s    LL""1a(668zz##Aq)779||%%a+99;KKY KKW r"   c                 n   t               s$t               st               st        j                  S t        j
                  t        j                  t        j                  f}t        | |||||      }|D ]  }|t        j
                  k(  r(t        |      rt        |      rt        j
                  c S |t        j                  k(  r(t        |      rt        |      rt        j                  c S |t        j                  k(  st               st        |      st        j                  c S  t        j                  d       t        |d       t        |d       t        j                  d       t        |d       t        |d       t        j                  d       t        |d       t        j                  S )Nz)Memory efficient kernel not used because:T)r'   z(Flash attention kernel not used because:z'Math attention kernel not used because:)r   r	   r   r   ERRORFLASH_ATTENTIONEFFICIENT_ATTENTIONMATHr
   r   rU   r   rW   r[   r0   r1   )	r   r   r   r   dropoutr   orderingr#   backends	            r    _select_sdp_backendrd     sF   )+ " 	""&&H sE9gyIFj000&v.3Mf3U!111j444*627U8 "555joo%!&?&G!&  KK;<d3"66KK:;F$/vT2KK9:fD1r"   qkvc                 *   t        | t              st        d      | j                         c| j	                         j                  t        j                  | j                        }| j                  }| j                         j                  d   }n| j                         j                  d      j                  t        j                  | j                        }| j                  d      }| j                  }t        |d   j                               }|||fS )Nz<QKV must be nested for flash cumulative_seq_len calculation.)r   r   r   r.   )r   r   r   lengthsoffsetstor   int32r   _max_seqlenvaluesshapecumsumr&   intitem)re   cumulative_seqlen
max_seqlenn_elem
batch_sizes        r    _cumulative_and_max_seq_len_nnzru   @  s     c<(WXX
{{}KKM,,5;;szz,R__
##A& KKM  #&&U[[&L 	 XXa[
__
&r*//12j&00r"   tensorc                     t        | t              sJ | j                         }| j                  }|j	                  d      dz
  }|dk  ry|d   }|dd  D ]  }||k  r y|} y)Nr   r   Tr   F)r   r   rh   _strider&   )rv   rh   strides	n_tensorsprev_stridestrides         r    !_is_safe_to_get_storage_as_tensorr}   \  sw     fl+++nnGnnGQ!#IA~ !*K!"+&    r"   Nnz	num_headshead_dimc                 `    | j                   r| j                         S | j                  |||      S )N)r   rl   view)rv   r~   r   r   s       r    _view_as_denser   {  s,     }};;sIx00r"   c                    | j                  d      }|j                  d      }|j                  d      }| j                  d      }|j                  d      }|j                  d      }||k(  r||k(  r
||k(  r||k(  st        d      | j                  d      }	| j                  d      }
|j                  d      }| j                  dd      }|j                  dd      }|j                  dd      }t        |      \  }}}t        |      \  }}}|j	                         st        |      s|j                         }|j	                         st        |      s|j                         }|j	                         st        |      s|j                         }t        |||	|
      }t        |||	|
      }t        |||	|      }|j                         |j                  |j                  d}||||||||fS )Nr   r   z<This path is currently not implemented for jagged layout NT.   r   )rh   rk   r;   )r&   RuntimeErrorrY   ru   rZ   r}   
contiguousr   rh   rk   r;   )r   r   r   r(   r)   r*   rH   rI   rJ   r   head_dim_qk
head_dim_vq_tk_tv_tcumulative_sequence_length_qmax_seqlen_batch_qNnz_qcumulative_sequence_length_kvmax_seqlen_batch_kvNnz_kvquery_buffer_reshapedkey_buffer_reshapedvalue_buffer_reshapedoutput_nt_infos                            r    _sdpa_nested_preprocessingr     s    ::a=L88A;L::a=L**Q-K((1+K**Q-KL(\\-I{"{k'AJ
 	

 

1I**Q-KAJ
//!Q
C
--1
C
//!Q
C 	(,	$ 	(,	% 'H'Mnn'H'Mnn'H'Mnn*3y+N(fiM*3	:N ;;=N 	$%	 	r"   alignment_sizeslicec                     | j                  d      }||z  dk(  r| S |||z  z
  }t        j                  j                  j	                  | d|g      } |r	| dd|f   S | S )Nr.   r   .)r&   r   nn
functionalpad)rv   r   r   last_dim_size	pad_counts        r    _pad_last_dimr   Z  sn     KKOM~%*-."@AIXX  $$Va^<Fc1]?*++Mr"   c                 `    ||}|S t        j                  d| j                  d      z        }|S )Ng      ?r.   )mathsqrtr&   )r   r   softmax_scales      r    _calculate_scaler   m  s5    ".EM 59IIcEJJrN>R4SMr"   outc                 X    | j                   s| j                  d      |k7  r	| dd|f   } | S )Nr.   .r   )r   r&   )r   og_sizes     r    _post_process_flash_outputr   r  s/    ==SXXb\W4#qy.!Jr"   c                    t        | ||||||       t        | t              r t        |t              rt        |t              sJ | j                         dkD  r|j                         dkD  r|j                         dkD  r}| j                  dk(  rnddlm} t        j                  | j                  |j                  |j                  t        |t              r|j                  n||||      }t        |fi  ||       S | j                  xs |j                  xs |j                  }	t        | |||||      }
|
t        j                  k(  r| j                  d      }t        | dd      }t        |dd      }t        |dd      }t!        | |      }t#        |||      \  }}}}}}}}t$        j&                  j(                  j+                  |||||||||d|	      \  }}}}}t        |fi |j-                  dd
      }t/        ||      S |
t        j0                  k(  rt#        | ||      \  }}}}}}} }t$        j&                  j(                  j3                  |j5                  d      |j5                  d      |j5                  d      d ||||t7        |      |	|	      \  }}!}"}#}$}t        |j9                  d      fi |j-                  dd
      S |
t        j:                  k(  rt%        j<                  | ||||||	      d   S t?        d      )Nr   r   r   )extract_kwargs)r   r   r   r   r.   r/   F)r   r   z=No viable backend for scaled_dot_product_attention was found.) r!   r   r   r   r   torch.nested._internal.opsr   Fscaled_dot_product_attention_valuesrD   rd   r   r^   r&   r   r   r   r   opsaten_flash_attention_forwardrY   r   r_   _efficient_attention_forward	unsqueezero   squeezer`   "_scaled_dot_product_attention_mathr   )%r   r   r   r   r   r   r   r   outputcompute_logsumexpbackend_choicer   query_padded
key_paddedvalue_paddedog_scaler   r   r   r   r   r   r   r   	attention	logsumexpphilox_seedphilox_offsetdebug_attn_maskquery_reshapedkey_reshapedvalue_reshaped_
log_sumexpseedoffsetmax_seqlen_qs%                                        r    #jagged_scaled_dot_product_attentionr   x  s-    UIy)US 	5,'sL)ul+	, yy{Q3779q=UYY[1_ARARVWAW=//MMKKMM%/	<%H	!!i

 F<nU&;<<++Ws/@/@WEDWDW(sE9iN 333**R.$UAu5"351
$UAu5#E51 '|ZN		
!!() IINN33!!() 4 
	
  !=n=GG1M	))W==	:99	9 'uc59		
() IINN77$$Q'""1%$$Q'()	N 8 
	
  I--a0CNCMMaQRSS	:??	*773y)Ye

 	 K
 	
r"   )Ng        FN)F)/loggingr   typingr   r   r   torch.nntorch.nn.functionalr   r   r   torch.backends.cudar   r   r   r   r	   r
   r   nested_tensorr   	getLogger__name__r0   Tensorr!   r   r+   r7   strr<   rB   rL   rO   rU   rW   r[   rd   ro   ru   r}   r   r   r   r   r   r    r"   r    <module>r      sW     "       (g! )-
0<<0	0 <<0 %	0f
IZ 
I 
Ij $ 6 16<<%(	4RV .8
 8D 8v
 D 	z 	4 		: 	t 	j $ (%P1 1%cSV@V:W 18ell >1LL1"1/21>A1
\\1lFRLL*-6:
\\&
ELL  )-
}
<<}
	}
 <<}
 %	}
r"   