"""Transformer modules."""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import constant_, xavier_uniform_

from .conv import Conv
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch

__all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
           'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')


class TransformerEncoderLayer(nn.Module):
    """Defines a single layer of the transformer encoder."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """Initialize the TransformerEncoderLayer with specified parameters."""
        super().__init__()
        from ...utils.torch_utils import TORCH_1_9
        if not TORCH_1_9:
            raise ModuleNotFoundError(
                'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
        # Implementation of the feedforward model
        self.fc1 = nn.Linear(c1, cm)
        self.fc2 = nn.Linear(cm, c1)

        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.act = act
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos=None):
        """Add position embeddings to the tensor if provided."""
        return tensor if pos is None else tensor + pos

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Performs forward pass with post-normalization."""
        q = k = self.with_pos_embed(src, pos)
        src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.fc2(self.dropout(self.act(self.fc1(src))))
        src = src + self.dropout2(src2)
        return self.norm2(src)

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Performs forward pass with pre-normalization."""
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
        return src + self.dropout2(src2)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Forward propagates the input through the encoder module."""
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
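

# Illustrative usage sketch, not part of the original module. The batch and sequence sizes
# below are assumptions for demonstration; the layer itself only fixes the channel width c1.
def _example_transformer_encoder_layer():
    """Pass a [batch, seq, channel] tensor through one encoder layer; the shape is preserved."""
    layer = TransformerEncoderLayer(c1=256, cm=1024, num_heads=8)
    src = torch.randn(2, 100, 256)  # batch_first layout: [B, N, C]
    assert layer(src).shape == src.shape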


class AIFI(TransformerEncoderLayer):
    """Defines the AIFI transformer layer."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
        """Initialize the AIFI instance with specified parameters."""
        super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

    def forward(self, x):
        """Forward pass for the AIFI transformer layer."""
        c, h, w = x.shape[1:]
        pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
        # Flatten [B, C, H, W] to [B, HxW, C], run the encoder layer, then restore the spatial layout
        x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype))
        return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()

    @staticmethod
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """Builds 2D sine-cosine position embedding."""
        grid_w = torch.arange(int(w), dtype=torch.float32)
        grid_h = torch.arange(int(h), dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1.0 / (temperature ** omega)

        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]


class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        """Initializes a self-attention mechanism using linear transformations and multi-head attention."""
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Apply a transformer block to the input x and return the output."""
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        return self.fc2(self.fc1(x)) + x


class TransformerBlock(nn.Module):
    """Vision Transformer https://arxiv.org/abs/2010.11929."""

    def __init__(self, c1, c2, num_heads, num_layers):
        """Initialize a Transformer module with position embedding and specified number of heads and layers."""
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
        self.c2 = c2

    def forward(self, x):
        """Forward propagates the input through the bottleneck module."""
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        p = x.flatten(2).permute(2, 0, 1)
        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
S )r   z6Implements a single block of a multi-layer perceptron.c                     t         |           t        j                  ||      | _        t        j                  ||      | _         |       | _        y)zcInitialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.N)r   r   r!   r$   lin1lin2r-   )r/   embedding_dimmlp_dimr-   r3   s       r4   r   zMLPBlock.__init__   s;    IImW5	IIg}5	5r5   rb   returnc                 `    | j                  | j                  | j                  |                  S )zForward pass for the MLPBlock.)r   r-   r   r   s     r4   rJ   zMLPBlock.forward   s"    yy$))A,/00r5   )rM   rN   rO   rP   r!   rQ   r   rm   TensorrJ   rR   rS   s   @r4   r   r      s-    @3577 1 1%,, 1r5   r   c                   (     e Zd ZdZ fdZd Z xZS )r   z=Implements a simple multi-layer perceptron (also called FFN).c                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        y)zXInitialize the MLP with specified input, hidden, output dimensions and number of layers.r   c              3   N   K   | ]  \  }}t        j                  ||        y wrK   )r!   r$   )r   nrD   s      r4   r   zMLP.__init__.<locals>.<genexpr>   s!     #g@f1BIIaO@fs   #%N)r   r   r   r!   
ModuleListziplayers)r/   	input_dim
hidden_dim
output_dimr   rd   r3   s         r4   r   zMLP.__init__   sS    $LJN+mm#gYKRSOUVZdYeUe@f#ggr5   c                     t        | j                        D ]:  \  }}|| j                  dz
  k  rt        j                   ||            n ||      }< |S )z Forward pass for the entire MLP.r   )	enumerater   r   Frelu)r/   rb   ilayers       r4   rJ   zMLP.forward   sF    !$++.HAu$%!(;$;uQx qA /r5   r   rS   s   @r4   r   r      s    Ghr5   r   c                   *     e Zd ZdZd fd	Zd Z xZS )r   aP  
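

# Illustrative usage sketch, not part of the original module. The dimensions are assumptions
# showing the common case of a 3-layer FFN head that regresses 4 box coordinates.
def _example_mlp():
    """Map 256-dim features to 4 outputs through two hidden ReLU layers."""
    head = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    x = torch.randn(2, 300, 256)
    assert head(x).shape == (2, 300, 4)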


class LayerNorm2d(nn.Module):
    """
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    Original implementation at
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, num_channels, eps=1e-6):
        """Initialize LayerNorm2d with the given parameters."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """Perform forward pass for 2D layer normalization."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]


class MSDeformAttn(nn.Module):
    """
    Multi-Scale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    """

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """Initialize MSDeformAttn with the given parameters."""
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
        _d_per_head = d_model // n_heads
        # Setting _d_per_head to a power of 2 is more efficient in a CUDA implementation
        assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """Reset module parameters."""
        constant_(self.sampling_offsets.weight.data, 0.0)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
            1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.0)
        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.0)

    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        Perform forward pass for multi-scale deformable attention.

        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

        Args:
            query (torch.Tensor): [bs, query_length, C]
            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (torch.Tensor): [bs, value_length, C]
            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]
        """
        bs, len_q = query.shape[:2]
        len_v = value.shape[1]
        assert sum(s[0] * s[1] for s in value_shapes) == len_v

        value = self.value_proj(value)
        if value_mask is not None:
            value = value.masked_fill(value_mask[..., None], float(0))
        value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels,
                                                                  self.n_points)
        num_points = refer_bbox.shape[-1]
        if num_points == 2:
            offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
            add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            sampling_locations = refer_bbox[:, :, None, :, None, :] + add
        elif num_points == 4:
            add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
            sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
        else:
            raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
        output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
        return self.output_proj(output)


class DeformableTransformerDecoderLayer(nn.Module):
    """
    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
    """

    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
        """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
        super().__init__()

        # Self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # Cross attention
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # FFN
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.act = act
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Add positional embeddings to the input tensor, if provided."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        """Perform forward pass through the Feed-Forward Network part of the layer."""
        tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        return self.norm3(tgt)

    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
        """Perform the forward pass through the entire decoder layer."""

        # Self attention
        q = k = self.with_pos_embed(embed, query_pos)
        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
                             attn_mask=attn_mask)[0].transpose(0, 1)
        embed = embed + self.dropout1(tgt)
        embed = self.norm1(embed)

        # Cross attention
        tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
                              padding_mask)
        embed = embed + self.dropout2(tgt)
        embed = self.norm2(embed)

        # FFN
        return self.forward_ffn(embed)


class DeformableTransformerDecoder(nn.Module):
    """
    Implementation of Deformable Transformer Decoder based on PaddleDetection.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    """

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """Initialize the DeformableTransformerDecoder with the given parameters."""
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(
            self,
            embed,  # decoder embeddings
            refer_bbox,  # anchor
            feats,  # image features
            shapes,  # feature shapes
            bbox_head,
            score_head,
            pos_mlp,
            attn_mask=None,
            padding_mask=None):
        """Perform the forward pass through the entire decoder."""
        output = embed
        dec_bboxes = []
        dec_cls = []
        last_refined_bbox = None
        refer_bbox = refer_bbox.sigmoid()
        for i, layer in enumerate(self.layers):
            output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))

            bbox = bbox_head[i](output)
            refined_bbox = torch.sigmoid(bbox + inverse_sigmoid(refer_bbox))

            if self.training:
                dec_cls.append(score_head[i](output))
                if i == 0:
                    dec_bboxes.append(refined_bbox)
                else:
                    dec_bboxes.append(torch.sigmoid(bbox + inverse_sigmoid(last_refined_bbox)))
            elif i == self.eval_idx:
                dec_cls.append(score_head[i](output))
                dec_bboxes.append(refined_bbox)
                break

            last_refined_bbox = refined_bbox
            refer_bbox = refined_bbox.detach() if self.training else refined_bbox

        return torch.stack(dec_bboxes), torch.stack(dec_cls)
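

# Illustrative end-to-end sketch, not part of the original module: wiring the decoder with
# per-layer box and score heads the way an RT-DETR-style detector would. The head widths,
# class count and single 8x8 feature level are assumptions for demonstration only.
def _example_deformable_decoder():
    """Decode 100 queries into stacked per-layer boxes [nl, B, nq, 4] and scores [nl, B, nq, nc]."""
    nl, nc = 2, 80  # decoder layers, classes
    layer = DeformableTransformerDecoderLayer(d_model=256, n_levels=1)
    decoder = DeformableTransformerDecoder(hidden_dim=256, decoder_layer=layer, num_layers=nl)
    embed = torch.randn(2, 100, 256)
    refer_bbox = torch.randn(2, 100, 4)  # unnormalized anchors; the decoder applies sigmoid itself
    feats = torch.randn(2, 64, 256)  # one flattened 8x8 feature level
    bbox_head = nn.ModuleList(MLP(256, 256, 4, num_layers=3) for _ in range(nl))
    score_head = nn.ModuleList(nn.Linear(256, nc) for _ in range(nl))
    pos_mlp = MLP(4, 512, 256, num_layers=2)  # maps a (cx, cy, w, h) box to a query position embedding
    dec_bboxes, dec_cls = decoder(embed, refer_bbox, feats, [(8, 8)], bbox_head, score_head, pos_mlp)
    assert dec_bboxes.shape == (nl, 2, 100, 4) and dec_cls.shape == (nl, 2, 100, nc)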