
from typing import Any, Optional, Tuple, Type

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from ultralytics.nn.modules import LayerNorm2d, MLPBlock


class ImageEncoderViT(nn.Module):
    """
    An image encoder using Vision Transformer (ViT) architecture for encoding an image into a compact latent space. The
    encoder takes an image, splits it into patches, and processes these patches through a series of transformer blocks.
    The encoded patches are then processed through a neck to generate the final encoded representation.

    Attributes:
        img_size (int): Dimension of input images, assumed to be square.
        patch_embed (PatchEmbed): Module for patch embedding.
        pos_embed (nn.Parameter, optional): Absolute positional embedding for patches.
        blocks (nn.ModuleList): List of transformer blocks for processing patch embeddings.
        neck (nn.Sequential): Neck module to further process the output.
    """

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))

        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=(img_size // patch_size, img_size // patch_size),
            )
            self.blocks.append(block)

        self.neck = nn.Sequential(
            nn.Conv2d(embed_dim, out_chans, kernel_size=1, bias=False),
            LayerNorm2d(out_chans),
            nn.Conv2d(out_chans, out_chans, kernel_size=3, padding=1, bias=False),
            LayerNorm2d(out_chans),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Processes input through patch embedding, applies positional embedding if present, and passes through blocks
        and neck.
        """
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        for blk in self.blocks:
            x = blk(x)
        return self.neck(x.permute(0, 3, 1, 2))
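

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): builds an image encoder with
# ViT-B-style hyperparameters commonly used with SAM. The exact values below
# are assumptions for illustration; take them from a real checkpoint config
# before loading pretrained weights.
# ---------------------------------------------------------------------------
def _demo_image_encoder() -> None:  # pragma: no cover - illustrative sketch only
    """Runs a dummy 1024x1024 image through ImageEncoderViT and checks the output shape."""
    encoder = ImageEncoderViT(
        img_size=1024,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        use_rel_pos=True,
        window_size=14,
        global_attn_indexes=(2, 5, 8, 11),  # assumed global-attention blocks for ViT-B
        out_chans=256,
    )
    x = torch.randn(1, 3, 1024, 1024)
    with torch.no_grad():
        embedding = encoder(x)
    # 1024 / 16 = 64 patches per side, projected to 256 channels by the neck.
    assert embedding.shape == (1, 256, 64, 64)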


class PromptEncoder(nn.Module):
    """
    Encodes different types of prompts, including points, boxes, and masks, for input to SAM's mask decoder. The encoder
    produces both sparse and dense embeddings for the input prompts.

    Attributes:
        embed_dim (int): Dimension of the embeddings.
        input_image_size (Tuple[int, int]): Size of the input image as (H, W).
        image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
        pe_layer (PositionEmbeddingRandom): Module for random position embedding.
        num_point_embeddings (int): Number of point embeddings for different types of points.
        point_embeddings (nn.ModuleList): List of point embeddings.
        not_a_point_embed (nn.Embedding): Embedding for points that are not a part of any label.
        mask_input_size (Tuple[int, int]): Size of the input mask.
        mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
        no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.
    """

    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Encodes prompts for input to SAM's mask decoder.

        Args:
          embed_dim (int): The prompts' embedding dimension
          image_embedding_size (tuple(int, int)): The spatial size of the
            image embedding, as (H, W).
          input_image_size (tuple(int, int)): The padded size of the image as input
            to the image encoder, as (H, W).
          mask_in_chans (int): The number of hidden channels used for
            encoding input masks.
          activation (nn.Module): The activation to use when encoding
            input masks.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
        point_embeddings = [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
        self.point_embeddings = nn.ModuleList(point_embeddings)
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1])
        self.mask_downscaling = nn.Sequential(
            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans // 4),
            activation(),
            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        )
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Returns the positional encoding used to encode point prompts, applied to a dense set of points with the shape
        of the image encoding.

        Returns:
          torch.Tensor: Positional encoding with shape 1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        return self.pe_layer(self.image_embedding_size).unsqueeze(0)

    def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
        """Embeds point prompts."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, padding_point], dim=1)
            labels = torch.cat([labels, padding_label], dim=1)
        point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
        point_embedding[labels == -1] = 0.0
        point_embedding[labels == -1] += self.not_a_point_embed.weight
        point_embedding[labels == 0] += self.point_embeddings[0].weight
        point_embedding[labels == 1] += self.point_embeddings[1].weight
        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts."""
        boxes = boxes + 0.5  # Shift to center of pixel
        coords = boxes.reshape(-1, 2, 2)
        corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
        return corner_embedding

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embeds mask inputs."""
        return self.mask_downscaling(masks)

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """Gets the batch size of the output given the batch size of the input prompts."""
        if points is not None:
            return points[0].shape[0]
        elif boxes is not None:
            return boxes.shape[0]
        elif masks is not None:
            return masks.shape[0]
        else:
            return 1

    def _get_device(self) -> torch.device:
        """Returns the device of the first point embedding's weight tensor."""
        return self.point_embeddings[0].weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense embeddings.

        Args:
          points (tuple(torch.Tensor, torch.Tensor), None): point coordinates and labels to embed.
          boxes (torch.Tensor, None): boxes to embed
          masks (torch.Tensor, None): masks to embed

        Returns:
          torch.Tensor: sparse embeddings for the points and boxes, with shape BxNx(embed_dim), where N is determined
            by the number of input points and boxes.
          torch.Tensor: dense embeddings for the masks, in the shape Bx(embed_dim)x(embed_H)x(embed_W)
        """
        bs = self._get_batch_size(points, boxes, masks)
        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
        if points is not None:
            coords, labels = points
            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
        if boxes is not None:
            box_embeddings = self._embed_boxes(boxes)
            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)

        if masks is not None:
            dense_embeddings = self._embed_masks(masks)
        else:
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1])

        return sparse_embeddings, dense_embeddings


class PositionEmbeddingRandom(nn.Module):
    """Positional encoding using random spatial frequencies."""

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        """Initializes a position embedding using random spatial frequencies."""
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer('positional_encoding_gaussian_matrix', scale * torch.randn((2, num_pos_feats)))

        # Avoid 'cumsum_cuda_kernel does not have a deterministic implementation' errors in forward()
        torch.use_deterministic_algorithms(False)
        torch.backends.cudnn.deterministic = False

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # Assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # Outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
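

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): embeds one positive click and
# one box with PromptEncoder. The sizes are assumptions matching the SAM-style
# defaults above (256-d embeddings, a 64x64 image embedding for 1024x1024
# inputs); adjust them to your model configuration.
# ---------------------------------------------------------------------------
def _demo_prompt_encoder() -> None:  # pragma: no cover - illustrative sketch only
    """Checks the sparse/dense embedding shapes documented in PromptEncoder.forward."""
    encoder = PromptEncoder(
        embed_dim=256,
        image_embedding_size=(64, 64),
        input_image_size=(1024, 1024),
        mask_in_chans=16,
    )
    coords = torch.tensor([[[512.0, 512.0]]])  # (B=1, N=1, 2) pixel coordinates
    labels = torch.tensor([[1]])  # 1 = foreground click
    boxes = torch.tensor([[100.0, 100.0, 400.0, 400.0]])  # (B=1, 4) in XYXY form
    sparse, dense = encoder(points=(coords, labels), boxes=boxes, masks=None)
    assert sparse.shape == (1, 3, 256)  # 1 point + 2 box corners
    assert dense.shape == (1, 256, 64, 64)  # no_mask_embed broadcast over the grid
    # The dense positional encoding (from PositionEmbeddingRandom) has the same layout.
    assert encoder.get_dense_pe().shape == (1, 256, 64, 64)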


class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int), None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=input_size if window_size == 0 else (window_size, window_size),
        )
        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Executes a forward pass through the transformer block with window attention and non-overlapping windows."""
        shortcut = x
        x = self.norm1(x)
        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)
        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = shortcut + x
        return x + self.mlp(self.norm2(x))


class Attention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int), None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert input_size is not None, 'Input size must be provided if using relative positional encoding.'
            # Initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Applies the forward operation including attention, normalization, MLP, and indexing within window limits."""
        B, H, W, _ = x.shape
        # qkv with shape (3, B, nHead, H * W, C)
        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # q, k, v with shape (B * nHead, H * W, C)
        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)

        attn = (q * self.scale) @ k.transpose(-2, -1)

        if self.use_rel_pos:
            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))

        attn = attn.softmax(dim=-1)
        x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        return self.proj(x)


def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int],
                       hw: Tuple[int, int]) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x


def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel pos.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode='linear',
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    rel_h = torch.einsum('bhwc,hkc->bhwk', r_q, Rh)
    rel_w = torch.einsum('bhwc,wkc->bhwk', r_q, Rw)

    attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] +
            rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w)

    return attn


class PatchEmbed(nn.Module):
    """Image to Patch Embedding."""

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Computes patch embedding by applying convolution and transposing resulting tensor."""
        return self.proj(x).permute(0, 2, 3, 1)  # B C H W -> B H W C
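

# ---------------------------------------------------------------------------
# Smoke tests (not part of the original module) for the windowing and
# relative-position helpers above; the shapes used are assumptions chosen for
# illustration. Run this file directly to execute them; see also the
# _demo_image_encoder and _demo_prompt_encoder sketches earlier in the file.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # window_partition pads a 64x64 token grid to 70x70, yielding 5x5 windows of
    # size 14; window_unpartition strips the padding and inverts it exactly.
    tokens = torch.randn(2, 64, 64, 8)
    windows, pad_hw = window_partition(tokens, window_size=14)
    assert pad_hw == (70, 70) and windows.shape == (2 * 25, 14, 14, 8)
    assert torch.equal(window_unpartition(windows, 14, pad_hw, (64, 64)), tokens)

    # add_decomposed_rel_pos biases a (B, q_h*q_w, k_h*k_w) attention map with
    # per-axis relative-position terms of shape (2*size - 1, head_dim).
    B, size, dim = 2, 7, 8
    attn = torch.zeros(B, size * size, size * size)
    q = torch.randn(B, size * size, dim)
    rel = torch.zeros(2 * size - 1, dim)
    attn = add_decomposed_rel_pos(attn, q, rel, rel, (size, size), (size, size))
    assert attn.shape == (B, size * size, size * size)

    # PatchEmbed: a 16-strided convolution returning channels-last patch tokens.
    patches = PatchEmbed(in_chans=3, embed_dim=96)(torch.randn(1, 3, 224, 224))
    assert patches.shape == (1, 14, 14, 96)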