
    FPh4q                     R   d dl Z d dlmZ d dlZd dlmZ d dlmc mZ d dl	m
c mZ d dlmZ  G d dej
                  j                        Z G d dej                         Z G d d	ej                         Z G d
 dej                         Z G d dej                         Z G d dej                         Z G d dej
                  j                         Z G d dej                         Z G d dej                         Z G d dej                         Z G d dej                         Zy)    N)Tuple)	to_2tuplec                   $     e Zd ZdZd fd	Z xZS )	Conv2d_BNzTA sequential container that performs 2D convolution followed by batch normalization.c	                    t         
|           | j                  dt        j                  j                  |||||||d             t        j                  j                  |      }	t        j                  j                  j                  |	j                  |       t        j                  j                  j                  |	j                  d       | j                  d|	       y)zInitializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
        drop path.
        cF)biasr   bnN)super__init__
add_moduletorchnnConv2dBatchNorm2dinit	constant_weightr	   )selfabksstridepaddilationgroupsbn_weight_initr
   	__class__s             vC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\ultralytics/models/sam/modules/tiny_encoder.pyr   zConv2d_BN.__init__   s     	UXX__Q2vsHf[`_abXX!!!$		>:+b!    )   r!   r   r!   r!   r!   )__name__
__module____qualname____doc__r   __classcell__r   s   @r   r   r      s    ^	" 	"r    r   c                   (     e Zd ZdZ fdZd Z xZS )
PatchEmbedzREmbeds images into patches and projects them into a specified embedding dimension.c                 L   t         |           t        |      }|d   dz  |d   dz  f| _        | j                  d   | j                  d   z  | _        || _        || _        |}t        j                  t        ||dz  ddd       |       t        |dz  |ddd            | _
        y)zInitialize the PatchMerging class with specified input, output dimensions, resolution and activation
        function.
        r      r!         N)r   r   r   patches_resolutionnum_patchesin_chans	embed_dimr   
Sequentialr   seq)r   r0   r1   
resolution
activationimg_sizenr   s          r   r   zPatchEmbed.__init__)   s     	$-j$9#+A;!#3Xa[A5E"F22158O8OPQ8RR "==hQ1a0La1faAq)
r    c                 $    | j                  |      S )zNRuns input tensor 'x' through the PatchMerging model's sequence of operations.)r3   r   xs     r   forwardzPatchEmbed.forward:   s    xx{r    r"   r#   r$   r%   r   r;   r&   r'   s   @r   r)   r)   &   s    \
"r    r)   c                   (     e Zd ZdZ fdZd Z xZS )MBConvzVMobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.c                    t         |           || _        t        ||z        | _        || _        t        || j                  d      | _         |       | _        t        | j                  | j                  ddd| j                        | _	         |       | _
        t        | j                  |dd      | _         |       | _        t        j                         | _        y)zInitializes a convolutional layer with specified dimensions, input resolution, depth, and activation
        function.
        r!   )r   r-   r   r   r   r           )r   r   N)r   r   r0   inthidden_chans	out_chansr   conv1act1conv2act2conv3act3r   Identity	drop_path)r   r0   rD   expand_ratior5   rL   r   s         r   r   zMBConv.__init__B   s     	 < 78"x):):qA
L	t00$2C2CRSYZcgctctu
L	t00)RUV
L	 r    c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }||z  }| j                  |      S )z7Implements the forward pass for the model architecture.)rE   rF   rG   rH   rI   rL   rJ   )r   r:   shortcuts      r   r;   zMBConv.forwardX   sm    JJqMIIaLJJqMIIaLJJqMNN1	Xyy|r    r<   r'   s   @r   r>   r>   ?   s    `',
r    r>   c                   (     e Zd ZdZ fdZd Z xZS )PatchMergingzNMerges neighboring patches in the feature map and projects to a new dimension.c                     t         |           || _        || _        || _         |       | _        t        ||ddd      | _        |dv rdnd}t        ||d|d|      | _        t        ||ddd      | _	        y)zInitializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
        optional parameters.
        r!   r   )i@  i  i@  r,   r-   )r   N)
r   r   input_resolutiondimout_dimactr   rE   rG   rI   )r   rS   rT   rU   r5   stride_cr   s         r   r   zPatchMerging.__init__h   s{     	 0<sGQ15
?21wHaP
wAq9
r    c                    |j                   dk(  r@| j                  \  }}t        |      }|j                  |||d      j	                  dddd      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  d      j                  dd      S )zfApplies forward pass on the input utilizing convolution and activation layers, and returns the result.r-   r   r!   r,   )ndimrS   lenviewpermuterE   rV   rG   rI   flatten	transpose)r   r:   HWBs        r   r;   zPatchMerging.forwardw   s    66Q;((DAqAAq!Q#++Aq!Q7AJJqMHHQKJJqMHHQKJJqMyy|%%a++r    r<   r'   s   @r   rQ   rQ   e   s    X:,r    rQ   c                   4     e Zd ZdZ	 	 	 	 	 d fd	Zd Z xZS )	ConvLayerz
    Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).

    Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
    c
                 @   t         |           || _        || _        || _        || _        t        j                  t        |      D 
cg c]&  }
t        |||	|t        |t              r||
   n|      ( c}
      | _        |d| _        y |||||      | _        yc c}
w )a  
        Initializes the ConvLayer with the given dimensions and settings.

        Args:
            dim (int): The dimensionality of the input and output.
            input_resolution (Tuple[int, int]): The resolution of the input image.
            depth (int): The number of MBConv layers in the block.
            activation (Callable): Activation function applied after each convolution.
            drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
            downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
            use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
            out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
            conv_expand_ratio (float): Expansion ratio for the MBConv layers.
        NrT   rU   r5   )r   r   rT   rS   depthuse_checkpointr   
ModuleListranger>   
isinstancelistblocks
downsample)r   rT   rS   rg   r5   rL   rn   rh   rU   conv_expand_ratioir   s              r   r   zConvLayer.__init__   s    4 	 0
, mm U|%% $! ! *9d ;	! $%% & #-"4$*#w:;O%%s   +Bc                     | j                   D ],  }| j                  rt        j                  ||      n ||      }. | j                  |S | j                  |      S )z^Processes the input through a series of convolutional layers and returns the activated output.rm   rh   
checkpointrn   r   r:   blks      r   r;   zConvLayer.forward   M    ;;C151D1D
%%c1-#a&A OO+qC1CCr    )rA   NFN      @r<   r'   s   @r   rd   rd      s&     ,O\Dr    rd   c                   F     e Zd ZdZddej
                  df fd	Zd Z xZS )Mlpz
    Multi-layer Perceptron (MLP) for transformer architectures.

    This layer takes an input with in_features, applies layer normalization and two fully-connected layers.
    NrA   c                 &   t         |           |xs |}|xs |}t        j                  |      | _        t        j
                  ||      | _        t        j
                  ||      | _         |       | _        t        j                  |      | _
        y)zjInitializes Attention module with the given parameters including dimension, key_dim, number of heads, etc.N)r   r   r   	LayerNormnormLinearfc1fc2rV   Dropoutdrop)r   in_featureshidden_featuresout_features	act_layerr   r   s         r   r   zMlp.__init__   so    #2{)8[LL-	99[/:99_l;;JJt$	r    c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      S )zRApplies operations on input x and returns modified x, runs downsample if not None.)r|   r~   rV   r   r   r9   s     r   r;   zMlp.forward   sQ    IIaLHHQKHHQKIIaLHHQKyy|r    )	r"   r#   r$   r%   r   GELUr   r;   r&   r'   s   @r   ry   ry      s%     59tWYW^W^eg 	%r    ry   c                   d     e Zd ZdZ	 	 	 d fd	Z ej                         d fd	       Zd Z xZ	S )	Attentionag  
    Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
    resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
    grid.

    Attributes:
        ab (Tensor, optional): Cached attention biases for inference, deleted during training.
    c           	         t         |           t        |t              rt	        |      dk(  sJ || _        |dz  | _        || _        ||z  x| _        }t        ||z        | _
        t        ||z        |z  | _        || _        | j                  |dz  z   }t        j                  |      | _        t        j                   ||      | _        t        j                   | j                  |      | _        t'        t)        j*                  t-        |d         t-        |d                     }t	        |      }	i }
g }|D ]W  }|D ]P  }t/        |d   |d   z
        t/        |d   |d   z
        f}||
vrt	        |
      |
|<   |j1                  |
|          R Y t2        j                  j5                  t3        j6                  |t	        |
                  | _        | j;                  dt3        j<                  |      j?                  |	|	      d       y)	ac  
        Initializes the Attention module.

        Args:
            dim (int): The dimensionality of the input and output.
            key_dim (int): The dimensionality of the keys and queries.
            num_heads (int, optional): Number of attention heads. Default is 8.
            attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
            resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).

        Raises:
            AssertionError: If `resolution` is not a tuple of length 2.
        r,   g      r   r!   attention_bias_idxsF)
persistentN) r   r   rk   tupler[   	num_headsscalekey_dimnh_kdrB   ddh
attn_ratior   r{   r|   r}   qkvprojrl   	itertoolsproductrj   absappendr   	Parameterzerosattention_biasesregister_buffer
LongTensorr\   )r   rT   r   r   r   r4   r   hpointsNattention_offsetsidxsp1p2offsetr   s                  r   r   zAttention.__init__   s   * 	*e,ZA1EEE"_
$y00
UZ')*j7*+i7$GGeaiLL%	99S!$IIdggs+	i''jm(<eJqM>RSTKBbebem,c"Q%"Q%-.@A!22034E0F%f--f56	   !& 2 25;;y#N_J`3a b2E4D4DT4J4O4OPQST4Ubghr    c                     t         |   |       |rt        | d      r| `y| j                  dd| j
                  f   | _        y)zNSets the module in training mode and handles attribute 'ab' based on the mode.abN)r   trainhasattrr   r   r   )r   moder   s     r   r   zAttention.train  s?     	dGD$'++At/G/G,GHDGr    c                 D   |j                   \  }}}| j                  |      }| j                  |      }|j                  ||| j                  d      j                  | j                  | j                  | j                  gd      \  }}}|j                  dddd      }|j                  dddd      }|j                  dddd      }| j                  j                  | j                  j                        | _	        ||j                  dd      z  | j                  z  | j                  r| j                  dd| j                   f   n| j                  z   }	|	j#                  d      }	|	|z  j                  dd      j%                  ||| j&                        }| j)                  |      S )	zcPerforms forward pass over the input tensor 'x' by applying normalization and querying keys/values.rY   r-   )rT   r   r,   r!   N)shaper|   r   r\   r   splitr   r   r]   r   tor   devicer_   r   trainingr   softmaxreshaper   r   )
r   r:   rb   r   _r   qkvattns
             r   r;   zAttention.forward%  sa   ''1a IIaLhhqk((1a4::DLL$,,X\X^X^;_ef:g1aIIaAq!IIaAq!IIaAq!''**T2299:Q[[R((DJJ6GK}}&&q$*B*B'BCZ^ZaZac|||#AX  A&..q!TWW=yy|r    )   r+   )   r   )T)
r"   r#   r$   r%   r   r   no_gradr   r;   r&   r'   s   @r   r   r      s=     0id U]]_I Ir    r   c                   V     e Zd ZdZdddddej
                  f fd	Zd Zdefd	Z	 xZ
S )
TinyViTBlockzOTinyViT Block that applies self-attention and a local convolution to the input.   rw   rA   r-   c
                    t         |           || _        || _        || _        |dkD  sJ d       || _        || _        t        j                         | _	        ||z  dk(  sJ d       ||z  }
||f}t        ||
|d|      | _        t        ||z        }|	}t        ||||      | _        |dz  }t        |||d||      | _        y	)
a  
        Initializes the TinyViTBlock.

        Args:
            dim (int): The dimensionality of the input and output.
            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
            num_heads (int): Number of attention heads.
            window_size (int, optional): Window size for attention. Default is 7.
            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
            drop (float, optional): Dropout rate. Default is 0.
            drop_path (float, optional): Stochastic depth rate. Default is 0.
            local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.

        Raises:
            AssertionError: If `window_size` is not greater than 0.
            AssertionError: If `dim` is not divisible by `num_heads`.
        r   z"window_size must be greater than 0z"dim must be divisible by num_headsr!   )r   r4   )r   r   r   r   r,   r@   N)r   r   rT   rS   r   window_size	mlp_ratior   rK   rL   r   r   rB   ry   mlpr   
local_conv)r   rT   rS   r   r   r   r   rL   local_conv_sizer5   head_dimwindow_resolutionmlp_hidden_dimmlp_activationr   r   s                  r   r   zTinyViTBlock.__init__?  s    < 	 0"QD DD&" Y!#I%II#)#(+6c8Y1Qbc	S9_-#3R`gkl"#CPS\_`r    c           	      >   | j                   \  }}|j                  \  }}}|||z  k(  sJ d       |}|| j                  k(  r"|| j                  k(  r| j                  |      }n|j	                  ||||      }| j                  || j                  z  z
  | j                  z  }| j                  || j                  z  z
  | j                  z  }	|dkD  xs |	dkD  }
|
rt        j                  |ddd|	d|f      }||z   ||	z   }}|| j                  z  }|| j                  z  }|j	                  ||| j                  || j                  |      j                  dd      j                  ||z  |z  | j                  | j                  z  |      }| j                  |      }|j	                  |||| j                  | j                  |      j                  dd      j                  ||||      }|
r|ddd|d|f   j                         }|j	                  |||      }|| j                  |      z   }|j                  dd      j                  ||||      }| j                  |      }|j	                  |||      j                  dd      }|| j                  | j                  |            z   S )z~Applies attention-based transformation or padding to input 'x' before passing it through a local
        convolution.
        zinput feature has wrong sizer   r,   r-   Nr!   )rS   r   r   r   r\   Fr   r_   r   
contiguousrL   r   r   )r   r:   r`   ra   rb   LCres_xpad_bpad_rpaddingpHpWnHnWs                  r   r;   zTinyViTBlock.forwardv  s    $$1''1aAEz999z   Q$*:*:%:		!Aq!Q"A%%D,<,<(<<@P@PPE%%D,<,<(<<@P@PPEai,519GEE!aAua78YE	Bt'''Bt'''Bq"d..D4D4D#)Aq/''!b&2+t?O?ORVRbRb?bde*f 		!Aq"b$"2"2D4D4DaHRRSTVWX``abdfhjlmnAa!RaRiL++-q!QADNN1%%KK1%%aAq1OOAFF1aO%%a+4>>$((1+...r    returnc           
          d| j                    d| j                   d| j                   d| j                   d| j                   
S )zReturns a formatted string representing the TinyViTBlock's parameters: dimension, input resolution, number of
        attentions heads, window size, and MLP ratio.
        dim=, input_resolution=z, num_heads=z, window_size=z, mlp_ratio=)rT   rS   r   r   r   r   s    r   
extra_reprzTinyViTBlock.extra_repr  sT     dhhZ243H3H2IVZVdVdUe f"../|DNN;KM 	Mr    r"   r#   r$   r%   r   r   r   r;   strr   r&   r'   s   @r   r   r   <  s9    Y 775an(/TMC Mr    r   c                   Z     e Zd ZdZddddddej
                  df fd	Zd Zd	efd
Z	 xZ
S )
BasicLayerz>A basic TinyViT layer for one stage in a TinyViT architecture.rw   rA   NFr-   c                 J   t         |           || _        || _        || _        |
| _        t        j                  t        |      D cg c]+  }t        ||||||t        |t              r||   n|||	      - c}      | _        |	d| _        y |	||||      | _        yc c}w )a  
        Initializes the BasicLayer.

        Args:
            dim (int): The dimensionality of the input and output.
            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
            depth (int): Number of TinyViT blocks.
            num_heads (int): Number of attention heads.
            window_size (int): Local window size.
            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
            drop (float, optional): Dropout rate. Default is 0.
            drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
            downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
            use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
            local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
            out_dim (int | None, optional): The output dimension of the layer. Default is None.

        Raises:
            ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
        )	rT   rS   r   r   r   r   rL   r   r5   Nrf   )r   r   rT   rS   rg   rh   r   ri   rj   r   rk   rl   rm   rn   )r   rT   rS   rg   r   r   r   r   rL   rn   rh   r   r5   rU   rp   r   s                  r   r   zBasicLayer.__init__  s    J 	 0
, mm U|%% $! !1#'#*4Y*E)A,9 /%
 $%% & #-"4$*#w:;O%%s   0B c                     | j                   D ],  }| j                  rt        j                  ||      n ||      }. | j                  |S | j                  |      S )zQPerforms forward propagation on the input tensor and returns a normalized tensor.rr   rt   s      r   r;   zBasicLayer.forward  rv   r    r   c                 T    d| j                    d| j                   d| j                   S )zWReturns a string representation of the extra_repr function with the layer's parameters.r   r   z, depth=)rT   rS   rg   r   s    r   r   zBasicLayer.extra_repr  s/    dhhZ243H3H2IRVR\R\Q]^^r    r   r'   s   @r   r   r     s?    H 77;OzD_C _r    r   c                   j     e Zd ZdZd	dededdf fdZdej                  dej                  fdZ	 xZ
S )
LayerNorm2dz6A PyTorch implementation of Layer Normalization in 2D.num_channelsepsr   Nc                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y)zKInitialize LayerNorm2d with the number of channels and an optional epsilon.N)
r   r   r   r   r   onesr   r   r	   r   )r   r   r   r   s      r   r   zLayerNorm2d.__init__  sG    ll5::l#;<LL\!:;	r    r:   c                    |j                  dd      }||z
  j                  d      j                  dd      }||z
  t        j                  || j                  z         z  }| j
                  ddddf   |z  | j                  ddddf   z   S )z5Perform a forward pass, normalizing the input tensor.r!   T)keepdimr,   N)meanpowr   sqrtr   r   r	   )r   r:   uss       r   r;   zLayerNorm2d.forward  s    FF1dF#UKKN40UejjTXX..{{1dD=)A-		!T4-0HHHr    )gư>)r"   r#   r$   r%   rB   floatr   r   Tensorr;   r&   r'   s   @r   r   r     s>    @S u  I I%,, Ir    r   c                        e Zd ZdZdddg dg dg dg dd	d
ddd	ddf fd	Zd Zd Zej                  j                  d        Z
d Zd Z xZS )TinyViTa  
    The TinyViT architecture for vision tasks.

    Attributes:
        img_size (int): Input image size.
        in_chans (int): Number of input channels.
        num_classes (int): Number of classification classes.
        embed_dims (List[int]): List of embedding dimensions for each layer.
        depths (List[int]): List of depths for each layer.
        num_heads (List[int]): List of number of attention heads for each layer.
        window_sizes (List[int]): List of window sizes for each layer.
        mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
        drop_rate (float): Dropout rate for drop layers.
        drop_path_rate (float): Drop path rate for stochastic depth.
        use_checkpoint (bool): Use checkpointing for efficient memory usage.
        mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
        local_conv_size (int): Local convolution kernel size.
        layer_lr_decay (float): Layer-wise learning rate decay.

    Note:
        This implementation is generalized to accept a list of depths, attention heads,
        embedding dimensions and window sizes, which allows you to create a
        "stack" of TinyViT models of varying configurations.
       r-   i  )`      i  i   )r,   r,      r,   )r-   r         )r   r   r   r   rw   rA   g?F      ?c                    t         |           || _        || _        || _        t        |      | _        || _        t        j                  }t        ||d   ||      | _        | j                  j                  }|| _        t        j                  d|
t        |            D cg c]  }|j!                          }}t        j"                         | _        t'        | j                        D ]  }t)        ||   |d   d|dk(  r|dz
  n|z  z  |d   d|dk(  r|dz
  n|z  z  f||   |t        |d|       t        |d|dz           || j                  dz
  k  rt*        nd||t-        |dz   t        |      dz
           |      }|dk(  rt/        dd|i|}n!t1        d||   ||   | j                  |	|d	|}| j$                  j3                  |        t        j4                  |d
         | _        |dkD  rt        j8                  |d
   |      nt        j                  j;                         | _        | j?                  | j@                         | jC                  |       t        jD                  t        jF                  |d
   ddd      tI        d      t        jF                  ddddd      tI        d            | _%        yc c}w )a  
        Initializes the TinyViT model.

        Args:
            img_size (int, optional): The input image size. Defaults to 224.
            in_chans (int, optional): Number of input channels. Defaults to 3.
            num_classes (int, optional): Number of classification classes. Defaults to 1000.
            embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
            depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
            num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
            window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
            mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
            drop_rate (float, optional): Dropout rate. Defaults to 0.
            drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
            use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
            mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
            local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
            layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
        r   )r0   r1   r4   r5   r,   r-   r!   N)rT   rS   rg   rL   rn   rh   rU   r5   ro   )r   r   r   r   r   rY      F)kernel_sizer	   )r   r   r	    )&r   r   r6   num_classesdepthsr[   
num_layersr   r   r   r)   patch_embedr.   r   linspacesumitemri   layersrj   dictrQ   minrd   r   r   r{   	norm_headr}   rK   headapply_init_weightsset_layer_lr_decayr2   r   r   neck)r   r6   r0   r  
embed_dimsr  r   window_sizesr   	drop_ratedrop_path_raterh   mbconv_expand_ratior   layer_lr_decayr5   r.   r:   dpri_layerkwargslayerr   s                         r   r   zTinyViT.__init__  s   H 	 &f+"WW
%x0:1191;=
 "--@@"4 "'>3v;!OP!OAqvvx!OP mmoT__-Gw'"4Q"7AQX\]Q]'A+cj<k"l"4Q"7AQX\]Q]'A+cj<k"l"n Woc&'"23C|!8L4MN,3doo6I,I<PT-"3w{'*:':$< =%F !|!R4GR6R" -Yw-?/;G/D-1^^(13B	-
 &,- KKu%1 .6 jn5>IAoBIIjnk:SXS[S[SdSdSf	 	

4%%&/MMII2	 II 
	K Qs   "J<c                 z   |}t        | j                        }t        |      D cg c]  }|||z
  dz
  z   c}d | j                  j	                  fd       d| j
                  D ][  }|j                  D ]  }|j	                  fd       dz   |j                  <|j                  j	                  fd       ] |k(  sJ | j                  | j                  fD ]  }|j	                  fd        | j                         D ]  \  }}	||	_         d	 }
| j	                  |
       yc c}w )
zASets the learning rate decay for each layer in the TinyViT model.r!   c                 <    | j                         D ]	  }||_         y)zTSets the learning rate scale for each layer in the model based on the layer's depth.N)
parameterslr_scale)mr   ps      r   _set_lr_scalez1TinyViT.set_layer_lr_decay.<locals>._set_lr_scale  s    \\^"
 $r    c                      | d         S )Nr   r   r:   r!  	lr_scaless    r   <lambda>z,TinyViT.set_layer_lr_decay.<locals>.<lambda>  s    q)A,)Gr    r   c                      |          S )Nr   r:   r!  rp   r$  s    r   r%  z,TinyViT.set_layer_lr_decay.<locals>.<lambda>  s    mAy|&Dr    Nc                 "     | dz
           S )Nr!   r   r'  s    r   r%  z,TinyViT.set_layer_lr_decay.<locals>.<lambda>  s    q)APQEBR1Sr    c                      | d         S )NrY   r   r#  s    r   r%  z,TinyViT.set_layer_lr_decay.<locals>.<lambda>  s    mAy}=r    c                 h    | j                         D ]  }t        |d      rJ |j                          y)zNChecks if the learning rate scale attribute is present in module's parameters.r  N)r  r   
param_name)r  r   s     r   _check_lr_scalez3TinyViT.set_layer_lr_decay.<locals>._check_lr_scale  s+    \\^q*-;q||;- $r    )r  r  rj   r  r  r  rm   rn   r  r  named_parametersr+  )r   r  
decay_raterg   rp   r  blockr  r   r   r,  r!  r$  s       `      @@r   r  zTinyViT.set_layer_lr_decay  s   #
 DKK <A%LILqZEAIM2LI		#
 	GH[[EDEQ & +  &&'ST ! Ezz..$)),AGG=> - ))+DAqAL ,	<
 	

?#7 Js   D8c                    t        |t        j                        r8|j                  +t        j                  j                  |j                  d       yyt        |t        j                        rUt        j                  j                  |j                  d       t        j                  j                  |j                  d       yy)zRInitializes weights for linear layers and layer normalization in the given module.Nr   r   )rk   r   r}   r	   r   r   r{   r   )r   r  s     r   r  zTinyViT._init_weights  s|    a# vv!!!!&&!, "2<<(GGaffa(GGahh, )r    c                     dhS )zQReturns a dictionary of parameter names where weight decay should not be applied.r   r   r   s    r   no_weight_decay_keywordsz TinyViT.no_weight_decay_keywords  s     ###r    c                 b   | j                  |      } | j                  d   |      }d}t        |t        | j                              D ]  }| j                  |   } ||      } |j	                         \  }}}|j                  |dd|      }|j                  dddd      }| j                  |      S )zKRuns the input through the model layers and returns the transformed output.r   r!   @   r-   r,   )r  r  rj   r[   sizer\   r]   r  )r   r:   start_irp   r  rb   r   r   s           r   forward_featureszTinyViT.forward_features  s    QDKKN1wDKK 01AKKNEaA 2 &&(1aFF1b"a IIaAq!yy|r    c                 $    | j                  |      S )zQExecutes a forward pass on the input tensor through the constructed model layers.)r7  r9   s     r   r;   zTinyViT.forward  s    $$Q''r    )r"   r#   r$   r%   r   r  r  r   jitignorer2  r7  r;   r&   r'   s   @r   r   r     sn    6 & "k
Z!$F	- YY$ $(r    r   )r   typingr   r   torch.nnr   torch.nn.functional
functionalr   torch.utils.checkpointutilsrs   ultralytics.utils.instancer   r2   r   Moduler)   r>   rQ   rd   ry   r   r   r   r   r   r   r    r   <module>rC     s          + + 0"## " 2#RYY #L ,299  ,F9D		 9Dx")) 8Y YxiM299 iMXH_ H_VI")) I$K(bii K(r    