
    FPh~                         d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
  G d dej                        Z G d d	ej                        Zy)
    )ListTupleTypeN)nn)
functional)LayerNorm2dc                       e Zd ZdZdej
                  ddddedej                  dedeej                     d	ed
eddf fdZ	de
j                  de
j                  de
j                  de
j                  dedee
j                  e
j                  f   fdZde
j                  de
j                  de
j                  de
j                  dee
j                  e
j                  f   f
dZ xZS )MaskDecoderan  
    Decoder module for generating masks and their associated quality scores, using a transformer architecture to predict
    masks given image and prompt embeddings.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): The transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.
          )num_multimask_outputs
activationiou_head_depthiou_head_hidden_dimtransformer_dimtransformerr   r   r   r   returnNc                   t         |           || _        || _        || _        t        j                  d|      | _        |dz   | _        t        j                  | j                  |      | _	        t        j                  t        j                  ||dz  dd      t        |dz         |       t        j                  |dz  |dz  dd       |             | _        t        j                  t        | j                        D cg c]  }t!        |||dz  d       c}      | _        t!        ||| j                  |      | _        yc c}w )a  
        Predicts masks given an image and prompt embeddings, using a transformer architecture.

        Args:
            transformer_dim (int): the channel dimension of the transformer module
            transformer (nn.Module): the transformer used to predict masks
            num_multimask_outputs (int): the number of masks to predict when disambiguating masks
            activation (nn.Module): the type of activation to use when upscaling masks
            iou_head_depth (int): the depth of the MLP used to predict mask quality
            iou_head_hidden_dim (int): the hidden dimension of the MLP used to predict mask quality
                 )kernel_sizestride   r   N)super__init__r   r   r   r   	Embedding	iou_tokennum_mask_tokensmask_tokens
SequentialConvTranspose2dr   output_upscaling
ModuleListrangeMLPoutput_hypernetworks_mlpsiou_prediction_head)	selfr   r   r   r   r   r   _	__class__s	           rC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\ultralytics/models/sam/modules/decoders.pyr   zMaskDecoder.__init__   s0   * 	.&%:"a94q8<<(<(<oN "10DRS\]^1,-L!3_5IWXabcL!
 *,TYZ^ZnZnTo8qToqC/Q2FJTo8q *r& $'8KTMaMacq#r 8qs    Eimage_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputc                     | j                  ||||      \  }}|rt        dd      nt        dd      }|dd|ddddf   }|dd|f   }||fS )a  
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): the embeddings from the image encoder
            image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
            sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
            dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            torch.Tensor: batched predicted masks
            torch.Tensor: batched predictions of mask quality
        )r-   r.   r/   r0   r   Nr   )predict_masksslice)	r)   r-   r.   r/   r0   r1   masksiou_pred
mask_slices	            r,   forwardzMaskDecoder.forwardH   sm    , ,,-%=$;	 - 
x (8U1d^U1a[
aQ)*AzM* h    c           
         t        j                  | j                  j                  | j                  j                  gd      }|j                  d      j                  |j                  d      dd      }t        j                  ||fd      }t        j                  ||j                  d   d      }||z   }t        j                  ||j                  d   d      }|j                  \  }	}
}}| j                  |||      \  }}|dddddf   }|dddd| j                  z   ddf   }|j                  dd      j                  |	|
||      }| j                  |      }t        | j                        D cg c]!  } | j                   |   |dd|ddf         # }}t        j"                  |d      }|j                  \  }	}
}}||j                  |	|
||z        z  j                  |	d||      }| j%                  |      }||fS c c}w )zJ
        Predicts masks.

        See 'forward' for more details.
        r   )dimr   Nr   )torchcatr   weightr    	unsqueezeexpandsizerepeat_interleaveshaper   r   	transposeviewr#   r%   r'   stackr(   )r)   r-   r.   r/   r0   output_tokenstokenssrcpos_srcbchwhsiou_token_outmask_tokens_outupscaled_embeddingihyper_in_listhyper_inr5   r6   s                         r,   r3   zMaskDecoder.predict_masksm   s    		4>>#8#8$:J:J:Q:Q"RXYZ%//299:R:W:WXY:Z\^`bcM+CD!L %%&6QQO++))(FLLOKYY
1a ""38C1a7Q1t';';#; <a?@ mmAq!&&q!Q2!2237QVW[WkWkQl-nQlA-D**1-oaAg.FGQl 	 -n;;}!4'--
1a.33Aq!a%@@FFq"aQRS ++M:h-ns   5&H)__name__
__module____qualname____doc__r   GELUintModuler   r   r=   Tensorboolr   r8   r3   __classcell__r+   s   @r,   r
   r
      s3   * &'&(gg#&)s )s YY	)s
  #)s O)s )s !)s 
)sV#,,# ,,# #(,,	#
 "'# # 
u||U\\)	*#J(,,( ,,( #(,,	(
 "'( 
u||U\\)	*(r9   r
   c                   F     e Zd ZdZ	 ddedededededdf fd	Zd
 Z xZS )r&   z
    MLP (Multi-Layer Perceptron) model lightly adapted from
    https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
    	input_dim
hidden_dim
output_dim
num_layerssigmoid_outputr   Nc                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        || _        y)a  
        Initializes the MLP (Multi-Layer Perceptron) model.

        Args:
            input_dim (int): The dimensionality of the input features.
            hidden_dim (int): The dimensionality of the hidden layers.
            output_dim (int): The dimensionality of the output layer.
            num_layers (int): The number of hidden layers.
            sigmoid_output (bool, optional): Whether to apply a sigmoid activation to the output layer. Defaults to False.
        r   c              3   N   K   | ]  \  }}t        j                  ||        y w)N)r   Linear).0nks      r,   	<genexpr>zMLP.__init__.<locals>.<genexpr>   s!     #g@f1BIIaO@fs   #%N)r   r   rf   r   r$   ziplayersrg   )r)   rc   rd   re   rf   rg   rN   r+   s          r,   r   zMLP.__init__   s]    $ 	$LJN+mm#gYKRSOUVZdYeUe@f#gg,r9   c                     t        | j                        D ]:  \  }}|| j                  dz
  k  rt        j                   ||            n ||      }< | j
                  rt        j                  |      }|S )zMExecutes feedforward within the neural network module and applies activation.r   )	enumeraterp   rf   Frelurg   r=   sigmoid)r)   xrT   layers       r,   r8   zMLP.forward   s]    !$++.HAu$%!(;$;uQx qA /a Ar9   )F)	rW   rX   rY   rZ   r\   r_   r   r8   r`   ra   s   @r,   r&   r&      sQ      %-- - 	-
 - - 
-0r9   r&   )typingr   r   r   r=   r   torch.nnr   rs   ultralytics.nn.modulesr   r]   r
   r&    r9   r,   <module>r|      s<    % $   $ .I")) IX$")) $r9   