
    Ph                         d Z ddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ  ej                  e      Zd Zd Zd	 Zd
eeef   defdZdede
fdZy)zZImport Hugging Face transformers's wav2vec2.0 pretrained weights to torchaudios's format.
    N)AnyDict)Module   )wav2vec2_modelWav2Vec2Modelwavlm_modelc                    | j                    dt        t        | j                  | j                  | j
                              | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                   | j"                  | j$                  d}|S )N_norm)extractor_modeextractor_conv_layer_configextractor_conv_biasencoder_embed_dimencoder_projection_dropoutencoder_pos_conv_kernelencoder_pos_conv_groupsencoder_num_layersencoder_num_headsencoder_attention_dropoutencoder_ff_interm_featuresencoder_ff_interm_dropoutencoder_dropoutencoder_layer_norm_firstencoder_layer_drop)feat_extract_normlistzipconv_dimconv_kernelconv_stride	conv_biashidden_sizefeat_proj_dropoutnum_conv_pos_embeddingsnum_conv_pos_embedding_groupsnum_hidden_layersnum_attention_headsattention_dropoutintermediate_sizeactivation_dropouthidden_dropoutdo_stable_layer_norm	layerdropcfgconfigs     ~C:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torchaudio/models/wav2vec2/utils/import_huggingface.py_get_configr2      s     22359'+Ccoos,_'`"}} __&)&;&;#&#>#>#&#D#D!33 44%(%:%:&)&;&;%(%;%;--$'$<$<!mmF" M    c           	         i d| j                    ddt        t        | j                  | j                  | j
                              d| j                  d| j                  d| j                  d| j                  d| j                  d	| j                  d
| j                  d| j                  d| j                  d| j                  d| j                   d| j"                  d| j$                  d| j&                  d| j(                  }|S )Nr   r   r   r   r   r   r   r   r   r   encoder_num_bucketsencoder_max_distancer   r   r   r   r   r   )r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   num_bucketsmax_bucket_distancer(   r)   r*   r+   r,   r-   r.   s     r1   _get_config_wavlmr9   #   sR   S22359%tCcoos,_'` 	s}} 	S__	
 	%c&;&; 	"3#>#> 	"3#D#D 	c33 	S44 	s 	 7 7 	$S%:%: 	%c&;&; 	$S%;%; 	3--  	#C$<$<!" 	cmm#F& Mr3   c                    |j                   j                  dv }|r#|j                  j                  }|j                  }nt
        j                  d       d }|}|j                   j                  dv }|rt        di | d|i}nt        di | d|i}|j                  j                  |j                  j                                |j                  j                  j                  |j                  j                                |j                  j                         }|rt        || d          |j                  j                  j                  |       |r3|j                   j                  |j"                  j                                |S )N)Wav2Vec2ForCTCWavLMForCTCz`The model is not an instance of Wav2Vec2ForCTC or WavLMForCTC. "lm_head" module is not imported.
WavLMModelr<   aux_num_outr    )	__class____name__r0   
vocab_sizewav2vec2_LGwarningr	   r   feature_extractorload_state_dict
state_dictencoderfeature_projectiontransform_wavlm_encoder_statetransformerauxlm_head)r0   original
is_for_ctcr?   rD   is_wavlmimportedencoder_state_dicts           r1   _buildrU   :   sH   ##,,0QQJoo00$$q	
 !!**.KKHAA[A!DFDD..x/I/I/T/T/VW''778S8S8^8^8`a!))446%&8&AU:VW  001CD$$X%5%5%@%@%BCOr3   stater   c                    t        |      D ]  }| j                  d| d      }| j                  d| d      }| j                  d| d      }| j                  d| d      }| j                  d| d      }| j                  d| d      }t        j                  |||f      | d| d<   t        j                  |||f      | d| d	<   | j                  d| d
      | d| d<   | j                  d| d      | d| d<    y)zConverts WavLM encoder state from HuggingFace format. In particular, concatenates linear projection weights and
    biases to align with the structure of ``torch.nn.MultiheadAttention``.
    zlayers.z.attention.q_proj.biasz.attention.k_proj.biasz.attention.v_proj.biasz.attention.q_proj.weightz.attention.k_proj.weightz.attention.v_proj.weightz!.attention.attention.in_proj_biasz#.attention.attention.in_proj_weightz.attention.out_proj.weightz$.attention.attention.out_proj.weightz.attention.out_proj.biasz".attention.attention.out_proj.biasN)rangepoptorchcat)	rV   r   iq_proj_biask_proj_biasv_proj_biasq_proj_weightk_proj_weightv_proj_weights	            r1   rL   rL   U   sH    %&ii'!,B CDii'!,B CDii'!,B CD		GA3.F"GH		GA3.F"GH		GA3.F"GH@E		;XcepJq@rs;<=BG))M=9C
s=>? DI99wWXVYYsMtCus>?@AFWUVTWWoKpAqs<=> 'r3   rP   returnc                 R   t         j                  d       t         j                  d       | j                  j                  dv }|rt	        | j
                        }nt        | j
                        }t         j                  d|       t         j                  d       t        ||       }|S )a  Builds :class:`Wav2Vec2Model` from the corresponding model object of
    `Transformers <https://huggingface.co/transformers/>`_.

    Args:
        original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.

    Returns:
        Wav2Vec2Model: Imported model.

    Example
        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
        >>>
        >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        >>> model = import_huggingface_model(original)
        >>>
        >>> waveforms, _ = torchaudio.load("audio.wav")
        >>> logits, _ = model(waveforms)
    zImporting model.zLoading model configuration.r=   z  - config: %szBuilding model.)	rE   inforA   rB   r9   r0   r2   debugrU   )rP   rR   r0   rS   s       r1   import_huggingface_modelrg   i   s    & HH HH+,!!**.KKH"8??3X__-II'HHfh'HOr3   )__doc__loggingtypingr   r   rZ   torch.nnr   modelr   r   r	   	getLoggerrB   rE   r2   r9   rU   strintrL   rg   r@   r3   r1   <module>rp      sn        > >g!*.6rc3h rS r(v - r3   