
"""Import fairseq's wav2vec2.0 pretrained weights to torchaudio's format.

For this module to work, you need `fairseq`.
"""
import re

from torch.nn import Module

from ..model import wav2vec2_model, Wav2Vec2Model


def _parse_config(w2v_model):
    encoder = w2v_model.encoder
    conv_layers = w2v_model.feature_extractor.conv_layers

    # The type of the norm layer in the first conv block tells apart the two
    # feature extractor variants fairseq supports.
    if "GroupNorm" in conv_layers[0][2].__class__.__name__:
        extractor_mode = "group_norm"
    else:
        extractor_mode = "layer_norm"

    conv_layer_config = [(l[0].out_channels, l[0].kernel_size[0], l[0].stride[0]) for l in conv_layers]

    if all(l[0].bias is None for l in conv_layers):
        conv_bias = False
    elif all(l[0].bias is not None for l in conv_layers):
        conv_bias = True
    else:
        raise ValueError("Either all the convolution layers should have a bias term, or none of them should.")

    config = {
        "extractor_mode": extractor_mode,
        "extractor_conv_layer_config": conv_layer_config,
        "extractor_conv_bias": conv_bias,
        "encoder_embed_dim": w2v_model.post_extract_proj.out_features,
        "encoder_projection_dropout": w2v_model.dropout_input.p,
        "encoder_pos_conv_kernel": encoder.pos_conv[0].kernel_size[0],
        "encoder_pos_conv_groups": encoder.pos_conv[0].groups,
        "encoder_num_layers": len(encoder.layers),
        "encoder_num_heads": encoder.layers[0].self_attn.num_heads,
        "encoder_attention_dropout": encoder.layers[0].self_attn.dropout_module.p,
        "encoder_ff_interm_features": encoder.layers[0].fc1.out_features,
        "encoder_ff_interm_dropout": encoder.layers[0].dropout2.p,
        "encoder_dropout": encoder.layers[0].dropout3.p,
        "encoder_layer_norm_first": encoder.layer_norm_first,
        "encoder_layer_drop": encoder.layerdrop,
    }
    return config
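
# For orientation, the config parsed from the standard "base" wav2vec2.0
# checkpoint (e.g. ``wav2vec_small.pt``) is expected to look roughly like the
# sketch below. The concrete values always come from the checkpoint itself;
# this is an illustration, not captured output.
#
#     {
#         "extractor_mode": "group_norm",
#         "extractor_conv_layer_config": [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2,
#         "extractor_conv_bias": False,
#         "encoder_embed_dim": 768,
#         "encoder_num_layers": 12,
#         "encoder_num_heads": 12,
#         "encoder_ff_interm_features": 3072,
#         ...
#     }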
22		8K8	8	dee )'8(&88EE&/&=&=&?&?#*#3#3A#6#B#B1#E#*#3#3A#6#=#=!'..1$^^A.88BB%,^^A%6%@%@%O%O%Q%Q&-nnQ&7&;&;&H&H%,^^A%6%?%?%A%A">>!,5577$+$<$<%//F" M5 hs   9G?c                    | }| j                  d      r| j                  dd      } t        j                  d|       ry t        j                  d|       }|rd|j	                  d       S t        j                  d|       }|r&d|j	                  d       d	|j	                  d
       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|rd|j	                  d       S t        j                  d|       }|rd|j	                  d       S t        j                  d|       }|rd|j	                  d       S t        j                  d|       }|rd|j	                  d       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|r&d|j	                  d       d|j	                  d
       S t        j                  d|       }|rd |j	                  d       S | d!v r| S t        d"|       )#Nz
w2v_model. z2(mask_emb|quantizer|project_q|final_proj|mask_emb)z3feature_extractor\.conv_layers\.0\.2\.(weight|bias)z+feature_extractor.conv_layers.0.layer_norm.   z7feature_extractor\.conv_layers\.(\d+)\.0\.(weight|bias)zfeature_extractor.conv_layers.z.conv.r   z:feature_extractor\.conv_layers\.(\d+)\.2\.1\.(weight|bias)z.layer_norm.z post_extract_proj\.(weight|bias)z&encoder.feature_projection.projection.zlayer_norm\.(weight|bias)z&encoder.feature_projection.layer_norm.z.encoder\.pos_conv\.0\.(bias|weight_g|weight_v)z(encoder.transformer.pos_conv_embed.conv.z"encoder\.layer_norm\.(weight|bias)zencoder.transformer.layer_norm.zGencoder\.layers\.(\d+)\.self_attn\.((k_|v_|q_|out_)proj\.(weight|bias))zencoder.transformer.layers.z.attention.z;encoder\.layers\.(\d+)\.self_attn_layer_norm\.(weight|bias)z*encoder\.layers\.(\d+)\.fc1\.(weight|bias)z!.feed_forward.intermediate_dense.z*encoder\.layers\.(\d+)\.fc2\.(weight|bias)z.feed_forward.output_dense.z7encoder\.layers\.(\d+)\.final_layer_norm\.(weight|bias)z.final_layer_norm.zproj\.(weight|bias)zaux.)label_embs_concatzUnexpected key: )
startswithreplacerematchgroupr.   )keykey_rK   s      r   _map_keyrO   3   s    D
~~l#kk,+	xxEsK HHKSQE<U[[^<LMM HHOQTUE/A/?vekkRSnEUVV HHRTWXE/A/?|EKKXYNK[\\HH8#>E7A7GHHHH137E7A7GHHHHFLE9%++a.9IJJHH:C@E0Q0@AAHH_adeE,U[[^,<KTUGWXXHHSUXYE,U[[^,<LUVHXYYHHBCHE,U[[^,<<]^c^i^ijk^l]mnnHHBCHE,U[[^,<<WX]XcXcdeXfWghhHHOQTUE,U[[^,<<Nu{{[\~N^__HH+S1E ekk!n%&&
##

'v.
//    c                 `    i }| j                         D ]  \  }}t        |      }||||<    |S )N)itemsrO   )


def _convert_state_dict(state_dict):
    converted = {}
    for k, v in state_dict.items():
        k = _map_key(k)
        if k is not None:
            converted[k] = v
    return converted


def import_fairseq_model(original: Module) -> Wav2Vec2Model:
    """Builds :class:`Wav2Vec2Model` from the corresponding model object of
    `fairseq <https://github.com/pytorch/fairseq>`_.

    Args:
        original (torch.nn.Module):
            An instance of fairseq's Wav2Vec2.0 or HuBERT model.
            One of ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder``,
            ``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model`` or
            ``fairseq.models.hubert.hubert_asr.HubertEncoder``.

    Returns:
        Wav2Vec2Model: Imported model.

    Example - Loading pretrain-only model
        >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
        >>>
        >>> # Load model using fairseq
        >>> model_file = 'wav2vec_small.pt'
        >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
        >>> original = model[0]
        >>> imported = import_fairseq_model(original)
        >>>
        >>> # Perform feature extraction
        >>> waveform, _ = torchaudio.load('audio.wav')
        >>> features, _ = imported.extract_features(waveform)
        >>>
        >>> # Compare result with the original model from fairseq
        >>> reference = original.feature_extractor(waveform).transpose(1, 2)
        >>> torch.testing.assert_allclose(features, reference)

    Example - Fine-tuned model
        >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
        >>>
        >>> # Load model using fairseq
        >>> model_file = 'wav2vec_small_960h.pt'
        >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
        >>> original = model[0]
        >>> imported = import_fairseq_model(original.w2v_encoder)
        >>>
        >>> # Perform encoding
        >>> waveform, _ = torchaudio.load('audio.wav')
        >>> emission, _ = imported(waveform)
        >>>
        >>> # Compare result with the original model from fairseq
        >>> mask = torch.zeros_like(waveform)
        >>> reference = original(waveform, mask)['encoder_out'].transpose(0, 1)
        >>> torch.testing.assert_allclose(emission, reference)
    r   Wav2VecEncoderHubertModelHubertEncoderzDExpected an instance of `Wav2Vec2Model` or `Wav2VecEncoder`. Found: )r(   r)   _import_wav2vec2_pretraining_import_wav2vec2_finetuning_import_hubert_pretraining_import_hubert_finetuningr.   )rX   class_s     r   import_fairseq_modelrc      sx    b ((F +H55!!*844)(33 (22
[\b[cd
eerP   c                     t        | j                        }t        di |d| j                  j                  i}|j                  t        | j                                      |S )Naux_num_out rC   r?   r   projr0   load_state_dictrW   rS   rX   rB   models      r   r_   r_      sP    8--.FLVL1K1KLE	-h.A.A.CDELrP   c                     t        |       }t        di |dd i}|j                  t        | j	                               d       |S Nre   F)strictrf   rC   r   ri   rW   rS   rj   s      r   r^   r^      E    8$F6V66E	-h.A.A.CDUSLrP   c                     t        | j                        }t        di |d| j                  j                  i}|j                  t        | j                               d       |S rm   rg   rj   s      r   ra   ra      sU    8--.FLVL1K1KLE	-h.A.A.CDUSLrP   c                     t        |       }t        di |dd i}|j                  t        | j	                               d       |S rm   ro   rj   s      r   r`   r`      rp   rP   )__doc__rJ   torch.nnr   rk   r   r   rC   rO   rW   rc   r_   r^   ra   r`   rf   rP   r   <module>ru      s    
  1$N@0F:f6 :fm :fz& ] 6 m  =  M rP   