# ############################################################################
# Model: E2E ASR with transformer and transducer
# Encoder: Conformer
# Decoder: LSTM + beamsearch + RNNLM
# Tokens: BPE with unigram
# losses: Transducer + CTC (optional) + CE (optional)
# Training: GigaSpeech
# Authors: Titouan Parcollet 2024
# ############################################################################

# NOTE(review): the `!ref <...>` targets in this file were restored from the
# key names and the surrounding wiring; confirm them against the training
# recipe before loading a checkpoint with this configuration.

# Feature parameters
sample_rate: 16000
n_fft: 512
n_mels: 80
win_length: 32

# BPE parameters
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0

####################### Model Parameters #######################################

# Transformer
d_model: 768
joint_dim: 512
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 1024
dec_dim: 512
dec_emb_dropout: 0.2
dec_dropout: 0.1

# Decoding parameters
blank_index: 0
bos_index: 1
eos_index: 2
pad_index: 0
beam_size: 10
nbest: 1
# by default {state,expand}_beam = 2.3 as mentioned in the paper
# https://arxiv.org/abs/1904.02619
state_beam: 2.3
expand_beam: 2.3

normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    win_length: !ref <win_length>

############################## Models ##########################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    # presumably 32 CNN output channels x (80 mel bins / 4 from the two
    # stride-2 blocks) = 640 -- confirm if the CNN front-end is changed.
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False

# We must call an encoder wrapper so the decoder isn't run (we don't have any)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

# For MTL CTC over the encoder
# NOTE(review): input size assumes CTC is applied to the output of proj_enc
# (joint_dim wide) rather than to the raw encoder output -- confirm.
proj_ctc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>

# Define some projection layers to make sure that enc and dec
# output dim are the same before joining
proj_enc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <joint_dim>
    bias: False

proj_dec: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_dim>
    n_neurons: !ref <joint_dim>
    bias: False

emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    consider_as_one_hot: True
    blank_id: !ref <blank_index>

# NOTE(review): the "- 1" presumably accounts for the blank symbol being
# excluded from the one-hot embedding output -- confirm.
dec: !new:speechbrain.nnet.RNN.LSTM
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_dim>
    num_layers: 1
    re_init: True

Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    joint: sum  # joint [sum | concat]
    nonlinearity: !ref <activation>

transducer_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
    bias: False

# for MTL
# update model if any HEAD module is added
modules:
    CNN: !ref <CNN>
    enc: !ref <enc>
    emb: !ref <emb>
    dec: !ref <dec>
    Tjoint: !ref <Tjoint>
    transducer_lin: !ref <transducer_lin>
    normalize: !ref <normalize>
    proj_ctc: !ref <proj_ctc>
    proj_dec: !ref <proj_dec>
    proj_enc: !ref <proj_enc>

# update model if any HEAD module is added
# NOTE(review): the order of this list determines the parameter indices in
# the saved checkpoint; it must match the order used at training time.
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]

############################## Decoding & optimiser ############################

# Greedy decoding is obtained from the beam searcher with a beam of size 1.
# NOTE(review): decode_network_lst is assumed to be the prediction network
# chain (embedding -> LSTM -> decoder projection) -- confirm.
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: 1
    nbest: 1

#Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
#    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
#    tjoint: !ref <Tjoint>
#    classifier_network: [!ref <transducer_lin>]
#    blank_id: !ref <blank_index>
#    beam_size: !ref <beam_size>
#    nbest: !ref <nbest>
#    state_beam: !ref <state_beam>
#    expand_beam: !ref <expand_beam>

tokenizer: !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalizer: !ref <normalize>
        tokenizer: !ref <tokenizer>

make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space

make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext  # default constructor
decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
    - !ref <Greedysearcher>  # self

fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
    module: !new:speechbrain.nnet.containers.LengthsCapableSequential
        - !ref <compute_features>
        - !ref <normalize>
        - !ref <CNN>
    # don't consider normalization as part of the input filter chain.
    # normalization will operate at chunk level, which mismatches training
    # somewhat, but does not appear to result in noticeable degradation.
    properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
        - [!ref <compute_features>, !ref <CNN>]