# This code tries to improve TalkNet's performance on long phonemes by avoiding
# the use of blank tokens. To use it, modify `nemo/collections/tts/modules/talknet.py`
# by adding the import and replacing `GaussianEmbedding`.

from nemo.collections.tts.modules.fastspeech2_submodules import LengthRegulator


class GaussianEmbedding(nn.Module):
    """Gaussian embedding layer."""

    EPS = 1e-6

    def __init__(
        self, vocab, d_emb, sigma_c=2.0, merge_blanks=False,
    ):
        super().__init__()

        self.embed = nn.Embedding(len(vocab.labels), d_emb)
        self.pad = vocab.pad
        self.sigma_c = sigma_c
        self.merge_blanks = merge_blanks
        # We keep everything above the same (even though we only use
        # self.embed) so that existing code and models will still work.
        self.length_regulator = LengthRegulator()

    def forward(self, text, durs):
        # Remove blank tokens. We keep the first blank so that the model
        # knows if there's silence at the beginning of the clip.
        text = torch.cat(
            (
                text[:, 0].unsqueeze(1),
                text[:, 1::2],
            ),
            1,
        )
        # Add the duration of each blank token to the preceding token
        # (again, except for the first blank).
        durs = torch.cat(
            (
                durs[:, 0].unsqueeze(1),
                durs[:, 1::2] + durs[:, 2::2],
            ),
            1,
        )

        # Embed and repeat tokens.
        embed = self.embed(text)
        return self.length_regulator(embed, durs)
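
# A minimal smoke test of the forward pass, assuming TalkNet's usual layout of
# blanks interleaved with tokens (blank, tok, blank, ..., tok, blank). The toy
# `_ToyVocab` below is a hypothetical stand-in for the real TalkNet vocabulary;
# it only provides the two attributes `GaussianEmbedding.__init__` reads.
if __name__ == "__main__":
    import torch

    class _ToyVocab:
        labels = ["<blank>", "a", "b", "c"]
        pad = 0

    emb = GaussianEmbedding(_ToyVocab(), d_emb=8)
    # Batch of 1, five interleaved symbols: blank, a, blank, b, blank.
    text = torch.tensor([[0, 1, 0, 2, 0]])
    durs = torch.tensor([[2, 3, 1, 4, 2]])
    out = emb(text, durs)
    # After merging, the durations are [2, 3+1, 4+2] = [2, 4, 6], so the
    # output should have 2 + 4 + 6 = 12 frames: shape (1, 12, 8).
    print(out.shape)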