# This code tries to improve TalkNet's performance on long phonemes by avoiding
# the use of blank tokens. To use it, modify `nemo/collections/tts/modules/talknet.py`
# by adding the import and replacing `GaussianEmbedding`.

from nemo.collections.tts.modules.fastspeech2_submodules import LengthRegulator


class GaussianEmbedding(nn.Module):
    """Gaussian embedding layer."""

    EPS = 1e-6

    def __init__(
        self, vocab, d_emb, sigma_c=2.0, merge_blanks=False,
    ):
        super().__init__()

        self.embed = nn.Embedding(len(vocab.labels), d_emb)
        self.pad = vocab.pad
        self.sigma_c = sigma_c
        self.merge_blanks = merge_blanks
        # We keep everything above the same (even though we only use
        # self.embed) so that existing code and models will still work.
        self.length_regulator = LengthRegulator()

    def forward(self, text, durs):
        # Remove blank tokens. We keep the first blank so that the model
        # knows if there's silence at the beginning of the clip.
        text = torch.cat(
            (
                text[:, 0].unsqueeze(1),
                text[:, 1::2],
            ),
            1,
        )
        # Add the duration of each blank token to the preceding token
        # (again, except for the first blank).
        durs = torch.cat(
            (
                durs[:, 0].unsqueeze(1),
                durs[:, 1::2] + durs[:, 2::2],
            ),
            1,
        )

        # Embed and repeat tokens.
        embed = self.embed(text)
        return self.length_regulator(embed, durs)
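
# A minimal smoke test of the forward pass, assuming TalkNet's usual layout of
# blanks interleaved with tokens (blank, tok, blank, ..., tok, blank). The toy
# `_ToyVocab` below is a hypothetical stand-in for the real TalkNet vocabulary;
# it only provides the two attributes `GaussianEmbedding.__init__` reads.
if __name__ == "__main__":
    import torch

    class _ToyVocab:
        labels = ["<blank>", "a", "b", "c"]
        pad = 0

    emb = GaussianEmbedding(_ToyVocab(), d_emb=8)
    # Batch of 1, five interleaved symbols: blank, a, blank, b, blank.
    text = torch.tensor([[0, 1, 0, 2, 0]])
    durs = torch.tensor([[2, 3, 1, 4, 2]])
    out = emb(text, durs)
    # After merging, the durations are [2, 3+1, 4+2] = [2, 4, 6], so the
    # output should have 2 + 4 + 6 = 12 frames: shape (1, 12, 8).
    print(out.shape)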