"""Source code for sgnlp.models.csgec.modules.conv_glu."""

from numpy import sqrt
import torch.nn as nn
import torch.nn.functional as F


class ConvGLU(nn.Module):
    """Convolutional block with a GLU activation and a scaled residual connection.

    The input passes through a same-padded 1D convolution that doubles the
    channel count, a gated linear unit that halves it back, and a skip
    connection scaled by sqrt(0.5) to keep the output variance stable.
    """

    def __init__(self, input_dim, kernel_size, dropout):
        """
        input_dim : int
            Encoder input (and output) embedding dimension size.
        kernel_size : int
            Kernel size / patch size. Number of tokens for each convolution.
        dropout : float
            Probability of setting each embedding dimension to 0 during training.
        """
        super(ConvGLU, self).__init__()
        # Same-length output for odd kernel sizes.
        same_pad = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(
            in_channels=input_dim,
            # Doubled so F.glu can split the channels into value/gate halves.
            out_channels=input_dim * 2,
            kernel_size=kernel_size,
            padding=same_pad,
        )
        # NOTE(review): constructed but never applied in forward() — confirm intent.
        self.dropout = nn.Dropout2d(dropout)

    def forward(self, H):
        """
        H : torch Tensor
            Output from the previous encoder layer. Shape of
            (batch size, sequence length, hidden dim / number of "channels").
        """
        skip = H
        # Conv1d expects (batch, channels, seq); inputs arrive channels-last.
        gated = self.conv(H.transpose(1, 2)).transpose(1, 2)
        gated = F.glu(gated)  # halves the last dim back to input_dim
        # Scale the residual sum to keep variance roughly constant across layers.
        return (gated + skip) * sqrt(0.5)
class ConvGLUDecoder(nn.Module):
    """Causal convolutional block with a GLU activation for the decoder side.

    The input is zero-padded on the left up to ``kernel_size`` before a 1D
    convolution with no internal padding, then passed through a GLU. When the
    sequence length is at most ``kernel_size`` (presumably the incremental
    decoding case — confirm against callers) the convolution emits a single
    output timestep.
    """

    def __init__(self, input_dim, kernel_size, dropout, padding_idx):
        """
        input_dim : int
            Encoder input (and output) embedding dimension size.
        kernel_size : int
            Kernel size / patch size. Number of tokens for each convolution.
        dropout : float
            Probability of setting each embedding dimension to 0 during training.
        """
        super(ConvGLUDecoder, self).__init__()
        self.conv = nn.Conv1d(
            in_channels=input_dim,
            # Doubled so F.glu can split the channels into value/gate halves.
            out_channels=input_dim * 2,
            kernel_size=kernel_size,
            padding=0,  # padding is applied manually (left-only) in forward()
        )
        # NOTE(review): `dropout` is accepted but no dropout layer is created
        # or applied — confirm intent.
        self.padding_idx = padding_idx
        self.kernel_size = kernel_size

    def forward(self, H):
        """
        H : torch Tensor
            Output from the previous decoder layer. Shape of
            (batch size, sequence length, hidden dim / number of "channels").
        """
        # Conv1d expects (batch, channels, seq); inputs arrive channels-last.
        H = H.transpose(1, 2)
        # Left-pad with zeros up to kernel_size so only past positions feed the
        # convolution. Pads with the value 0, not the padding_idx embedding.
        # TODO Check the padding idx
        pad_left = self.kernel_size - H.shape[2]
        H = F.pad(H, (pad_left, 0), value=0)
        out = self.conv(H).transpose(1, 2)
        return F.glu(out)  # halves the last dim back to input_dim