Conv1dTranspose creates the wrong dimensions

Question

I'm trying to build an undercomplete autoencoder for music dimensionality reduction. My Autoencoder class is modular, I can give in input a list of convlayers sizes and it creates me automatically the model. The problem is that when I try to create a model with more than 2 convolutional layers the decoder returns me the wrong dimensions. Here there is my AE model:

class Autoencoder(K.Model):
    """
        Modular Autoencoder class
    """
    def __init__(self,
                 in_shape: Tuple[int,int,int], 
                 num_hidden_layers: int, 
                 activation_func: str = "relu",
                 hidden_activation: str = "sigmoid",
                 conv_layers_sizes: List[int] = [128, 64]):
    
        super(Autoencoder, self).__init__()
        # Structure of the model
        self.in_shape = in_shape  
        self.latent_dim = num_hidden_layers
        self.conv_layers_sizes = conv_layers_sizes
        self.hidden_activation = hidden_activation

        # hyperparameters of the model
        self.activation_func = activation_func

        # build encoder and decoder
        self.encoder = self._build_encoder() 
        self.decoder = self._build_decoder()

        #self.last_cLayer_size = None


    def summary(self):
        """
        Function used to show the summary of the Autoencoder
        """
        self.encoder.summary()
        self.decoder.summary()


    def _create_conv_layer(self, 
                           n_filters: int, 
                           enc_model) -> None:
        """
        _create_conv_layer add a Conv1D layer to a given model
        """
        enc_model.add(layers.Conv1D(n_filters,
                                    kernel_size=3,
                                    activation=self.activation_func,
                                    padding='same',
                                    strides=2, 
                                    name=f"Conv{n_filters}"))
        enc_model.add(layers.BatchNormalization())
        # try to not use max pooling
        #enc_model.add(layers.MaxPool1D(2, strides=1, 
        #                               padding="same",
        #                               name=f"BtchNorm_{n_filters}"))


    def _create_deconv_layer(self, 
                             n_filters: int, 
                             decon_model) -> None:
        """
        _create_deconv_layer add a Conv1DTranspose layer to a given model
        """
        decon_model.add(layers.Conv1DTranspose(
            n_filters,
            kernel_size=3,
            strides=2,
            activation=self.activation_func,
            padding='same', 
            name=f"ConvTransp{n_filters}"))                                               


    def _build_encoder(self) -> K.Sequential:
        """
        _build_encoder creates the encoder
        """
        model_encoder = K.Sequential(name="encoder")
        model_encoder.add(layers.InputLayer(input_shape=self.in_shape))
        # create encoder
        for layer_size in self.conv_layers_sizes:
            self._create_conv_layer(n_filters=layer_size, 
                                    enc_model=model_encoder)

        # get the dimensions of the last layer in order to paste it into the 
        # decoder
        last_layer = model_encoder.layers[-2]
        _, *self.last_cLayer_size = last_layer.output_shape

        #model_encoder.add(layers.GlobalAveragePooling2D(name="Flatten"))
        model_encoder.add(layers.Flatten(name="Flatten"))
        
        # hidden layers
        model_encoder.add(layers.Dense(self.latent_dim, 
                                       activation=self.hidden_activation, 
                                       name=f"hidden_unit_{self.latent_dim}"))
        return model_encoder
    

    def _build_decoder(self) -> K.Sequential:
        """
        _build_decoder creates the decoder
        """
        model_decoder = K.Sequential(name="decoder")
        model_decoder.add(layers.InputLayer(input_shape=self.latent_dim, 
                                            name="decoder_input"))
        # calculate the dimension of the dense layer after the hidden 
        # representations
        dense_to_reshape = 1
        for dim in self.last_cLayer_size:
            dense_to_reshape *= dim

        # create a trainable dense layer with dimension adeguate to be reshaped
        # in the same shape of the last layer of the encoder
        model_decoder.add(layers.Dense(dense_to_reshape, 
                                       activation=self.activation_func))
        model_decoder.add(layers.Dropout(0.8))
        model_decoder.add(layers.Reshape(self.last_cLayer_size))
        
        # create deconvolutions
        for layer_size in reversed(self.conv_layers_sizes):
            self._create_deconv_layer(layer_size, model_decoder)
        
        model_decoder.add(layers.Conv1D(self.in_shape[-1], 
                                        kernel_size=3, 
                                        activation='sigmoid', 
                                        padding='same',
                                        name=f"last_layer"))
        return model_decoder


    def call(self, x):
        """
        Adapts the call method to the Autoencoder.

        In this case call just reapplies all ops in the graph to the new inputs 
        (e.g. build a new computational graph from the provided inputs).
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

here there are two examples of the output:

AE_model = Autoencoder(in_shape=(20, 862),    
                        num_hidden_layers=2, 
                        conv_layers_sizes=[128, 64], 
                       activation_func="relu", 
                       hidden_activation="sigmoid")
AE_model.summary()

OUT:
Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Conv128 (Conv1D)             (None, 10, 128)           331136    
_________________________________________________________________
batch_normalization_10 (Batc (None, 10, 128)           512       
_________________________________________________________________
Conv64 (Conv1D)              (None, 5, 64)             24640     
_________________________________________________________________
batch_normalization_11 (Batc (None, 5, 64)             256       
_________________________________________________________________
Flatten (Flatten)            (None, 320)               0         
_________________________________________________________________
hidden_unit_2 (Dense)        (None, 2)                 642       
=================================================================
Total params: 357,186
Trainable params: 356,802
Non-trainable params: 384
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_5 (Dense)              (None, 320)               960       
_________________________________________________________________
dropout_5 (Dropout)          (None, 320)               0         
_________________________________________________________________
reshape_5 (Reshape)          (None, 5, 64)             0         
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 10, 64)            12352     
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 20, 128)           24704     
_________________________________________________________________
last_layer (Conv1D)          (None, 20, 862)           331870    
=================================================================

AE_model = Autoencoder(in_shape=(20, 862),    
                        num_hidden_layers=2, 
                        conv_layers_sizes=[128, 64, 32], 
                       activation_func="relu", 
                       hidden_activation="sigmoid")
AE_model.summary()

OUT:
Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Conv128 (Conv1D)             (None, 10, 128)           331136    
_________________________________________________________________
batch_normalization_12 (Batc (None, 10, 128)           512       
_________________________________________________________________
Conv64 (Conv1D)              (None, 5, 64)             24640     
_________________________________________________________________
batch_normalization_13 (Batc (None, 5, 64)             256       
_________________________________________________________________
Conv32 (Conv1D)              (None, 3, 32)             6176      
_________________________________________________________________
batch_normalization_14 (Batc (None, 3, 32)             128       
_________________________________________________________________
Flatten (Flatten)            (None, 96)                0         
_________________________________________________________________
hidden_unit_2 (Dense)        (None, 2)                 194       
=================================================================
Total params: 363,042
Trainable params: 362,594
Non-trainable params: 448
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_6 (Dense)              (None, 96)                288       
_________________________________________________________________
dropout_6 (Dropout)          (None, 96)                0         
_________________________________________________________________
reshape_6 (Reshape)          (None, 3, 32)             0         
_________________________________________________________________
ConvTransp32 (Conv1DTranspos (None, 6, 32)             3104      
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 12, 64)            6208      
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 24, 128)           24704     
_________________________________________________________________
last_layer (Conv1D)          (None, 24, 862)           331870    
=================================================================

Why having more than 2 layers messes up with my final dimensions?

Giuseppe Minardi · Accepted Answer · 2021-06-05 12:18:46Z

0

I ended up with a solution on my own. The problem is the stride when the stride is > 1. Setting the stride to be always 1 makes Everything works. An explanation can be found here

answered Jun 5, 2021 at 12:18

Giuseppe Minardi

4114 silver badges18 bronze badges

Sign up to request clarification or add additional context in comments.

Collectives™ on Stack Overflow

Conv1dTranspose creates the wrong dimensions

1 Answer 1

Comments

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Related