0

I'm trying to build an undercomplete autoencoder for music dimensionality reduction. My Autoencoder class is modular, I can give in input a list of convlayers sizes and it creates me automatically the model. The problem is that when I try to create a model with more than 2 convolutional layers the decoder returns me the wrong dimensions. Here there is my AE model:

class Autoencoder(K.Model):
    """
        Modular Autoencoder class
    """
    def __init__(self,
                 in_shape: Tuple[int,int,int], 
                 num_hidden_layers: int, 
                 activation_func: str = "relu",
                 hidden_activation: str = "sigmoid",
                 conv_layers_sizes: List[int] = [128, 64]):
    
        super(Autoencoder, self).__init__()
        # Structure of the model
        self.in_shape = in_shape  
        self.latent_dim = num_hidden_layers
        self.conv_layers_sizes = conv_layers_sizes
        self.hidden_activation = hidden_activation

        # hyperparameters of the model
        self.activation_func = activation_func

        # build encoder and decoder
        self.encoder = self._build_encoder() 
        self.decoder = self._build_decoder()

        #self.last_cLayer_size = None


    def summary(self):
        """
        Function used to show the summary of the Autoencoder
        """
        self.encoder.summary()
        self.decoder.summary()


    def _create_conv_layer(self, 
                           n_filters: int, 
                           enc_model) -> None:
        """
        _create_conv_layer add a Conv1D layer to a given model
        """
        enc_model.add(layers.Conv1D(n_filters,
                                    kernel_size=3,
                                    activation=self.activation_func,
                                    padding='same',
                                    strides=2, 
                                    name=f"Conv{n_filters}"))
        enc_model.add(layers.BatchNormalization())
        # try to not use max pooling
        #enc_model.add(layers.MaxPool1D(2, strides=1, 
        #                               padding="same",
        #                               name=f"BtchNorm_{n_filters}"))


    def _create_deconv_layer(self, 
                             n_filters: int, 
                             decon_model) -> None:
        """
        _create_deconv_layer add a Conv1DTranspose layer to a given model
        """
        decon_model.add(layers.Conv1DTranspose(
            n_filters,
            kernel_size=3,
            strides=2,
            activation=self.activation_func,
            padding='same', 
            name=f"ConvTransp{n_filters}"))                                               


    def _build_encoder(self) -> K.Sequential:
        """
        _build_encoder creates the encoder
        """
        model_encoder = K.Sequential(name="encoder")
        model_encoder.add(layers.InputLayer(input_shape=self.in_shape))
        # create encoder
        for layer_size in self.conv_layers_sizes:
            self._create_conv_layer(n_filters=layer_size, 
                                    enc_model=model_encoder)

        # get the dimensions of the last layer in order to paste it into the 
        # decoder
        last_layer = model_encoder.layers[-2]
        _, *self.last_cLayer_size = last_layer.output_shape

        #model_encoder.add(layers.GlobalAveragePooling2D(name="Flatten"))
        model_encoder.add(layers.Flatten(name="Flatten"))
        
        # hidden layers
        model_encoder.add(layers.Dense(self.latent_dim, 
                                       activation=self.hidden_activation, 
                                       name=f"hidden_unit_{self.latent_dim}"))
        return model_encoder
    

    def _build_decoder(self) -> K.Sequential:
        """
        _build_decoder creates the decoder
        """
        model_decoder = K.Sequential(name="decoder")
        model_decoder.add(layers.InputLayer(input_shape=self.latent_dim, 
                                            name="decoder_input"))
        # calculate the dimension of the dense layer after the hidden 
        # representations
        dense_to_reshape = 1
        for dim in self.last_cLayer_size:
            dense_to_reshape *= dim

        # create a trainable dense layer with dimension adeguate to be reshaped
        # in the same shape of the last layer of the encoder
        model_decoder.add(layers.Dense(dense_to_reshape, 
                                       activation=self.activation_func))
        model_decoder.add(layers.Dropout(0.8))
        model_decoder.add(layers.Reshape(self.last_cLayer_size))
        
        # create deconvolutions
        for layer_size in reversed(self.conv_layers_sizes):
            self._create_deconv_layer(layer_size, model_decoder)
        
        model_decoder.add(layers.Conv1D(self.in_shape[-1], 
                                        kernel_size=3, 
                                        activation='sigmoid', 
                                        padding='same',
                                        name=f"last_layer"))
        return model_decoder


    def call(self, x):
        """
        Adapts the call method to the Autoencoder.

        In this case call just reapplies all ops in the graph to the new inputs 
        (e.g. build a new computational graph from the provided inputs).
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

here there are two examples of the output:

AE_model = Autoencoder(in_shape=(20, 862),    
                        num_hidden_layers=2, 
                        conv_layers_sizes=[128, 64], 
                       activation_func="relu", 
                       hidden_activation="sigmoid")
AE_model.summary()

OUT:
Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Conv128 (Conv1D)             (None, 10, 128)           331136    
_________________________________________________________________
batch_normalization_10 (Batc (None, 10, 128)           512       
_________________________________________________________________
Conv64 (Conv1D)              (None, 5, 64)             24640     
_________________________________________________________________
batch_normalization_11 (Batc (None, 5, 64)             256       
_________________________________________________________________
Flatten (Flatten)            (None, 320)               0         
_________________________________________________________________
hidden_unit_2 (Dense)        (None, 2)                 642       
=================================================================
Total params: 357,186
Trainable params: 356,802
Non-trainable params: 384
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_5 (Dense)              (None, 320)               960       
_________________________________________________________________
dropout_5 (Dropout)          (None, 320)               0         
_________________________________________________________________
reshape_5 (Reshape)          (None, 5, 64)             0         
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 10, 64)            12352     
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 20, 128)           24704     
_________________________________________________________________
last_layer (Conv1D)          (None, 20, 862)           331870    
=================================================================
AE_model = Autoencoder(in_shape=(20, 862),    
                        num_hidden_layers=2, 
                        conv_layers_sizes=[128, 64, 32], 
                       activation_func="relu", 
                       hidden_activation="sigmoid")
AE_model.summary()

OUT:
Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Conv128 (Conv1D)             (None, 10, 128)           331136    
_________________________________________________________________
batch_normalization_12 (Batc (None, 10, 128)           512       
_________________________________________________________________
Conv64 (Conv1D)              (None, 5, 64)             24640     
_________________________________________________________________
batch_normalization_13 (Batc (None, 5, 64)             256       
_________________________________________________________________
Conv32 (Conv1D)              (None, 3, 32)             6176      
_________________________________________________________________
batch_normalization_14 (Batc (None, 3, 32)             128       
_________________________________________________________________
Flatten (Flatten)            (None, 96)                0         
_________________________________________________________________
hidden_unit_2 (Dense)        (None, 2)                 194       
=================================================================
Total params: 363,042
Trainable params: 362,594
Non-trainable params: 448
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_6 (Dense)              (None, 96)                288       
_________________________________________________________________
dropout_6 (Dropout)          (None, 96)                0         
_________________________________________________________________
reshape_6 (Reshape)          (None, 3, 32)             0         
_________________________________________________________________
ConvTransp32 (Conv1DTranspos (None, 6, 32)             3104      
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 12, 64)            6208      
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 24, 128)           24704     
_________________________________________________________________
last_layer (Conv1D)          (None, 24, 862)           331870    
=================================================================

Why having more than 2 layers messes up with my final dimensions?

1 Answer 1

0

I ended up with a solution on my own. The problem is the stride when the stride is > 1. Setting the stride to be always 1 makes Everything works. An explanation can be found here

Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.