I'm trying to build an undercomplete autoencoder for dimensionality reduction on music data. My Autoencoder class is modular: I pass it a list of convolutional layer sizes and it builds the model automatically. The problem is that as soon as the model has more than 2 convolutional layers, the decoder produces the wrong output dimensions. Here is my AE model:
from typing import List, Tuple

from tensorflow import keras as K
from tensorflow.keras import layers


class Autoencoder(K.Model):
    """
    Modular Autoencoder class
    """

    def __init__(self,
                 in_shape: Tuple[int, int],
                 num_hidden_layers: int,
                 activation_func: str = "relu",
                 hidden_activation: str = "sigmoid",
                 conv_layers_sizes: List[int] = [128, 64]):
        super().__init__()
        # structure of the model
        self.in_shape = in_shape
        self.latent_dim = num_hidden_layers
        self.conv_layers_sizes = conv_layers_sizes
        self.hidden_activation = hidden_activation
        # hyperparameters of the model
        self.activation_func = activation_func
        # build encoder and decoder
        self.encoder = self._build_encoder()
        self.decoder = self._build_decoder()
        # self.last_cLayer_size = None

    def summary(self):
        """
        Show the summaries of both halves of the Autoencoder.
        """
        self.encoder.summary()
        self.decoder.summary()

    def _create_conv_layer(self,
                           n_filters: int,
                           enc_model) -> None:
        """
        _create_conv_layer adds a strided Conv1D block (conv + batch norm)
        to the given model.
        """
        enc_model.add(layers.Conv1D(n_filters,
                                    kernel_size=3,
                                    activation=self.activation_func,
                                    padding='same',
                                    strides=2,
                                    name=f"Conv{n_filters}"))
        enc_model.add(layers.BatchNormalization())
        # trying without max pooling
        # enc_model.add(layers.MaxPool1D(2, strides=1,
        #                                padding="same",
        #                                name=f"MaxPool_{n_filters}"))

    def _create_deconv_layer(self,
                             n_filters: int,
                             decon_model) -> None:
        """
        _create_deconv_layer adds a strided Conv1DTranspose layer to the
        given model.
        """
        decon_model.add(layers.Conv1DTranspose(
            n_filters,
            kernel_size=3,
            strides=2,
            activation=self.activation_func,
            padding='same',
            name=f"ConvTransp{n_filters}"))

    def _build_encoder(self) -> K.Sequential:
        """
        _build_encoder creates the encoder: a stack of strided conv blocks,
        then Flatten and a Dense bottleneck of size latent_dim.
        """
        model_encoder = K.Sequential(name="encoder")
        model_encoder.add(layers.InputLayer(input_shape=self.in_shape))
        # create encoder
        for layer_size in self.conv_layers_sizes:
            self._create_conv_layer(n_filters=layer_size,
                                    enc_model=model_encoder)
        # remember the output shape of the last conv layer so the decoder
        # can reshape its dense output back to it
        last_layer = model_encoder.layers[-2]
        _, *self.last_cLayer_size = last_layer.output_shape
        # model_encoder.add(layers.GlobalAveragePooling2D(name="Flatten"))
        model_encoder.add(layers.Flatten(name="Flatten"))
        # hidden layers
        model_encoder.add(layers.Dense(self.latent_dim,
                                       activation=self.hidden_activation,
                                       name=f"hidden_unit_{self.latent_dim}"))
        return model_encoder

    def _build_decoder(self) -> K.Sequential:
        """
        _build_decoder creates the decoder: a Dense layer restoring the
        flattened conv shape, a Reshape, then mirrored Conv1DTranspose layers.
        """
        model_decoder = K.Sequential(name="decoder")
        model_decoder.add(layers.InputLayer(input_shape=(self.latent_dim,),
                                            name="decoder_input"))
        # calculate the dimension of the dense layer after the hidden
        # representation
        dense_to_reshape = 1
        for dim in self.last_cLayer_size:
            dense_to_reshape *= dim
        # create a trainable dense layer whose output can be reshaped to the
        # same shape as the last conv layer of the encoder
        model_decoder.add(layers.Dense(dense_to_reshape,
                                       activation=self.activation_func))
        model_decoder.add(layers.Dropout(0.8))
        model_decoder.add(layers.Reshape(self.last_cLayer_size))
        # create deconvolutions
        for layer_size in reversed(self.conv_layers_sizes):
            self._create_deconv_layer(layer_size, model_decoder)
        model_decoder.add(layers.Conv1D(self.in_shape[-1],
                                        kernel_size=3,
                                        activation='sigmoid',
                                        padding='same',
                                        name="last_layer"))
        return model_decoder

    def call(self, x):
        """
        Adapts the call method to the Autoencoder: encode the input, then
        decode the latent representation (i.e. build a new computational
        graph from the provided inputs).
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
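To watch where the temporal dimension changes, I also print each layer's output shape with a small helper (my own debugging function, not part of the class; it just reads the `output_shape` attribute that Keras exposes on built layers):

def print_output_shapes(model):
    """Print the output shape of every layer (debugging helper)."""
    for layer in model.layers:
        print(f"{layer.name:>25}: {layer.output_shape}")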
Here are two examples of the output:
AE_model = Autoencoder(in_shape=(20, 862),
num_hidden_layers=2,
conv_layers_sizes=[128, 64],
activation_func="relu",
hidden_activation="sigmoid")
AE_model.summary()
OUT:
Model: "encoder"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
Conv128 (Conv1D) (None, 10, 128) 331136
_________________________________________________________________
batch_normalization_10 (Batc (None, 10, 128) 512
_________________________________________________________________
Conv64 (Conv1D) (None, 5, 64) 24640
_________________________________________________________________
batch_normalization_11 (Batc (None, 5, 64) 256
_________________________________________________________________
Flatten (Flatten) (None, 320) 0
_________________________________________________________________
hidden_unit_2 (Dense) (None, 2) 642
=================================================================
Total params: 357,186
Trainable params: 356,802
Non-trainable params: 384
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_5 (Dense) (None, 320) 960
_________________________________________________________________
dropout_5 (Dropout) (None, 320) 0
_________________________________________________________________
reshape_5 (Reshape) (None, 5, 64) 0
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 10, 64) 12352
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 20, 128) 24704
_________________________________________________________________
last_layer (Conv1D) (None, 20, 862) 331870
=================================================================
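With two convolutional layers everything lines up: the encoder halves 20 -> 10 -> 5 and the decoder doubles 5 -> 10 -> 20, so the reconstruction matches the input. A quick sanity check I run on this model (sketch):

assert AE_model.decoder.output_shape[1:] == (20, 862)  # passes for the 2-layer model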
AE_model = Autoencoder(in_shape=(20, 862),
num_hidden_layers=2,
conv_layers_sizes=[128, 64, 32],
activation_func="relu",
hidden_activation="sigmoid")
AE_model.summary()
OUT:
Model: "encoder"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
Conv128 (Conv1D) (None, 10, 128) 331136
_________________________________________________________________
batch_normalization_12 (Batc (None, 10, 128) 512
_________________________________________________________________
Conv64 (Conv1D) (None, 5, 64) 24640
_________________________________________________________________
batch_normalization_13 (Batc (None, 5, 64) 256
_________________________________________________________________
Conv32 (Conv1D) (None, 3, 32) 6176
_________________________________________________________________
batch_normalization_14 (Batc (None, 3, 32) 128
_________________________________________________________________
Flatten (Flatten) (None, 96) 0
_________________________________________________________________
hidden_unit_2 (Dense) (None, 2) 194
=================================================================
Total params: 363,042
Trainable params: 362,594
Non-trainable params: 448
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_6 (Dense) (None, 96) 288
_________________________________________________________________
dropout_6 (Dropout) (None, 96) 0
_________________________________________________________________
reshape_6 (Reshape) (None, 3, 32) 0
_________________________________________________________________
ConvTransp32 (Conv1DTranspos (None, 6, 32) 3104
_________________________________________________________________
ConvTransp64 (Conv1DTranspos (None, 12, 64) 6208
_________________________________________________________________
ConvTransp128 (Conv1DTranspo (None, 24, 128) 24704
_________________________________________________________________
last_layer (Conv1D) (None, 24, 862) 331870
=================================================================
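To narrow it down, I redid the length arithmetic outside of Keras. With padding='same', a strided Conv1D produces ceil(n / stride) timesteps and a strided Conv1DTranspose produces n * stride, so I can trace the temporal dimension by hand (a small sketch; the helper names are mine):

import math

def encoder_lengths(n, num_layers, stride=2):
    """Temporal length after each 'same'-padded strided conv: ceil(n / stride)."""
    out = [n]
    for _ in range(num_layers):
        n = math.ceil(n / stride)
        out.append(n)
    return out

def decoder_lengths(n, num_layers, stride=2):
    """Temporal length after each 'same'-padded transposed conv: n * stride."""
    out = [n]
    for _ in range(num_layers):
        n *= stride
        out.append(n)
    return out

print(encoder_lengths(20, 2), decoder_lengths(5, 2))  # [20, 10, 5]    [5, 10, 20]
print(encoder_lengths(20, 3), decoder_lengths(3, 3))  # [20, 10, 5, 3] [3, 6, 12, 24]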
Why does having more than 2 layers mess up my final dimensions?