Update version number: now 0.2.0

Merge pull request #814 from matsuyamax/shapeinfer
Fix variable sharing issue with NonNeg constraint.
2015-10-10 17:51:59 -07:00 · 2015-10-10 17:17:41 -07:00 · 2015-10-10 16:50:33 -07:00 · 2015-10-10 13:09:37 -07:00 · 2015-10-10 20:50:26 +01:00 · 2015-10-09 22:47:43 -07:00
@@ -8,11 +8,15 @@ before_install:
  - export PATH=/home/travis/miniconda/bin:$PATH
  - conda update --yes conda
 python:
+  - "2.7"
  - "3.4"
 # command to install dependencies
 install:
  - conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py
-  # Coverage packages are on my binstar channel
-  - python setup.py install
+  - pip install pytest-cov python-coveralls
+  - pip install git+git://github.com/Theano/Theano.git
 # command to run tests
-script: py.test tests/
+script:
+  - PYTHONPATH=$PWD:$PYTHONPATH py.test -v --cov-report term-missing --cov keras tests/
+after_success:
+  - coveralls
@@ -34,13 +34,16 @@ from keras.layers.core import Dense, Dropout, Activation
 from keras.optimizers import SGD

 model = Sequential()
-model.add(Dense(20, 64, init='uniform'))
+# Dense(64) is a fully-connected layer with 64 hidden units.
+# in the first layer, you must specify the expected input data shape:
+# here, 20-dimensional vectors.
+model.add(Dense(64, input_dim=20, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform'))
+model.add(Dense(2, init='uniform'))
 model.add(Activation('softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -54,11 +57,11 @@ score = model.evaluate(X_test, y_test, batch_size=16)

 ```python
 model = Sequential()
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, input_dim=20, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform', activation='softmax'))
+model.add(Dense(2, init='uniform', activation='softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='mean_squared_error', optimizer=sgd)
@@ -73,26 +76,29 @@ from keras.layers.convolutional import Convolution2D, MaxPooling2D
 from keras.optimizers import SGD

 model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) 
+# input: 100x100 images with 3 channels -> (3, 100, 100) tensors.
+# this applies 32 convolution filters of size 3x3 each.
+model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
+model.add(Convolution2D(32, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) 
+model.add(Convolution2D(64, 3, 3, border_mode='valid'))
 model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3)) 
+model.add(Convolution2D(64, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

 model.add(Flatten())
-model.add(Dense(64*8*8, 256))
+# Note: Keras does automatic shape inference.
+model.add(Dense(256))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))

-model.add(Dense(256, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -111,10 +117,10 @@ from keras.layers.embeddings import Embedding
 from keras.layers.recurrent import LSTM

 model = Sequential()
-model.add(Embedding(max_features, 256))
-model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
+model.add(Embedding(max_features, 256, input_length=maxlen))
+model.add(LSTM(output_dim=128, activation='sigmoid', inner_activation='hard_sigmoid'))
 model.add(Dropout(0.5))
-model.add(Dense(128, 1))
+model.add(Dense(1))
 model.add(Activation('sigmoid'))

 model.compile(loss='binary_crossentropy', optimizer='rmsprop')
@@ -126,51 +132,67 @@ score = model.evaluate(X_test, Y_test, batch_size=16)
 ### Architecture for learning image captions with a convnet and a Gated Recurrent Unit:
 (word-level embedding, caption of maximum length 16 words).

-Note that getting this to actually "work" will require using a bigger convnet, initialized with pre-trained weights.
-Displaying readable results will also require an embedding decoder.
+Note that getting this to work well will require using a bigger convnet, initialized with pre-trained weights.

 ```python
 max_caption_len = 16
+vocab_size = 10000

-model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+# first, let's define an image model that
+# will encode pictures into 128-dimensional vectors.
+# it should be initialized with pre-trained weights.
+image_model = Sequential()
+image_model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(32, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(pool_size=(2, 2)))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3)) 
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Convolution2D(64, 3, 3, border_mode='full'))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(64, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(pool_size=(2, 2)))

-model.add(Convolution2D(128, 64, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(128, 128, 3, 3)) 
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Flatten())
+image_model.add(Dense(128))

-model.add(Flatten())
-model.add(Dense(128*4*4, 256))
-model.add(Activation('relu'))
-model.add(Dropout(0.5))
+# let's load the weights from a save file.
+image_model.load_weights('weight_file.h5')

-model.add(RepeatVector(max_caption_len)) 
-# the GRU below returns sequences of max_caption_len vectors of size 256 (our word embedding size)
-model.add(GRU(256, 256, return_sequences=True))
+# next, let's define a RNN model that encodes sequences of words
+# into sequences of 128-dimensional word vectors.
+language_model = Sequential()
+language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
+language_model.add(GRU(output_dim=128, return_sequences=True))
+language_model.add(Dense(128))

-model.compile(loss='mean_squared_error', optimizer='rmsprop')
+# let's repeat the image vector to turn it into a sequence.
+image_model.add(RepeatVector(max_caption_len))

-# "images" is a numpy array of shape (nb_samples, nb_channels=3, width, height) 
-# "captions" is a numpy array of shape (nb_samples, max_caption_len=16, embedding_dim=256)
-# captions are supposed already embedded (dense vectors).
-model.fit(images, captions, batch_size=16, nb_epoch=100)
-    
+# the output of both models will be tensors of shape (samples, max_caption_len, 128).
+# let's concatenate these 2 vector sequences.
+model = Merge([image_model, language_model], mode='concat', concat_axis=-1)
+# let's encode this vector sequence into a single vector
+model.add(GRU(output_dim=256, return_sequences=False))
+# which will be used to compute a probability
+# distribution over what the next word in the caption should be!
+model.add(Dense(vocab_size))
+model.add(Activation('softmax'))
+
+model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
+# "partial_captions" is a numpy integer array of shape (nb_samples, max_caption_len)
+# containing word index sequences representing partial captions.
+# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
+# containing a categorical encoding (0s and 1s) of the next word in the corresponding
+# partial caption.
+model.fit([images, partial_captions], next_words, batch_size=16, nb_epoch=100)
 ```

 In the examples folder, you will find example models for real datasets:
- CIFAR10 small images classification: Convnet with realtime data augmentation
+- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
 - IMDB movie review sentiment classification: LSTM over sequences of words
 - Reuters newswires topic classification: Multilayer Perceptron (MLP)
 - MNIST handwritten digits classification: MLP & CNN
@@ -183,7 +205,7 @@ In the examples folder, you will find example models for real datasets:

 For complete coverage of the API, check out [the Keras documentation](http://keras.io).

-A few highlights: convnets, LSTM, GRU, word2vec-style embeddings, PReLU, batch normalization...
+A few highlights: convnets, LSTM, GRU, word2vec-style embeddings, PReLU, BatchNormalization...

 ## Installation

@@ -196,7 +218,7 @@ Keras uses the following dependencies:
 - HDF5 and h5py (optional, required if you use model saving/loading functions)
 - Optional but recommended if you use CNNs: cuDNN.

-Once you have the dependencies installed, cd to the Keras folder and run the install command:
+To install, `cd` to the Keras folder and run the install command:
 ```
 sudo python setup.py install
 ```
@@ -213,4 +235,3 @@ Keras (κέρας) means _horn_ in Greek. It is a reference to a literary image
 Keras was developed as part of the research effort of project ONEIROS (Open-ended Neuro-Electronic Intelligent Robot Operating System).

 >_"Oneiroi are beyond our unravelling --who can be sure what tale they tell? Not all that men look for comes to pass. Two gates there are that give passage to fleeting Oneiroi; one is made of horn, one of ivory. The Oneiroi that pass through sawn ivory are deceitful, bearing a message that will not be fulfilled; those that come out through polished horn have truth behind them, to be accomplished for men who see them."_ Homer, Odyssey 19. 562 ff (Shewring translation).
-
@@ -14,6 +14,7 @@ pages:
 - Home: index.md
 - Index: documentation.md
 - Examples: examples.md
+- FAQ: faq.md
 - Optimizers: optimizers.md
 - Objectives: objectives.md
 - Models: models.md
@@ -6,12 +6,12 @@ Activations can either be used through an `Activation` layer, or through the `ac
 ```python
 from keras.layers.core import Activation, Dense

-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64))
 model.add(Activation('tanh'))
 ```
 is equivalent to:
 ```python
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, activation='tanh'))
 ```

 You can also pass an element-wise Theano function as an activation:
@@ -20,7 +20,7 @@ You can also pass an element-wise Theano function as an activation:
 def tanh(x):
    return theano.tensor.tanh(x)

-model.add(Dense(20, 64, init='uniform', activation=tanh))
+model.add(Dense(64, activation=tanh))
 model.add(Activation(tanh))
 ```

@@ -33,7 +33,10 @@ The `logs` dictionary will contain keys for quantities relevant to the current b
 keras.callbacks.ModelCheckpoint(filepath, verbose=0, save_best_only=False)
 ```

-Save the model after every epoch. If `save_best_only=True`, the latest best model according to the validation loss will not be overwritten. 
+Save the model after every epoch. If `save_best_only=True`, the latest best model according to the validation loss will not be overwritten.
+`filepath` can contain named formatting options, which will be filled with the value of `epoch` and the keys in `logs` (passed in `on_epoch_end`).
+
+For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, then multiple files will be saved, each with the epoch number and the validation loss in its filename.


 ```python
@@ -72,7 +75,7 @@ class LossHistory(keras.callbacks.Callback):
        self.losses.append(logs.get('loss'))

 model = Sequential()
-model.add(Dense(784, 10, init='uniform'))
+model.add(Dense(10, input_dim=784, init='uniform'))
 model.add(Activation('softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

@@ -94,7 +97,7 @@ print history.losses
 from keras.callbacks import ModelCheckpoint

 model = Sequential()
-model.add(Dense(784, 10, init='uniform'))
+model.add(Dense(10, input_dim=784, init='uniform'))
 model.add(Activation('softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

@@ -12,7 +12,7 @@ These layers expose 2 keyword arguments:

 ```python
 from keras.constraints import maxnorm
-model.add(Dense(64, 64, W_constraint = maxnorm(2)))
+model.add(Dense(64, W_constraint = maxnorm(2)))
 ```

 ## Available constraints
@@ -5,6 +5,7 @@
 - [Home](index.md)
 - [Index](documentation.md)
 - [Examples](examples.md)
+- [FAQ](faq.md)

 ---

@@ -1,7 +1,7 @@

 Here are a few examples to get you started!

-### Multilayer Perceptron (MLP)
+### Multilayer Perceptron (MLP):

 ```python
 from keras.models import Sequential
@@ -9,13 +9,16 @@ from keras.layers.core import Dense, Dropout, Activation
 from keras.optimizers import SGD

 model = Sequential()
-model.add(Dense(20, 64, init='uniform'))
+# Dense(64) is a fully-connected layer with 64 hidden units.
+# in the first layer, you must specify the expected input data shape:
+# here, 20-dimensional vectors.
+model.add(Dense(64, input_dim=20, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64, init='uniform'))
 model.add(Activation('tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform'))
+model.add(Dense(2, init='uniform'))
 model.add(Activation('softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -25,25 +28,21 @@ model.fit(X_train, y_train, nb_epoch=20, batch_size=16)
 score = model.evaluate(X_test, y_test, batch_size=16)
 ```

---
-
-### Alternative implementation of MLP
+### Alternative implementation of MLP:

 ```python
 model = Sequential()
-model.add(Dense(20, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, input_dim=20, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 64, init='uniform', activation='tanh'))
+model.add(Dense(64, init='uniform', activation='tanh'))
 model.add(Dropout(0.5))
-model.add(Dense(64, 2, init='uniform', activation='softmax'))
+model.add(Dense(2, init='uniform', activation='softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
 model.compile(loss='mean_squared_error', optimizer=sgd)
 ```

---
-
-### VGG-like convnet
+### VGG-like convnet:

 ```python
 from keras.models import Sequential
@@ -52,26 +51,29 @@ from keras.layers.convolutional import Convolution2D, MaxPooling2D
 from keras.optimizers import SGD

 model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) 
+# input: 100x100 images with 3 channels -> (3, 100, 100) tensors.
+# this applies 32 convolution filters of size 3x3 each.
+model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
+model.add(Convolution2D(32, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) 
+model.add(Convolution2D(64, 3, 3, border_mode='valid'))
 model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3)) 
+model.add(Convolution2D(64, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

 model.add(Flatten())
-model.add(Dense(64*8*8, 256))
+# Note: Keras does automatic shape inference.
+model.add(Dense(256))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))

-model.add(Dense(256, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))

 sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
@@ -81,9 +83,7 @@ model.fit(X_train, Y_train, batch_size=32, nb_epoch=1)

 ```

---
-
-### Sequence classification with LSTM
+### Sequence classification with LSTM:

 ```python
 from keras.models import Sequential
@@ -92,11 +92,10 @@ from keras.layers.embeddings import Embedding
 from keras.layers.recurrent import LSTM

 model = Sequential()
-# Add a mask_zero=True to the Embedding connstructor if 0 is a left-padding value in your data
-model.add(Embedding(max_features, 256))
-model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
+model.add(Embedding(max_features, 256, input_length=maxlen))
+model.add(LSTM(output_dim=128, activation='sigmoid', inner_activation='hard_sigmoid'))
 model.add(Dropout(0.5))
-model.add(Dense(128, 1))
+model.add(Dense(1))
 model.add(Activation('sigmoid'))

 model.compile(loss='binary_crossentropy', optimizer='rmsprop')
@@ -105,59 +104,73 @@ model.fit(X_train, Y_train, batch_size=16, nb_epoch=10)
 score = model.evaluate(X_test, Y_test, batch_size=16)
 ```

---
+### Architecture for learning image captions with a convnet and a Gated Recurrent Unit:
+(word-level embedding, caption of maximum length 16 words).

-### Image captioning
-
-Architecture for learning image captions with a convnet and a Gated Recurrent Unit (word-level embedding, caption of maximum length 16 words).
-
-Note that getting this to actually "work" will require using a bigger convnet, initialized with pre-trained weights.
-Displaying readable results will also require an embedding decoder.
+Note that getting this to work well will require using a bigger convnet, initialized with pre-trained weights.

 ```python
 max_caption_len = 16
+vocab_size = 10000

-model = Sequential()
-model.add(Convolution2D(32, 3, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+# first, let's define an image model that
+# will encode pictures into 128-dimensional vectors.
+# it should be initialized with pre-trained weights.
+image_model = Sequential()
+image_model.add(Convolution2D(32, 3, 3, border_mode='full', input_shape=(3, 100, 100)))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(32, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(pool_size=(2, 2)))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3)) 
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Convolution2D(64, 3, 3, border_mode='full'))
+image_model.add(Activation('relu'))
+image_model.add(Convolution2D(64, 3, 3))
+image_model.add(Activation('relu'))
+image_model.add(MaxPooling2D(pool_size=(2, 2)))

-model.add(Convolution2D(128, 64, 3, 3, border_mode='full')) 
-model.add(Activation('relu'))
-model.add(Convolution2D(128, 128, 3, 3)) 
-model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+image_model.add(Flatten())
+image_model.add(Dense(128))

-model.add(Flatten())
-model.add(Dense(128*4*4, 256))
-model.add(Activation('relu'))
-model.add(Dropout(0.5))
+# let's load the weights from a save file.
+image_model.load_weights('weight_file.h5')

-model.add(RepeatVector(max_caption_len)) 
-# the GRU below returns sequences of max_caption_len vectors of size 256 (our word embedding size)
-model.add(GRU(256, 256, return_sequences=True))
+# next, let's define a RNN model that encodes sequences of words
+# into sequences of 128-dimensional word vectors.
+language_model = Sequential()
+language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
+language_model.add(GRU(output_dim=128, return_sequences=True))
+language_model.add(Dense(128))

-model.compile(loss='mean_squared_error', optimizer='rmsprop')
+# let's repeat the image vector to turn it into a sequence.
+image_model.add(RepeatVector(max_caption_len))

-# "images" is a numpy array of shape (nb_samples, nb_channels=3, width, height) 
-# "captions" is a numpy array of shape (nb_samples, max_caption_len=16, embedding_dim=256)
-# captions are supposed already embedded (dense vectors).
-model.fit(images, captions, batch_size=16, nb_epoch=100)
-    
+# the output of both models will be tensors of shape (samples, max_caption_len, 128).
+# let's concatenate these 2 vector sequences.
+model = Merge([image_model, language_model], mode='concat', concat_axis=-1)
+# let's encode this vector sequence into a single vector
+model.add(GRU(output_dim=256, return_sequences=False))
+# which will be used to compute a probability
+# distribution over what the next word in the caption should be!
+model.add(Dense(vocab_size))
+model.add(Activation('softmax'))
+
+model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
+# "partial_captions" is a numpy integer array of shape (nb_samples, max_caption_len)
+# containing word index sequences representing partial captions.
+# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
+# containing a categorical encoding (0s and 1s) of the next word in the corresponding
+# partial caption.
+model.fit([images, partial_captions], next_words, batch_size=16, nb_epoch=100)
 ```

---
-
-In the [examples folder](https://github.com/fchollet/keras/tree/master/examples), you will find example models for real datasets:
-
- CIFAR10 small images classification: Convnet with realtime data augmentation
+In the examples folder, you will find example models for real datasets:
+- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
 - IMDB movie review sentiment classification: LSTM over sequences of words
- Reuters newswires topic classification: Multilayer Perceptron
+- Reuters newswires topic classification: Multilayer Perceptron (MLP)
+- MNIST handwritten digits classification: MLP & CNN
+- Character-level text generation with LSTM
+
+...and more.
@@ -0,0 +1,179 @@
+# Keras FAQ: Frequently Asked Keras Questions
+
+[How can I run Keras on GPU?](#how-can-i-run-keras-on-gpu)
+
+[How can I save a Keras model?](#how-can-i-save-a-keras-model)
+
+[Why is the training loss much higher than the testing loss?](#why-is-the-training-loss-much-higher-than-the-testing-loss)
+
+[How can I visualize the output of an intermediate layer?](#how-can-i-visualize-the-output-of-an-intermediate-layer)
+
+[Isn't there a bug with Merge or Graph related to input concatenation?](#isnt-there-a-bug-with-merge-or-graph-related-to-input-concatenation)
+
+[How can I use Keras with datasets that don't fit in memory?](#how-can-i-use-keras-with-datasets-that-dont-fit-in-memory)
+
+[How can I interrupt training when the validation loss isn't decreasing anymore?](#how-can-i-interrupt-training-when-the-validation-loss-isnt-decreasing-anymore)
+
+[How is the validation split computed?](#how-is-the-validation-split-computed)
+
+[Is the data shuffled during training?](#is-the-data-shuffled-during-training)
+
+[How can I record the training / validation loss / accuracy at each epoch?](#how-can-i-record-the-training-validation-loss-accuracy-at-each-epoch)
+
+---
+
+### How can I run Keras on GPU?
+
+Method 1: use Theano flags.
+```bash
+THEANO_FLAGS=device=gpu,floatX=float32 python my_keras_script.py
+```
+
+The name 'gpu' might have to be changed depending on your device's identifier (e.g. `gpu0`, `gpu1`, etc).
+
+Method 2: set up your `.theanorc`: [Instructions](http://deeplearning.net/software/theano/library/config.html)
+
+Method 3: manually set `theano.config.device`, `theano.config.floatX` at the beginning of your code:
+```python
+import theano
+theano.config.device = 'gpu'
+theano.config.floatX = 'float32'
+```
+
+---
+
+### How can I save a Keras model?
+
+*It is not recommended to use pickle or cPickle to save a Keras model.*
+
+If you only need to save the architecture of a model, and not its weights, you can do:
+
+```python
+# save as JSON
+json_string = model.to_json()
+
+# save as YAML
+yaml_string = model.to_yaml()
+```
+
+You can then build a fresh model from this data:
+
+```python
+# model reconstruction from JSON:
+from keras.models import model_from_json, model_from_yaml
+model = model_from_json(json_string)
+
+# model reconstruction from YAML
+model = model_from_yaml(yaml_string)
+```
+
+If you need to save the weights of a model, you can do so in HDF5:
+```python
+model.save_weights('my_model_weights.h5')
+```
+
+Assuming you have code for instantiating your model, you can then load the weights you saved into a model with the same architecture:
+
+```python
+model.load_weights('my_model_weights.h5')
+```
+
+This leads us to a way to save and reconstruct models from only serialized data:
+```python
+json_string = model.to_json()
+open('my_model_architecture.json', 'w').write(json_string)
+model.save_weights('my_model_weights.h5')
+
+# elsewhere...
+model = model_from_json(open('my_model_architecture.json').read())
+model.load_weights('my_model_weights.h5')
+```
+
+---
+
+### Why is the training loss much higher than the testing loss?
+
+A Keras model has two modes: training and testing. Regularization mechanisms, such as Dropout and L1/L2 weight regularization, are turned off at testing time.
+
+Besides, the training loss is the average of the losses over each batch of training data. Because your model is changing over time, the loss over the first batches of an epoch is generally higher than over the last batches. On the other hand, the testing loss for an epoch is computed using the model as it is at the end of the epoch, resulting in a lower loss.
+
+---
+
+### How can I visualize the output of an intermediate layer?
+
+You can build a Theano function that will return the output of a certain layer given a certain input, for example:
+
+```python
+# with a Sequential model
+get_3rd_layer_output = theano.function([model.layers[0].input], 
+                                       model.layers[3].get_output(train=False))
+layer_output = get_3rd_layer_output(X)
+
+# with a Graph model
+get_conv_layer_output = theano.function([model.inputs[i].input for i in model.input_order],
+                                        model.outputs['conv'].get_output(train=False),
+                                        on_unused_input='ignore')
+conv_output = get_conv_layer_output(input_data_dict)
+```
+
+---
+
+### Isn't there a bug with Merge or Graph related to input concatenation?
+
+Yes, there was a known bug with tensor concatenation in Theano that was fixed in early 2015.
+Please upgrade to the latest version of Theano:
+
+```bash
+sudo pip install git+git://github.com/Theano/Theano.git
+```
+
+---
+
+### How can I use Keras with datasets that don't fit in memory?
+
+You can do batch training using `model.train_on_batch(X, y)` and `model.test_on_batch(X, y)`. See the [models documentation](models.md).
+
+You can also see batch training in action in our [CIFAR10 example](https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py).
+
+---
+
+### How can I interrupt training when the validation loss isn't decreasing anymore?
+
+You can use an `EarlyStopping` callback:
+
+```python
+from keras.callbacks import EarlyStopping
+early_stopping = EarlyStopping(monitor='val_loss', patience=2)
+model.fit(X, y, validation_split=0.2, callbacks=[early_stopping])
+```
+
+Find out more in the [callbacks documentation](callbacks.md).
+
+---
+
+### How is the validation split computed?
+
+If you set the `validation_split` argument in `model.fit` to e.g. 0.1, then the validation data used will be the *last 10%* of the data. If you set it to 0.25, it will be the last 25% of the data, etc.
+
+
+---
+
+### Is the data shuffled during training?
+
+Yes, if the `shuffle` argument in `model.fit` is set to `True` (which is the default), the training data will be randomly shuffled at each epoch.
+
+Validation data isn't shuffled.
+
+---
+
+
+### How can I record the training / validation loss / accuracy at each epoch?
+
+The `model.fit` method returns a `History` callback, which has a `history` attribute containing the lists of successive losses / accuracies.
+
+```python
+hist = model.fit(X, y, validation_split=0.2)
+print(hist.history)
+```
+
+---
@@ -46,9 +46,9 @@ Stacking layers is as easy as `.add()`:
 ```python
 from keras.layers.core import Dense, Activation

-model.add(Dense(input_dim=100, output_dim=64, init="glorot_uniform"))
+model.add(Dense(output_dim=64, input_dim=100, init="glorot_uniform"))
 model.add(Activation("relu"))
-model.add(Dense(input_dim=64, output_dim=10, init="glorot_uniform"))
+model.add(Dense(output_dim=10, init="glorot_uniform"))
 model.add(Activation("softmax"))
 ```

@@ -6,7 +6,7 @@ Initializations define the probability distribution used to set the initial rand
 The keyword arguments used for passing initializations to layers will depend on the layer. Usually it is simply `init`:

 ```python
-model.add(Dense(64, 64, init='uniform'))
+model.add(Dense(64, init='uniform'))
 ```

 ## Available initializations
@@ -7,7 +7,8 @@ keras.layers.advanced_activations.LeakyReLU(alpha=0.3)

 Special version of a Rectified Linear Unit that allows a small gradient when the unit is not active (`f(x) = alpha*x for x < 0`).

- __Input shape__: This layer does not assume a specific input shape. As a result, it cannot be used as the first layer in a model.
+
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -19,18 +20,16 @@ Special version of a Rectified Linear Unit that allows a small gradient when the
 ## PReLU

 ```python
-keras.layers.advanced_activations.PReLU(input_shape)
+keras.layers.advanced_activations.PReLU()
 ```

 Parametrized linear unit. Similar to a LeakyReLU, where each input unit has its alpha coefficient, and where these coefficients are learned during training.

- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

- __Arguments__:
-    - __input_shape__: tuple.
-
 - __References__:
    - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](http://arxiv.org/pdf/1502.01852v1.pdf)

@@ -39,18 +38,15 @@ Parametrized linear unit. Similar to a LeakyReLU, where each input unit has its
 ## ParametricSoftplus

 ```python
-keras.layers.advanced_activations.ParametricSoftplus(input_shape)
+keras.layers.advanced_activations.ParametricSoftplus()
 ```

 Parametric Softplus of the form: (`f(x) = alpha * (1 + exp(beta * x))`). This is essentially a smooth version of ReLU where the parameters control the sharpness of the rectification. The parameters are initialized to more closely approximate a ReLU than the standard `softplus`: `alpha` initialized to `0.2` and `beta`  initialized to `5.0`. The parameters are fit separately for each hidden unit.

- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape=...` when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

- __Arguments__:
-    - __input_shape__: tuple.
-
 - __References__:
    - [Inferring Nonlinear Neuronal Computation Based on Physiologically Plausible Inputs](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003143)

@@ -62,7 +58,8 @@ keras.layers.advanced_activations.ThresholdedLinear(theta)

 Parametrized linear unit. Provides a threshold near zero where values are zeroed.

- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -80,7 +77,7 @@ keras.layers.advanced_activations.ThresholdedReLu(theta)

 Parametrized rectified linear unit. Provides a threshold near zero where values are zeroed.

- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape=...` when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -2,22 +2,20 @@
 ## Convolution1D

 ```python
-keras.layers.convolutional.Convolution1D(input_dim, nb_filter, filter_length, 
+keras.layers.convolutional.Convolution1D(nb_filter, filter_length, 
        init='uniform', activation='linear', weights=None, 
        border_mode='valid', subsample_length=1, 
        W_regularizer=None, b_regularizer=None, W_constraint=None, 
-        b_constraint=None)
+        b_constraint=None, input_dim=None, input_length=None)
 ```

-Convolution operator for filtering neighborhoods of one-dimensional inputs.
-
+Convolution operator for filtering neighborhoods of one-dimensional inputs. When using this layer as the first layer in a model, either provide the keyword argument `input_dim` (int, e.g. 128 for sequences of 128-dimensional vectors), or `input_shape` (tuple of integers, e.g. `(10, 128)` for sequences of 10 128-dimensional vectors).

 - __Input shape__: 3D tensor with shape: `(nb_samples, steps, input_dim)`.

 - __Output shape__: 3D tensor with shape: `(nb_samples, steps, nb_filter)`. `steps` value might have changed due to padding.

 - __Arguments__:
-    - __input_dim__: Number of channels/dimensions in the input.
    - __nb_filter__: Number of convolution kernels to use (dimensionality of the output).
    - __filter_length__: The extension (spatial or temporal) of each filter.
    - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
@@ -30,31 +28,32 @@ Convolution operator for filtering neighborhoods of one-dimensional inputs.
    - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
    - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
    - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: Number of channels/dimensions in the input. Either this argument or the keyword argument `input_shape` must be provided when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 ---

 ## Convolution2D

 ```python
-keras.layers.convolutional.Convolution2D(nb_filter, stack_size, nb_row, nb_col, 
+keras.layers.convolutional.Convolution2D(nb_filter, nb_row, nb_col, 
        init='glorot_uniform', activation='linear', weights=None, 
        border_mode='valid', subsample=(1, 1),
        W_regularizer=None, b_regularizer=None, W_constraint=None)
 ```

-Convolution operator for filtering windows of two-dimensional inputs. 
+Convolution operator for filtering windows of two-dimensional inputs. When using this layer as the first layer in a model, provide the keyword argument `input_shape` (tuple of integers, does not include the sample axis), e.g. `input_shape=(3, 128, 128)` for 128x128 RGB pictures.

- __Input shape__: 4D tensor with shape: `(nb_samples, stack_size, nb_row, nb_col)`.
+- __Input shape__: 4D tensor with shape: `(nb_samples, channels, rows, cols)`.

- __Output shape__: 4D tensor with shape: `(nb_samples, nb_filter, nb_row, nb_col)`. `nb_row`, `nb_col` might have changed due to padding.
+- __Output shape__: 4D tensor with shape: `(nb_samples, nb_filter, rows, cols)`. `rows`, `cols` might have changed due to padding.


 - __Arguments__:

-    - __nb_filter__: Number of convolution kernels to use.
-    - __stack_size__: Number of channels in the input.
-    - __nb_row__: Number of rows in the convolution kernels
-    - __nb_col__: Number of columns in the convolution kernels
+    - __nb_filter__: Number of convolution filters to use.
+    - __nb_row__: Number of rows in the convolution kernel.
+    - __nb_col__: Number of columns in the convolution kernel.
    - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
    - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
    - __weights__: list of numpy arrays to set as initial weights.
@@ -90,7 +89,7 @@ keras.layers.convolutional.MaxPooling1D(pool_length=2, stride=None, ignore_borde
 ## MaxPooling2D

 ```python
-keras.layers.convolutional.MaxPooling2D(poolsize=(2, 2), ignore_border=True)
+keras.layers.convolutional.MaxPooling2D(pool_size=(2, 2), ignore_border=True)
 ```

 - __Input shape__: 4D tensor with shape: `(nb_samples, stack_size, nb_row, nb_col)`.
@@ -76,8 +76,9 @@ get_config()

 ## Dense
 ```python
-keras.layers.core.Dense(input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None \
-W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.Dense(output_dim, init='glorot_uniform', activation='linear', weights=None,
+W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+W_constraint=None, b_constraint=None, input_dim=None)
 ```

 Standard 1D fully-connected layer. 
@@ -88,7 +89,6 @@ Standard 1D fully-connect layer.

 - __Arguments__:

-    - __input_dim__: int >= 0. 
    - __output_dim__: int >= 0. 
    - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
    - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
@@ -98,21 +98,22 @@ Standard 1D fully-connect layer.
    - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
    - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
    - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model. 

 ---

 ## TimeDistributedDense
 ```python
-keras.layers.core.TimeDistributedDense(input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None \
-W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.TimeDistributedDense(output_dim, init='glorot_uniform', activation='linear', weights=None,
+W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None,
+input_dim=None, input_length=None)
 ```

 Fully-connected layer distributed over the time dimension. Useful after a recurrent network set to `return_sequences=True`.

- __Input shape__: 3D tensor with shape: `(nb_samples, nb_timesteps, input_dim)`.
+- __Input shape__: 3D tensor with shape: `(nb_samples, timesteps, input_dim)`.

 - __Arguments__:
-    - __input_dim__: int >= 0. 
    - __output_dim__: int >= 0. 
    - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
    - __activation__: name of activation function to use (see: [activations](../activations.md)), or alternatively, elementwise Theano function. If you don't specify anything, no activation is applied (ie. "linear" activation: a(x) = x).
@@ -122,12 +123,14 @@ Fully-connected layer distributed over the time dimension. Useful after a recurr
    - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
    - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
    - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 - __Example__:
 ```python
-# input shape: (nb_samples, nb_timesteps, 10)
-model.add(LSTM(10, 5, return_sequences=True)) # output shape: (nb_samples, nb_timesteps, 5)
-model.add(TimeDistributedDense(5, 10)) # output shape: (nb_samples, nb_timesteps, 10)
+# input shape: (nb_samples, timesteps, 10)
+model.add(LSTM(5, return_sequences=True, input_dim=10)) # output shape: (nb_samples, timesteps, 5)
+model.add(TimeDistributedDense(15)) # output shape: (nb_samples, timesteps, 15)
 ```


@@ -151,7 +154,7 @@ A customizable autoencoder model. If `output_reconstruction = True` then dim(inp

    - __decoder__: A [layer](./) or [layer container](./containers.md).
    
-    - __output_reconstruction__: If this is False the when .predict() is called the output is the deepest hidden layer's activation. Otherwise the output of the final decoder layer is presented. Be sure your validation data confirms to this logic if you decide to use any.
+    - __output_reconstruction__: If this is False, then when .predict() is called, the output is the deepest hidden layer's activation. Otherwise, the output of the final decoder layer is presented. Be sure your validation data conforms to this logic if you decide to use any.
    
    - __weights__: list of numpy arrays to set as initial weights. The list should have 1 element, of shape `(input_dim, output_dim)`.

@@ -160,8 +163,8 @@ A customizable autoencoder model. If `output_reconstruction = True` then dim(inp
 from keras.layers import containers

 # input shape: (nb_samples, 32)
-encoder = containers.Sequential([Dense(32, 16), Dense(16, 8)])
-decoder = containers.Sequential([Dense(8, 16), Dense(16, 32)])
+encoder = containers.Sequential([Dense(16, input_dim=32), Dense(8)])
+decoder = containers.Sequential([Dense(16, input_dim=8), Dense(32)])

 autoencoder = Sequential()
 autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=False))
@@ -176,7 +179,8 @@ keras.layers.core.Activation(activation)
 ```
 Apply an activation function to the input. 

- __Input shape__: This layer does not assume a specific input shape. As a result, it cannot be used as the first layer in a model.
+
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -193,7 +197,8 @@ keras.layers.core.Dropout(p)
 ```
 Apply dropout to the input. Dropout consists in randomly setting a fraction `p` of input units to 0 at each update during training time, which helps prevent overfitting. Reference: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)

- __Input shape__: This layer does not assume a specific input shape. 
+
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -206,24 +211,25 @@ Apply dropout to the input. Dropout consists in randomly setting a fraction `p`

 ## Reshape
 ```python
-keras.layers.core.Reshape(*dims)
+keras.layers.core.Reshape(dims)
 ```

 Reshape the input to a new shape containing the same number of units. 

- __Input shape__: This layer does not assume a specific input shape. 

- __Output shape__: `(nb_samples, *dims)`.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
+
+- __Output shape__: `(nb_samples, dims)`.

 - __Arguments__:

-    - *dims: integers. Dimensions of the new shape.
+    - __dims__: tuple of integers. Dimensions of the new shape.

 - __Example__:
 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 100)) # output shape: (nb_samples, 100)
-model.add(Reshape(10, 10))  # output shape: (nb_samples, 10, 10)
+model.add(Dense(100, input_dim=10)) # output shape: (nb_samples, 100)
+model.add(Reshape(dims=(10, 10)))  # output shape: (nb_samples, 10, 10)
 ```

 ---
@@ -235,7 +241,7 @@ keras.layers.core.Flatten()

 Convert a nD input to 1D. 

- __Input shape__: (nb_samples, *). This layer cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: `(nb_samples, nb_input_units)`.

@@ -250,7 +256,7 @@ Repeat the 1D input n times. Dimensions of input are assumed to be `(nb_samples,

 Note that the output is still a single tensor; `RepeatVector` does not split the data flow.

- __Input shape__: This layer does not assume a specific input shape. This layer cannot be used as the first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: `(nb_samples, n, input_dims)`.

@@ -265,18 +271,18 @@ keras.layers.core.Permute(dims)
 ```
 Permute the dimensions of the input data according to the given tuple. Sometimes useful for connecting RNNs and convnets together.

- __Input shape: This layer does not assume a specific input shape.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

- __Output shape: Same as the input shape, but with the dimensions re-ordered according to the ordering specified by the tuple.
+- __Output shape__: Same as the input shape, but with the dimensions re-ordered according to the ordering specified by the tuple.

- __Argument: tuple specifying the permutation scheme (e.g. `(2, 1)` permutes the first and second dimension of the input).
+- __Argument__: tuple specifying the permutation scheme (e.g. `(2, 1)` permutes the first and second dimension of the input).

 - __Example__:
 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 50)) # output shape: (nb_samples, 50)
-model.add(Reshape(10, 5)) # output shape: (nb_samples, 10, 5)
-model.add(Permute((2, 1))) #output shape: (nb_samples, 5, 10)
+model.add(Dense(50, input_dim=10)) # output shape: (nb_samples, 50)
+model.add(Reshape(dims=(10, 5))) # output shape: (nb_samples, 10, 5)
+model.add(Permute(dims=(2, 1))) #output shape: (nb_samples, 5, 10)
 ```

 ---
@@ -294,8 +300,9 @@ This layer can be used, for instance, to induce activation sparsity in the previ

 ## MaxoutDense
 ```python
-keras.layers.core.MaxoutDense(input_dim, output_dim, nb_feature=4, init='glorot_uniform', weights=None, \
-        W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None)
+keras.layers.core.MaxoutDense(output_dim, nb_feature=4, init='glorot_uniform', weights=None,
+        W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+        W_constraint=None, b_constraint=None, input_dim=None)
 ```

 A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `nb_feature` `Dense(input_dim, output_dim)` linear layers. This allows the layer to learn a convex, piecewise linear activation function over the inputs. See [this paper](http://arxiv.org/pdf/1302.4389.pdf) for more details. Note that this is a *linear* layer -- if you wish to apply an activation function (you shouldn't need to -- they are universal function approximators), an `Activation` layer must be added after it.
@@ -306,7 +313,6 @@ A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `n

 - __Arguments__:

-    - __input_dim__: int >= 0. 
    - __output_dim__: int >= 0. 
    - __nb_feature__: int >= 0. the number of features to create for the maxout. This is equivalent to the number of piecewise elements to be allowed for the activation function. 
    - __init__: name of initialization function for the weights of the layer (see: [initializations](../initializations.md)), or alternatively, Theano function to use for weights initialization. This parameter is only relevant if you don't pass a `weights` argument.
@@ -316,12 +322,12 @@ A dense maxout layer. A `MaxoutDense` layer takes the element-wise maximum of `n
    - __activity_regularizer__: instance of [ActivityRegularizer](../regularizers.md), applied to the network output.
    - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the main weights matrix.
    - __b_constraint__: instance of the [constraints](../constraints.md) module, applied to the bias.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.

 ```python
 # input shape: (nb_samples, 10)
-model.add(Dense(10, 100)) # output shape: (nb_samples, 100)
-model.add(MaxoutDense(100, 100, nb_feature=10)) # output shape: (nb_samples, 100)
-model.add(RepeatVector(2))  # output shape: (nb_samples, 2, 10)
+model.add(Dense(100, input_dim=10)) # output shape: (nb_samples, 100)
+model.add(MaxoutDense(50, nb_feature=10)) # output shape: (nb_samples, 50)
 ```

 ## Merge
@@ -329,27 +335,27 @@ model.add(RepeatVector(2))  # output shape: (nb_samples, 2, 10)
 keras.layers.core.Merge(models, mode='sum')
 ```

-Merge the output of a list of layers (or containers) into a single tensor, following one of two modes: `sum` or `concat`. 
+Merge the output of a list of layers (or containers) into a single tensor, following one of three modes: `sum`, `mul` or `concat`. 

 - __Arguments__:
    - __layers__: List of layers or [containers](/layers/containers/).
-    - __mode__: String, one of `{'sum', 'concat'}`. `sum` will simply sum the outputs of the layers (therefore all layers should have an output with the same shape). `concat` will concatenate the outputs along the last dimension (therefore all layers should have an output that only differ along the last dimension). 
+    - __mode__: String, one of `{'sum', 'mul', 'concat'}`. `sum` and `mul` will simply sum/multiply the outputs of the layers (therefore all layers should have an output with the same shape). `concat` will concatenate the outputs along the last dimension (therefore the outputs of all layers should only differ along the last dimension).

 - __Example__:

 ```python
 left = Sequential()
-left.add(Dense(784, 50))
+left.add(Dense(50, input_shape=(784,)))
 left.add(Activation('relu'))

 right = Sequential()
-right.add(Dense(784, 50))
+right.add(Dense(50, input_shape=(784,)))
 right.add(Activation('relu'))

 model = Sequential()
 model.add(Merge([left, right], mode='sum'))

-model.add(Dense(50, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -2,15 +2,15 @@
 ## Embedding

 ```python
-keras.layers.embeddings.Embedding(input_dim, output_dim, init='uniform', weights=None, W_regularizer=None, W_constraint=None, mask_zero=False)
+keras.layers.embeddings.Embedding(input_dim, output_dim, init='uniform', input_length=None, weights=None, W_regularizer=None, W_constraint=None, mask_zero=False)
 ```

 Turn positive integers (indexes) into dense vectors of fixed size,
 eg. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`

- __Input shape__: 2D tensor with shape: `(nb_samples, maxlen)`.
+- __Input shape__: 2D tensor with shape: `(nb_samples, sequence_length)`.

- __Output shape__: 3D tensor with shape: `(nb_samples, maxlen, output_dim)`.
+- __Output shape__: 3D tensor with shape: `(nb_samples, sequence_length, output_dim)`.

 - __Arguments__:

@@ -21,12 +21,13 @@ eg. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`
    - __W_regularizer__: instance of the [regularizers](../regularizers.md) module (eg. L1 or L2 regularization), applied to the embedding matrix.
    - __W_constraint__: instance of the [constraints](../constraints.md) module (eg. maxnorm, nonneg), applied to the embedding matrix.
 	- __mask_zero__: Whether or not the input value 0 is a special "padding" value that should be masked out. This is useful for [recurrent layers](recurrent.md) which may take variable length input. If this is `True` then all subsequent layers in the model need to support masking or an exception will be raised.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).


 ## WordContextProduct

 ```python
-keras.layers.embeddings.WordContextProduct(input_dim, proj_dim=128, 
+keras.layers.embeddings.WordContextProduct(input_dim, proj_dim=128,
        init='uniform', activation='sigmoid', weights=None)
 ```

@@ -6,9 +6,9 @@ keras.layers.noise.GaussianNoise(sigma)
 ```
 Apply to the input an additive zero-centred gaussian noise with standard deviation `sigma`. This is useful to mitigate overfitting (you could see it as a kind of random data augmentation). Gaussian Noise (GS) is a natural choice as corruption process for real valued inputs.

-The Gaussian noise is only added at training time.
+Only active at training time.

- __Input shape__: This layer does not assume a specific input shape. 
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -24,11 +24,9 @@ keras.layers.noise.GaussianDropout(p)
 ```
 Apply to the input a multiplicative one-centred gaussian noise with standard deviation `sqrt(p/(1-p))`. `p` refers to the drop probability, to match the Dropout layer syntax. 

-http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
+Only active at training time.

-The Gaussian noise is only used at training time.
-
- __Input shape__: This layer does not assume a specific input shape. 
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

@@ -36,3 +34,4 @@ The Gaussian noise is only used at training time.

    - __p__: float, drop probability as with Dropout.

+
@@ -2,17 +2,16 @@
 ## BatchNormalization

 ```python
-keras.layers.normalization.BatchNormalization(input_shape, epsilon=1e-6, weights=None)
+keras.layers.normalization.BatchNormalization(epsilon=1e-6, weights=None)
 ```

 Normalize the activations of the previous layer at each batch.

- __Input shape__: Same as `input_shape`. This layer cannot be used as first layer in a model.
+- __Input shape__: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.

 - __Output shape__: Same as input.

- __Arguments__:
-    - __input_shape__: tuple.
+- __Arguments__: 
    - __epsilon__: small float > 0. Fuzz parameter.
    - __weights__: Initialization weights. List of 2 numpy arrays, with shapes: `[(input_shape,), (input_shape,)]`

@@ -2,9 +2,9 @@
 ## SimpleRNN

 ```python
-keras.layers.recurrent.SimpleRNN(input_dim, output_dim, 
+keras.layers.recurrent.SimpleRNN(output_dim, 
        init='glorot_uniform', inner_init='orthogonal', activation='sigmoid', weights=None,
-        truncate_gradient=-1, return_sequences=False)
+        truncate_gradient=-1, return_sequences=False, input_dim=None, input_length=None)
 ```
 Fully connected RNN where the output is fed back to the input. 

@@ -18,23 +18,25 @@ Fully connected RNN where output is to fed back to input.


 - __Arguments__:
-    - __input_dim__: dimension of the input.
    - __output_dim__: dimension of the internal projections and the final output.
    - __init__: weight initialization function. Can be the name of an existing function (str), or a Theano function (see: [initializations](../initializations.md)).
    - __activation__: activation function. Can be the name of an existing function (str), or a Theano function (see: [activations](../activations.md)).
    - __weights__: list of numpy arrays to set as initial weights. The list should have 3 elements, of shapes: `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
    - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
    - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 ---

 ## SimpleDeepRNN

 ```python
-keras.layers.recurrent.SimpleDeepRNN(input_dim, output_dim, depth=3,
+keras.layers.recurrent.SimpleDeepRNN(output_dim, depth=3,
        init='glorot_uniform', inner_init='orthogonal', 
        activation='sigmoid', inner_activation='hard_sigmoid',
-        weights=None, truncate_gradient=-1, return_sequences=False)
+        weights=None, truncate_gradient=-1, return_sequences=False,
+        input_dim=None, input_length=None)
 ```
 Fully connected RNN where the output of multiple timesteps (up to "depth" steps in the past) is fed back to the input: 

@@ -64,6 +66,8 @@ Not a particularly useful model, included for demonstration purposes.
    - __weights__: list of numpy arrays to set as initial weights. The list should have depth+2 elements.
    - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
    - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).


 ---
@@ -74,7 +78,8 @@ Not a particularly useful model, included for demonstration purposes.
 keras.layers.recurrent.GRU(output_dim=128, 
        init='glorot_uniform', inner_init='orthogonal',
        activation='sigmoid', inner_activation='hard_sigmoid',
-        weights=None, truncate_gradient=-1, return_sequences=False)
+        weights=None, truncate_gradient=-1, return_sequences=False,
+        input_dim=None, input_length=None)
 ```

 Gated Recurrent Unit - Cho et al. 2014.
@@ -97,6 +102,8 @@ Gated Recurrent Unit - Cho et al. 2014.
    - __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements.
    - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
    - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 - __References__: 
    - [On the Properties of Neural Machine Translation: Encoder–Decoder Approaches](http://www.aclweb.org/anthology/W14-4012)
@@ -110,7 +117,8 @@ Gated Recurrent Unit - Cho et al. 2014.
 keras.layers.recurrent.LSTM(output_dim=128, 
        init='glorot_uniform', inner_init='orthogonal', forget_bias_init='one',
        activation='tanh', inner_activation='hard_sigmoid',
-        weights=None, truncate_gradient=-1, return_sequences=False)
+        weights=None, truncate_gradient=-1, return_sequences=False,
+        input_dim=None, input_length=None)
 ```

 Long Short-Term Memory unit - Hochreiter 1997.
@@ -134,6 +142,8 @@ Long-Short Term Memory unit - Hochreiter 1997.
    - __weights__: list of numpy arrays to set as initial weights. The list should have 12 elements.
    - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
    - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 - __References__: 
    - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) (original 1997 paper)
@@ -148,7 +158,8 @@ Long-Short Term Memory unit - Hochreiter 1997.
 keras.layers.recurrent.JZS1(output_dim=128, 
        init='glorot_uniform', inner_init='orthogonal', 
        activation='tanh', inner_activation='sigmoid',
-        weights=None, truncate_gradient=-1, return_sequences=False)
+        weights=None, truncate_gradient=-1, return_sequences=False,
+        input_dim=None, input_length=None)
 ```

 Top 3 RNN architectures evolved from the evaluation of thousands of models. They serve as alternatives to LSTMs and GRUs, and correspond to the `MUT1`, `MUT2`, and `MUT3` architectures described in the paper: An Empirical Exploration of Recurrent Network Architectures, Jozefowicz et al. 2015.
@@ -171,6 +182,8 @@ Top 3 RNN architectures evolved from the evaluation of thousands of models. Serv
    - __weights__: list of numpy arrays to set as initial weights. The list should have 9 elements.
    - __truncate_gradient__: Number of steps to use in truncated BPTT. See: [Theano "scan"](http://deeplearning.net/software/theano/library/scan.html).
    - __return_sequences__: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+    - __input_dim__: dimensionality of the input (integer). This argument (or alternatively, the keyword argument `input_shape`) is required when using this layer as the first layer in a model.
+    - __input_length__: Length of input sequences, when it is constant. This argument is required if you are going to connect `Flatten` then `Dense` layers downstream (without it, the shape of the dense outputs cannot be computed).

 - __References__: 
    - [An Empirical Exploration of Recurrent Network Architectures](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
@@ -27,8 +27,8 @@ model = keras.models.Sequential()
            - __shuffle__: boolean or str (for 'batch'). Whether to shuffle the samples at each epoch. 'batch' is a special option for dealing with the limitations of HDF5 data; it shuffles in batch-sized chunks.
            - __show_accuracy__: boolean. Whether to display class accuracy in the logs to stdout at each epoch.
            - __class_weight__: dictionary mapping classes to a weight value, used for scaling the loss function (during training only).
-            - __sample_weight__: list or numpy array with 1:1 mapping to the training samples, used for scaling the loss function (during training only). For time-distributed data, there is one weight per sample *per timestep*, i.e. if your output data is shaped `(nb_samples, timesteps, output_dim)`, your mask should be of shape `(nb_samples, timesteps)`. This allows you to mask out or reweight individual output timesteps, which is useful in sequence to sequence learning.
-    - __evaluate__(X, y, batch_size=128, show_accuracy=False, verbose=1): Show performance of the model over some validation data.
+            - __sample_weight__: list or numpy array with 1:1 mapping to the training samples, used for scaling the loss function (during training only). For time-distributed data, there is one weight per sample *per timestep*, i.e. if your output data is shaped `(nb_samples, timesteps, output_dim)`, your mask should be of shape `(nb_samples, timesteps, 1)`. This allows you to mask out or reweight individual output timesteps, which is useful in sequence to sequence learning.
+    - __evaluate__(X, y, batch_size=128, show_accuracy=False, verbose=1, sample_weight=None): Show performance of the model over some validation data.
        - __Return__: The loss score over the data, or a `(loss, accuracy)` tuple if `show_accuracy=True`.
        - __Arguments__: Same meaning as fit method above. verbose is used as a binary flag (progress bar or nothing).
    - __predict__(X, batch_size=128, verbose=1):
@@ -37,9 +37,9 @@ model = keras.models.Sequential()
    - __predict_classes__(X, batch_size=128, verbose=1): Return an array of class predictions for some test data.
        - __Return__: An array of labels for some test data.
        - __Arguments__: Same meaning as fit method above. verbose is used as a binary flag (progress bar or nothing).
-    - __train_on_batch__(X, y, accuracy=False): Single gradient update on one batch.
+    - __train_on_batch__(X, y, accuracy=False, class_weight=None, sample_weight=None): Single gradient update on one batch.
        - __Return__: loss over the data, or tuple `(loss, accuracy)` if `accuracy=True`.
-    - __test_on_batch__(X, y, accuracy=False): Single performance evaluation on one batch.
+    - __test_on_batch__(X, y, accuracy=False, sample_weight=None): Single performance evaluation on one batch.
        - __Return__: loss over the data, or tuple `(loss, accuracy)` if `accuracy=True`.
    - __save_weights__(fname, overwrite=False): Store the weights of all layers to a HDF5 file. If overwrite==False and the file already exists, an exception will be thrown.
    - __load_weights__(fname): Sets the weights of a model, based to weights stored by __save_weights__. You can only __load_weights__ on a savefile from a model with an identical architecture. __load_weights__ can be called either before or after the __compile__ step.
@@ -52,7 +52,7 @@ from keras.layers.core import Dense, Dropout, Activation
 from keras.optimizers import SGD

 model = Sequential()
-model.add(Dense(64, 2, init='uniform'))
+model.add(Dense(2, init='uniform', input_dim=64))
 model.add(Activation('softmax'))

 model.compile(loss='mse', optimizer='sgd')
@@ -125,10 +125,10 @@ Arbitrary connection graph. It can have any number of inputs and outputs, with e
 model = keras.models.Graph()
 ```
 - __Methods__:
-    - __add_input__(name, ndim=2, dtype='float'): Add an input with shape dimensionality `ndim`. 
+    - __add_input__(name, input_shape, dtype='float'): Add an input whose expected shape is given by `input_shape`.
        - __Arguments__:
-            - __ndim__: Use `ndim=2` for vector input `(samples, features)`, ndim=3 for temporal input `(samples, time, features)`, ndim=4 for image input `(samples, channels, height, width)`.
-            - __dtype__: `float` or `int`. Use `int` if the input is connected to an Embedding layer, `float` otherwise.
+            - __input_shape__: Integer tuple, shape of the expected input (not including the samples axis). E.g. (10,) for 10-dimensional vectors, (None, 128) for sequences (of variable length) of 128-dimensional vectors, (3, 32, 32) for 32x32 images with RGB channels.
+            - __dtype__: `float` or `int`. Type of the expected input data.
    - __add_output__(name, input=None, inputs=[], merge_mode='concat'): Add an output connect to `input` or `inputs`.
        - __Arguments__:
            - __name__: str. unique identifier of the output.
@@ -176,10 +176,10 @@ __Examples__:
 ```python
 # graph model with one input and two outputs
 graph = Graph()
-graph.add_input(name='input', ndim=2)
-graph.add_node(Dense(32, 16), name='dense1', input='input')
-graph.add_node(Dense(32, 4), name='dense2', input='input')
-graph.add_node(Dense(16, 4), name='dense3', input='dense1')
+graph.add_input(name='input', input_shape=(32,))
+graph.add_node(Dense(16), name='dense1', input='input')
+graph.add_node(Dense(4), name='dense2', input='input')
+graph.add_node(Dense(4), name='dense3', input='dense1')
 graph.add_output(name='output1', input='dense2')
 graph.add_output(name='output2', input='dense3')

@@ -191,11 +191,11 @@ history = graph.fit({'input':X_train, 'output1':y_train, 'output2':y2_train}, nb
 ```python
 # graph model with two inputs and one output
 graph = Graph()
-graph.add_input(name='input1', ndim=2)
-graph.add_input(name='input2', ndim=2)
-graph.add_node(Dense(32, 16), name='dense1', input='input1')
-graph.add_node(Dense(32, 4), name='dense2', input='input2')
-graph.add_node(Dense(16, 4), name='dense3', input='dense1')
+graph.add_input(name='input1', input_shape=(32,))
+graph.add_input(name='input2', input_shape=(32,))
+graph.add_node(Dense(16), name='dense1', input='input1')
+graph.add_node(Dense(4), name='dense2', input='input2')
+graph.add_node(Dense(4), name='dense3', input='dense1')
 graph.add_output(name='output', inputs=['dense2', 'dense3'], merge_mode='sum')
 graph.compile('rmsprop', {'output':'mse'})

@@ -5,7 +5,7 @@ An optimizer is one of the two arguments required for compiling a Keras model:

 ```python
 model = Sequential()
-model.add(Dense(20, 64, init='uniform'))
+model.add(Dense(64, init='uniform', input_dim=10))
 model.add(Activation('tanh'))
 model.add(Activation('softmax'))

@@ -15,7 +15,7 @@ These layers expose 3 keyword arguments:

 ```python
 from keras.regularizers import l2, activity_l2
-model.add(Dense(64, 64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
+model.add(Dense(64, input_dim=64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
 ```

 ## Available penalties
@@ -1,9 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 from keras.models import Sequential, slice_X
-from keras.layers.core import Activation, Dense, RepeatVector
+from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
 from keras.layers import recurrent
-from sklearn.utils import shuffle
 import numpy as np

 """
@@ -25,18 +24,15 @@ and
 http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
 Theoretically it introduces shorter term dependencies between source and target.

-
 Two digits inverted:
 + One layer JZS1 (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs

 Three digits inverted:
 + One layer JZS1 (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs

-
 Four digits inverted:
 + One layer JZS1 (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs

-
 Five digits inverted:
 + One layer JZS1 (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs

@@ -122,23 +118,32 @@ for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, maxlen=DIGITS + 1)

 # Shuffle (X, y) in unison as the later parts of X will almost all be larger digits
-X, y = shuffle(X, y)
+indices = np.arange(len(y))
+np.random.shuffle(indices)
+X = X[indices]
+y = y[indices]
 # Explicitly set apart 10% for validation data that we never train over
 split_at = len(X) - len(X) / 10
 (X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
 (y_train, y_val) = (y[:split_at], y[split_at:])

+print(X_train.shape)
+print(y_train.shape)
+
 print('Build model...')
 model = Sequential()
 # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
-model.add(RNN(len(chars), HIDDEN_SIZE))
+# note: in a situation where your input sequences have a variable length,
+# use input_shape=(None, nb_feature).
+model.add(RNN(HIDDEN_SIZE, input_shape=(None, len(chars))))
 # For the decoder's input, we repeat the encoded input for each time step
 model.add(RepeatVector(DIGITS + 1))
 # The decoder RNN could be multiple layers stacked or a single layer
 for _ in xrange(LAYERS):
-    model.add(RNN(HIDDEN_SIZE, HIDDEN_SIZE, return_sequences=True))
+    model.add(RNN(HIDDEN_SIZE, return_sequences=True))
+
 # For each of step of the output sequence, decide which character should be chosen
-model.add(Dense(HIDDEN_SIZE, len(chars)))
+model.add(TimeDistributedDense(len(chars)))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer='adam')
@@ -148,7 +153,7 @@ for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
-    model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)
+    model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)
    ###
    # Select 10 samples from the validation set at random so we can visualize errors
    for i in xrange(10):
@@ -181,15 +181,15 @@ print('Build model...')

 sentrnn = Sequential()
 sentrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, mask_zero=True))
-sentrnn.add(RNN(EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, return_sequences=False))
+sentrnn.add(RNN(SENT_HIDDEN_SIZE, return_sequences=False))

 qrnn = Sequential()
 qrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE))
-qrnn.add(RNN(EMBED_HIDDEN_SIZE, QUERY_HIDDEN_SIZE, return_sequences=False))
+qrnn.add(RNN(QUERY_HIDDEN_SIZE, return_sequences=False))

 model = Sequential()
 model.add(Merge([sentrnn, qrnn], mode='concat'))
-model.add(Dense(SENT_HIDDEN_SIZE + QUERY_HIDDEN_SIZE, vocab_size, activation='softmax'))
+model.add(Dense(vocab_size, activation='softmax'))

 model.compile(optimizer='adam', loss='categorical_crossentropy', class_mode='categorical')

@@ -28,6 +28,11 @@ nb_classes = 10
 nb_epoch = 200
 data_augmentation = True

+# input image dimensions
+img_rows, img_cols = 32, 32
+# the CIFAR10 images are RGB
+img_channels = 3
+
 # the data, shuffled and split between tran and test sets
 (X_train, y_train), (X_test, y_test) = cifar10.load_data()
 print('X_train shape:', X_train.shape)
@@ -40,26 +45,26 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)

 model = Sequential()

-model.add(Convolution2D(32, 3, 3, 3, border_mode='full'))
+model.add(Convolution2D(32, 3, 3, border_mode='full',
+                        input_shape=(img_channels, img_rows, img_cols)))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
+model.add(Convolution2D(32, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full'))
+model.add(Convolution2D(64, 3, 3, border_mode='full'))
 model.add(Activation('relu'))
-model.add(Convolution2D(64, 64, 3, 3))
+model.add(Convolution2D(64, 3, 3))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

 model.add(Flatten())
-model.add(Dense(64*8*8, 512))
+model.add(Dense(512))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))
-
-model.add(Dense(512, nb_classes))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))

 # let's train the model using SGD + momentum (how original).
@@ -1,7 +1,7 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
-np.random.seed(1337) # for reproducibility
+np.random.seed(1337)  # for reproducibility

 from keras.preprocessing import sequence
 from keras.optimizers import RMSprop
@@ -25,7 +25,7 @@ max_features = 5000
 maxlen = 100
 batch_size = 32
 embedding_dims = 100
-nb_filters = 250
+nb_filter = 250
 filter_length = 3
 hidden_dims = 250
 nb_epoch = 3
@@ -47,35 +47,29 @@ model = Sequential()

 # we start off with an efficient embedding layer which maps
 # our vocab indices into embedding_dims dimensions
-model.add(Embedding(max_features, embedding_dims))
+model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
 model.add(Dropout(0.25))

-# we add a Convolution1D, which will learn nb_filters
+# we add a Convolution1D, which will learn nb_filter
 # word group filters of size filter_length:
-model.add(Convolution1D(input_dim=embedding_dims,
-                        nb_filter=nb_filters,
+model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode="valid",
                        activation="relu",
                        subsample_length=1))
-
 # we use standard max pooling (halving the output of the previous layer):
 model.add(MaxPooling1D(pool_length=2))

 # We flatten the output of the conv layer, so that we can add a vanilla dense layer:
 model.add(Flatten())

-# Computing the output shape of a conv layer can be tricky;
-# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
-output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
-
 # We add a vanilla hidden layer:
-model.add(Dense(output_size, hidden_dims))
+model.add(Dense(hidden_dims))
 model.add(Dropout(0.25))
 model.add(Activation('relu'))

 # We project onto a single unit output layer, and squash it with a sigmoid:
-model.add(Dense(hidden_dims, 1))
+model.add(Dense(1))
 model.add(Activation('sigmoid'))

 model.compile(loss='binary_crossentropy', optimizer='rmsprop', class_mode="binary")
@@ -48,10 +48,10 @@ print('X_test shape:', X_test.shape)

 print('Build model...')
 model = Sequential()
-model.add(Embedding(max_features, 128))
-model.add(LSTM(128, 128))  # try using a GRU instead, for fun
+model.add(Embedding(max_features, 128, input_length=maxlen))
+model.add(LSTM(128))  # try using a GRU instead, for fun
 model.add(Dropout(0.5))
-model.add(Dense(128, 1))
+model.add(Dense(1))
 model.add(Activation('sigmoid'))

 # try using different optimizers and different optimizer configs
@@ -20,11 +20,11 @@ from sklearn.preprocessing import StandardScaler

    Compatible Python 2.7-3.4. Requires Scikit-Learn and Pandas.

-    Recommended to run on GPU: 
+    Recommended to run on GPU:
        Command: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python kaggle_otto_nn.py
        On EC2 g2.2xlarge instance: 19s/epoch. 6-7 minutes total training time.

-    Best validation score at epoch 21: 0.4881 
+    Best validation score at epoch 21: 0.4881

    Try it at home:
        - with/without BatchNormalization (BatchNormalization helps!)
@@ -78,7 +78,6 @@ def make_submission(y_prob, ids, encoder, fname):
            f.write('\n')
    print("Wrote submission to file {}.".format(fname))

-
 print("Loading data...")
 X, labels = load_data('train.csv', train=True)
 X, scaler = preprocess_data(X)
@@ -96,31 +95,29 @@ print(dims, 'dims')
 print("Building model...")

 model = Sequential()
-model.add(Dense(dims, 512, init='glorot_uniform'))
-model.add(PReLU((512,)))
-model.add(BatchNormalization((512,)))
+model.add(Dense(512, input_shape=(dims,)))
+model.add(PReLU())
+model.add(BatchNormalization())
 model.add(Dropout(0.5))

-model.add(Dense(512, 512, init='glorot_uniform'))
-model.add(PReLU((512,)))
-model.add(BatchNormalization((512,)))
+model.add(Dense(512))
+model.add(PReLU())
+model.add(BatchNormalization())
 model.add(Dropout(0.5))

-model.add(Dense(512, 512, init='glorot_uniform'))
-model.add(PReLU((512,)))
-model.add(BatchNormalization((512,)))
+model.add(Dense(512))
+model.add(PReLU())
+model.add(BatchNormalization())
 model.add(Dropout(0.5))

-model.add(Dense(512, nb_classes, init='glorot_uniform'))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer="adam")

 print("Training model...")
-
 model.fit(X, y, nb_epoch=20, batch_size=128, validation_split=0.15)

 print("Generating submission...")
-
 proba = model.predict_proba(X_test)
 make_submission(proba, ids, encoder, fname='keras-otto.csv')
@@ -4,7 +4,8 @@ from keras.layers.core import Dense, Activation, Dropout
 from keras.layers.recurrent import LSTM
 from keras.datasets.data_utils import get_file
 import numpy as np
-import random, sys
+import random
+import sys

 '''
    Example script to generate text from Nietzsche's writings.
@@ -15,7 +16,7 @@ import random, sys
    It is recommended to run this script on GPU, as recurrent
    networks are quite computationally intensive.

-    If you try this script on new data, make sure your corpus 
+    If you try this script on new data, make sure your corpus
    has at least ~100k characters. ~1M is better.
 '''

@@ -34,7 +35,7 @@ step = 3
 sentences = []
 next_chars = []
 for i in range(0, len(text) - maxlen, step):
-    sentences.append(text[i : i + maxlen])
+    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
 print('nb sequences:', len(sentences))

@@ -50,20 +51,21 @@ for i, sentence in enumerate(sentences):
 # build the model: 2 stacked LSTM
 print('Build model...')
 model = Sequential()
-model.add(LSTM(len(chars), 512, return_sequences=True))
+model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
 model.add(Dropout(0.2))
-model.add(LSTM(512, 512, return_sequences=False))
+model.add(LSTM(512, return_sequences=False))
 model.add(Dropout(0.2))
-model.add(Dense(512, len(chars)))
+model.add(Dense(len(chars)))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

-# helper function to sample an index from a probability array
+
 def sample(a, temperature=1.0):
-    a = np.log(a)/temperature
-    a = np.exp(a)/np.sum(np.exp(a))
-    return np.argmax(np.random.multinomial(1,a,1))
+    # helper function to sample an index from a probability array
+    a = np.log(a) / temperature
+    a = np.exp(a) / np.sum(np.exp(a))
+    return np.argmax(np.random.multinomial(1, a, 1))

 # train the model, output generated text after each iteration
 for iteration in range(1, 60):
@@ -79,7 +81,7 @@ for iteration in range(1, 60):
        print('----- diversity:', diversity)

        generated = ''
-        sentence = text[start_index : start_index + maxlen]
+        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
@@ -22,11 +22,20 @@ batch_size = 128
 nb_classes = 10
 nb_epoch = 12

+# input image dimensions
+img_rows, img_cols = 28, 28
+# number of convolutional filters to use
+nb_filters = 32
+# size of pooling area for max pooling
+nb_pool = 2
+# convolution kernel size
+nb_conv = 3
+
 # the data, shuffled and split between tran and test sets
 (X_train, y_train), (X_test, y_test) = mnist.load_data()

-X_train = X_train.reshape(X_train.shape[0], 1, 28, 28)
-X_test = X_test.reshape(X_test.shape[0], 1, 28, 28)
+X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
+X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
 X_train = X_train.astype("float32")
 X_test = X_test.astype("float32")
 X_train /= 255
@@ -41,19 +50,20 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)

 model = Sequential()

-model.add(Convolution2D(32, 1, 3, 3, border_mode='full'))
+model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
+                        border_mode='full',
+                        input_shape=(1, img_rows, img_cols)))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 32, 3, 3))
+model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
 model.add(Dropout(0.25))

 model.add(Flatten())
-model.add(Dense(32*196, 128))
+model.add(Dense(128))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))
-
-model.add(Dense(128, nb_classes))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer='adadelta')
@@ -55,11 +55,12 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)

 print('Evaluate IRNN...')
 model = Sequential()
-model.add(SimpleRNN(input_dim=1, output_dim=hidden_units,
+model.add(SimpleRNN(output_dim=hidden_units,
                    init=lambda shape: normal(shape, scale=0.001),
                    inner_init=lambda shape: identity(shape, scale=1.0),
-                    activation='relu', truncate_gradient=BPTT_truncate))
-model.add(Dense(hidden_units, nb_classes))
+                    activation='relu', truncate_gradient=BPTT_truncate,
+                    input_shape=(None, 1)))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))
 rmsprop = RMSprop(lr=learning_rate)
 model.compile(loss='categorical_crossentropy', optimizer=rmsprop)
@@ -73,8 +74,8 @@ print('IRNN test accuracy:', scores[1])

 print('Compare to LSTM...')
 model = Sequential()
-model.add(LSTM(1, hidden_units))
-model.add(Dense(hidden_units, nb_classes))
+model.add(LSTM(hidden_units, input_shape=(None, 1)))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))
 rmsprop = RMSprop(lr=learning_rate)
 model.compile(loss='categorical_crossentropy', optimizer=rmsprop)
@@ -37,13 +37,13 @@ Y_train = np_utils.to_categorical(y_train, nb_classes)
 Y_test = np_utils.to_categorical(y_test, nb_classes)

 model = Sequential()
-model.add(Dense(784, 128))
+model.add(Dense(128, input_shape=(784,)))
 model.add(Activation('relu'))
 model.add(Dropout(0.2))
-model.add(Dense(128, 128))
+model.add(Dense(128))
 model.add(Activation('relu'))
 model.add(Dropout(0.2))
-model.add(Dense(128, 10))
+model.add(Dense(10))
 model.add(Activation('softmax'))

 rms = RMSprop()
@@ -45,10 +45,10 @@ print('Y_test shape:', Y_test.shape)

 print("Building model...")
 model = Sequential()
-model.add(Dense(max_words, 512))
+model.add(Dense(512, input_shape=(max_words,)))
 model.add(Activation('relu'))
 model.add(Dropout(0.5))
-model.add(Dense(512, nb_classes))
+model.add(Dense(nb_classes))
 model.add(Activation('softmax'))

 model.compile(loss='categorical_crossentropy', optimizer='adam')
@@ -32,7 +32,7 @@ from __future__ import print_function

 import numpy as np
 import theano
-import six.moves.cPickle
+from six.moves import cPickle
 import os, re, json

 from keras.preprocessing import sequence, text
@@ -90,7 +90,7 @@ def text_generator(path=data_path):
 # model management
 if load_tokenizer:
    print('Load tokenizer...')
-    tokenizer = six.moves.cPickle.load(open(os.path.join(save_dir, tokenizer_fname), 'rb'))
+    tokenizer = cPickle.load(open(os.path.join(save_dir, tokenizer_fname), 'rb'))
 else:
    print("Fit tokenizer...")
    tokenizer = text.Tokenizer(nb_words=max_features)
@@ -99,13 +99,13 @@ else:
        print("Save tokenizer...")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
-        six.moves.cPickle.dump(tokenizer, open(os.path.join(save_dir, tokenizer_fname), "wb"))
+        cPickle.dump(tokenizer, open(os.path.join(save_dir, tokenizer_fname), "wb"))

 # training process
 if train_model:
    if load_model:
        print('Load model...')
-        model = six.moves.cPickle.load(open(os.path.join(save_dir, model_load_fname), 'rb'))
+        model = cPickle.load(open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        model = Sequential()
@@ -129,7 +129,7 @@ if train_model:
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
-                loss = model.train(X, labels)
+                loss = model.train_on_batch(X, labels)
                losses.append(loss)
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
@@ -142,7 +142,7 @@ if train_model:
        print("Saving model...")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
-        six.moves.cPickle.dump(model, open(os.path.join(save_dir, model_save_fname), "wb"))
+        cPickle.dump(model, open(os.path.join(save_dir, model_save_fname), "wb"))


 print("It's test time!")
@@ -158,7 +158,6 @@ norm_weights = np_utils.normalize(weights)

 word_index = tokenizer.word_index
 reverse_word_index = dict([(v, k) for k, v in list(word_index.items())])
-word_index = tokenizer.word_index


 def embed_word(w):
@@ -17,7 +17,7 @@ def softplus(x):


 def relu(x):
-    return (x + abs(x)) / 2.0
+    return T.nnet.relu(x)


 def tanh(x):
@@ -99,10 +99,11 @@ class Callback(object):
 class BaseLogger(Callback):
    def on_train_begin(self, logs={}):
        self.verbose = self.params['verbose']
+        self.nb_epoch = self.params['nb_epoch']

    def on_epoch_begin(self, epoch, logs={}):
        if self.verbose:
-            print('Epoch %d' % epoch)
+            print('Epoch %d/%d' % (epoch + 1, self.nb_epoch))
            self.progbar = Progbar(target=self.params['nb_sample'],
                                   verbose=self.verbose)
        self.seen = 0
@@ -182,6 +183,7 @@ class ModelCheckpoint(Callback):
        self.best = np.Inf

    def on_epoch_end(self, epoch, logs={}):
+        filepath = self.filepath.format(epoch=epoch, **logs)
        if self.save_best_only:
            current = logs.get(self.monitor)
            if current is None:
@@ -190,16 +192,16 @@ class ModelCheckpoint(Callback):
                if current < self.best:
                    if self.verbose > 0:
                        print("Epoch %05d: %s improved from %0.5f to %0.5f, saving model to %s"
-                              % (epoch, self.monitor, self.best, current, self.filepath))
+                              % (epoch, self.monitor, self.best, current, filepath))
                    self.best = current
-                    self.model.save_weights(self.filepath, overwrite=True)
+                    self.model.save_weights(filepath, overwrite=True)
                else:
                    if self.verbose > 0:
                        print("Epoch %05d: %s did not improve" % (epoch, self.monitor))
        else:
            if self.verbose > 0:
-                print("Epoch %05d: saving model to %s" % (epoch, self.filepath))
-            self.model.save_weights(self.filepath, overwrite=True)
+                print("Epoch %05d: saving model to %s" % (epoch, filepath))
+            self.model.save_weights(filepath, overwrite=True)


 class EarlyStopping(Callback):
@@ -271,4 +273,4 @@ class LearningRateScheduler(Callback):
        self.schedule = schedule

    def on_epoch_begin(self, epoch, logs={}):
-        model.lr.set_value(self.schedule(epoch))
+        self.model.optimizer.lr.set_value(self.schedule(epoch))
@@ -29,7 +29,8 @@ class MaxNorm(Constraint):

 class NonNeg(Constraint):
    def __call__(self, p):
-        p *= T.ge(p, 0)
+        p = theano.shared(p)
+        p *= T.ge(p, 0.)
        return p


@@ -1,15 +1,15 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 import sys
-import six.moves.cPickle
+from six.moves import cPickle
 from six.moves import range

 def load_batch(fpath, label_key='labels'):
    f = open(fpath, 'rb')
    if sys.version_info < (3,):
-        d = six.moves.cPickle.load(f)
+        d = cPickle.load(f)
    else:
-        d = six.moves.cPickle.load(f, encoding="bytes")
+        d = cPickle.load(f, encoding="bytes")
        # decode utf8
        for k, v in d.items():
            del(d[k])
@@ -1,5 +1,5 @@
 from __future__ import absolute_import
-import six.moves.cPickle
+from six.moves import cPickle
 import gzip
 from .data_utils import get_file
 import random
@@ -17,7 +17,7 @@ def load_data(path="imdb.pkl", nb_words=None, skip_top=0, maxlen=None, test_spli
    else:
        f = open(path, 'rb')

-    X, labels = six.moves.cPickle.load(f)
+    X, labels = cPickle.load(f)
    f.close()

    np.random.seed(seed)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import gzip
 from .data_utils import get_file
-import six.moves.cPickle
+from six.moves import cPickle
 import sys


@@ -14,9 +14,9 @@ def load_data(path="mnist.pkl.gz"):
        f = open(path, 'rb')

    if sys.version_info < (3,):
-        data = six.moves.cPickle.load(f)
+        data = cPickle.load(f)
    else:
-        data = six.moves.cPickle.load(f, encoding="bytes")
+        data = cPickle.load(f, encoding="bytes")

    f.close()

@@ -5,7 +5,7 @@ from .data_utils import get_file
 import string
 import random
 import os
-import six.moves.cPickle
+from six.moves import cPickle
 from six.moves import zip
 import numpy as np

@@ -78,8 +78,8 @@ def make_reuters_dataset(path=os.path.join('datasets', 'temp', 'reuters21578'),
    dataset = (X, labels)
    print('-')
    print('Saving...')
-    six.moves.cPickle.dump(dataset, open(os.path.join('datasets', 'data', 'reuters.pkl'), 'w'))
-    six.moves.cPickle.dump(tokenizer.word_index, open(os.path.join('datasets', 'data', 'reuters_word_index.pkl'), 'w'))
+    cPickle.dump(dataset, open(os.path.join('datasets', 'data', 'reuters.pkl'), 'w'))
+    cPickle.dump(tokenizer.word_index, open(os.path.join('datasets', 'data', 'reuters_word_index.pkl'), 'w'))


 def load_data(path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None, test_split=0.2, seed=113,
@@ -88,7 +88,7 @@ def load_data(path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None, test_s
    path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters.pkl")
    f = open(path, 'rb')

-    X, labels = six.moves.cPickle.load(f)
+    X, labels = cPickle.load(f)
    f.close()

    np.random.seed(seed)
@@ -140,7 +140,7 @@ def load_data(path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None, test_s
 def get_word_index(path="reuters_word_index.pkl"):
    path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters_word_index.pkl")
    f = open(path, 'rb')
-    return six.moves.cPickle.load(f)
+    return cPickle.load(f)


 if __name__ == "__main__":
@@ -58,7 +58,7 @@ def he_uniform(shape):


 def orthogonal(shape, scale=1.1):
-    ''' From Lasagne
+    ''' From Lasagne. Reference: Saxe et al., http://arxiv.org/abs/1312.6120
    '''
    flat_shape = (shape[0], np.prod(shape[1:]))
    a = np.random.normal(0.0, 1.0, flat_shape)
@@ -6,17 +6,19 @@ import numpy as np


 class LeakyReLU(MaskedLayer):
-    def __init__(self, alpha=0.3):
-        super(LeakyReLU, self).__init__()
+    def __init__(self, alpha=0.3, **kwargs):
+        super(LeakyReLU, self).__init__(**kwargs)
        self.alpha = alpha

    def get_output(self, train):
        X = self.get_input(train)
-        return ((X + abs(X)) / 2.0) + self.alpha * ((X - abs(X)) / 2.0)
+        return T.nnet.relu(X, self.alpha)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "alpha": self.alpha}
+        config = {"name": self.__class__.__name__,
+                  "alpha": self.alpha}
+        base_config = super(LeakyReLU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class PReLU(MaskedLayer):
@@ -25,26 +27,31 @@ class PReLU(MaskedLayer):
            Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
                http://arxiv.org/pdf/1502.01852v1.pdf
    '''
-    def __init__(self, input_shape, init='zero', weights=None):
-        super(PReLU, self).__init__()
+    def __init__(self, init='zero', weights=None, **kwargs):
        self.init = initializations.get(init)
+        self.initial_weights = weights
+        super(PReLU, self).__init__(**kwargs)
+
+    def build(self):
+        input_shape = self.input_shape[1:]
        self.alphas = self.init(input_shape)
        self.params = [self.alphas]
-        self.input_shape = input_shape

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def get_output(self, train):
        X = self.get_input(train)
-        pos = ((X + abs(X)) / 2.0)
-        neg = self.alphas * ((X - abs(X)) / 2.0)
+        pos = T.nnet.relu(X)
+        neg = self.alphas * (X - abs(X)) * 0.5
        return pos + neg

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_shape": self.input_shape,
-                "init": self.init.__name__}
+        config = {"name": self.__class__.__name__,
+                  "init": self.init.__name__}
+        base_config = super(PReLU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class ParametricSoftplus(MaskedLayer):
@@ -55,28 +62,35 @@ class ParametricSoftplus(MaskedLayer):
            Inferring Nonlinear Neuronal Computation Based on Physiologically Plausible Inputs
            http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003143
    '''
-    def __init__(self, input_shape, alpha_init=0.2, beta_init=5.0, weights=None):
-
-        super(ParametricSoftplus, self).__init__()
+    def __init__(self, alpha_init=0.2, beta_init=5.0,
+                 weights=None, **kwargs):
        self.alpha_init = alpha_init
        self.beta_init = beta_init
-        self.alphas = sharedX(alpha_init * np.ones(input_shape))
-        self.betas = sharedX(beta_init * np.ones(input_shape))
+        self.initial_weights = weights
+        super(ParametricSoftplus, self).__init__(**kwargs)
+
+    def build(self):
+        input_shape = self.input_shape[1:]
+        self.alphas = sharedX(self.alpha_init * np.ones(input_shape))
+        self.betas = sharedX(self.beta_init * np.ones(input_shape))
        self.params = [self.alphas, self.betas]
        self.input_shape = input_shape

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def get_output(self, train):
        X = self.get_input(train)
        return T.nnet.softplus(self.betas * X) * self.alphas

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_shape": self.input_shape,
-                "alpha_init": self.alpha_init,
-                "beta_init": self.beta_init}
+        config = {"name": self.__class__.__name__,
+                  "alpha_init": self.alpha_init,
+                  "beta_init": self.beta_init}
+        base_config = super(ParametricSoftplus, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+

 class ThresholdedLinear(MaskedLayer):
    '''
@@ -86,19 +100,22 @@ class ThresholdedLinear(MaskedLayer):
            Zero-Bias Autoencoders and the Benefits of Co-Adapting Features
            http://arxiv.org/pdf/1402.3337.pdf
    '''
-    def __init__(self, theta=1.0):
-        super(ThresholdedLinear, self).__init__()
+    def __init__(self, theta=1.0, **kwargs):
+        super(ThresholdedLinear, self).__init__(**kwargs)
        self.theta = theta
-    
+
    def get_output(self, train):
        X = self.get_input(train)
-        return T.switch( abs(X) < self.theta, 0, X )
+        return T.switch(abs(X) < self.theta, 0, X)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-            "theta": self.theta}
+        config = {"name": self.__class__.__name__,
+                  "theta": self.theta}
+        base_config = super(ThresholdedLinear, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))

-class ThresholdedReLu(MaskedLayer):
+
+class ThresholdedReLU(MaskedLayer):
    '''
        Thresholded Rectified Activation

@@ -106,14 +123,16 @@ class ThresholdedReLu(MaskedLayer):
            Zero-Bias Autoencoders and the Benefits of Co-Adapting Features
            http://arxiv.org/pdf/1402.3337.pdf
    '''
-    def __init__(self, theta=1.0):
-        super(ThresholdedReLu, self).__init__()
+    def __init__(self, theta=1.0, **kwargs):
+        super(ThresholdedReLU, self).__init__(**kwargs)
        self.theta = theta
-    
+
    def get_output(self, train):
        X = self.get_input(train)
-        return T.switch( X > self.theta, X, 0 )
+        return T.switch(X > self.theta, X, 0)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-            "theta": self.theta}
+        config = {"name": self.__class__.__name__,
+                  "theta": self.theta}
+        base_config = super(ThresholdedReLU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -37,7 +37,6 @@ class Sequential(Layer):
            self.layers[-1].set_previous(self.layers[-2])
            if not hasattr(self.layers[0], 'input'):
                self.set_input()
-        layer.init_updates()

        params, regularizers, constraints, updates = layer.get_params()
        self.params += params
@@ -45,6 +44,10 @@ class Sequential(Layer):
        self.constraints += constraints
        self.updates += updates

+    @property
+    def output_shape(self):  # output shape of the container as a whole
+        return self.layers[-1].output_shape  # delegate to the last entry of self.layers
+
    def get_output(self, train=False):
        return self.layers[-1].get_output(train)

@@ -80,6 +83,9 @@ class Sequential(Layer):
        return {"name": self.__class__.__name__,
                "layers": [layer.get_config() for layer in self.layers]}

+    def count_params(self):  # sum of count_params() over every layer in self.layers
+        return sum(member.count_params() for member in self.layers)
+

 class Graph(Layer):
    '''
@@ -145,18 +151,29 @@ class Graph(Layer):
    def input(self):
        return self.get_input()

+    @property
+    def output_shape(self):
+        if self.nb_output != 1:
+            # several outputs: dictionary mapping each output name to its shape tuple
+            return dict((name, node.output_shape) for name, node in self.outputs.items())
+        else:
+            # exactly one output: return that node's shape tuple directly
+            return self.outputs[self.output_order[0]].output_shape
+
    def get_output(self, train=False):
        if len(self.inputs) == len(self.outputs) == 1:
            return self.outputs[self.output_order[0]].get_output(train)
        else:
            return dict([(k, v.get_output(train)) for k, v in self.outputs.items()])

-    def add_input(self, name, ndim=2, dtype='float'):
+    def add_input(self, name, input_shape, dtype='float'):
        if name in self.namespace:
            raise Exception('Duplicate node identifier: ' + name)
        self.namespace.add(name)
        self.input_order.append(name)
        layer = Layer()  # empty layer
+        layer.set_input_shape(input_shape)
+        ndim = len(input_shape) + 1
        if dtype == 'float':
            layer.input = ndim_tensor(ndim)
        else:
@@ -166,9 +183,12 @@ class Graph(Layer):
                raise Exception('Type "int" can only be used with ndim==2 (Embedding).')
        layer.input.name = name
        self.inputs[name] = layer
-        self.input_config.append({'name': name, 'ndim': ndim, 'dtype': dtype})
+        self.input_config.append({'name': name,
+                                  'input_shape': input_shape,
+                                  'dtype': dtype})

-    def add_node(self, layer, name, input=None, inputs=[], merge_mode='concat', create_output=False):
+    def add_node(self, layer, name, input=None, inputs=[],
+                 merge_mode='concat', concat_axis=-1, create_output=False):
        if hasattr(layer, 'set_name'):
            layer.set_name(name)
        if name in self.namespace:
@@ -189,7 +209,7 @@ class Graph(Layer):
                    to_merge.append(self.inputs[n])
                else:
                    raise Exception('Unknown identifier: ' + n)
-            merge = Merge(to_merge, mode=merge_mode)
+            merge = Merge(to_merge, mode=merge_mode, concat_axis=concat_axis)
            layer.set_previous(merge)

        self.namespace.add(name)
@@ -197,8 +217,9 @@ class Graph(Layer):
        self.node_config.append({'name': name,
                                 'input': input,
                                 'inputs': inputs,
-                                 'merge_mode': merge_mode})
-        layer.init_updates()
+                                 'merge_mode': merge_mode,
+                                 'concat_axis': concat_axis,
+                                 'create_output': create_output})
        params, regularizers, constraints, updates = layer.get_params()
        self.params += params
        self.regularizers += regularizers
@@ -208,7 +229,8 @@ class Graph(Layer):
        if create_output:
            self.add_output(name, input=name)

-    def add_output(self, name, input=None, inputs=[], merge_mode='concat'):
+    def add_output(self, name, input=None, inputs=[],
+                   merge_mode='concat', concat_axis=-1):
        if name in self.output_order:
            raise Exception('Duplicate output identifier: ' + name)
        if input:
@@ -224,14 +246,15 @@ class Graph(Layer):
                if n not in self.nodes:
                    raise Exception('Unknown identifier: ' + n)
                to_merge.append(self.nodes[n])
-            merge = Merge(to_merge, mode=merge_mode)
+            merge = Merge(to_merge, mode=merge_mode, concat_axis=concat_axis)
            self.outputs[name] = merge

        self.output_order.append(name)
        self.output_config.append({'name': name,
                                   'input': input,
                                   'inputs': inputs,
-                                   'merge_mode': merge_mode})
+                                   'merge_mode': merge_mode,
+                                   'concat_axis': concat_axis})

    def get_config(self):
        return {"name": self.__class__.__name__,
@@ -241,3 +264,6 @@ class Graph(Layer):
                "input_order": self.input_order,
                "output_order": self.output_order,
                "nodes": dict([(c["name"], self.nodes[c["name"]].get_config()) for c in self.node_config])}
+
+    def count_params(self):  # sum of count_params() over every node in self.nodes
+        return sum(node.count_params() for node in self.nodes.values())
@@ -3,153 +3,241 @@ from __future__ import absolute_import

 import theano
 import theano.tensor as T
-from theano.sandbox.cuda import dnn
+from theano.tensor.signal import downsample

 from .. import activations, initializations, regularizers, constraints
-from ..utils.theano_utils import shared_zeros
+from ..utils.theano_utils import shared_zeros, on_gpu
 from ..layers.core import Layer

+if on_gpu():
+    from theano.sandbox.cuda import dnn
+
+
+def conv_output_length(input_length, filter_size, border_mode, stride):
+    if input_length is None:  # an unknown input length yields an unknown output length
+        return None
+    assert border_mode in {'same', 'full', 'valid'}
+    if border_mode == 'valid':
+        unstrided = input_length - filter_size + 1
+    elif border_mode == 'full':
+        unstrided = input_length + filter_size - 1
+    elif border_mode == 'same':
+        unstrided = input_length
+    return (unstrided + stride - 1) // stride  # ceiling division by the stride
+
+
+def pool_output_length(input_length, pool_size, ignore_border, stride):  # output length along one pooled axis
+    if input_length is None:  # an unknown input length yields an unknown output length
+        return None
+    if ignore_border:
+        output_length = input_length - pool_size + 1
+        output_length = (output_length + stride - 1) // stride  # ceiling division by the stride
+    else:
+        if pool_size == input_length:
+            output_length = min(input_length, stride - stride % 2)  # NOTE(review): rationale for this special case is not visible here; confirm against the pooling op's own shape rules
+            if output_length <= 0:
+                output_length = 1  # clamp to a minimum of one output
+        elif stride >= pool_size:
+            output_length = (input_length + stride - 1) // stride  # ceiling division by the stride
+        else:
+            output_length = (input_length - pool_size + stride - 1) // stride
+            if output_length <= 0:
+                output_length = 1  # clamp to a minimum of one output
+            else:
+                output_length += 1
+    return output_length
+

 class Convolution1D(Layer):
-    def __init__(self, input_dim, nb_filter, filter_length,
+    input_ndim = 3
+
+    def __init__(self, nb_filter, filter_length,
                 init='uniform', activation='linear', weights=None,
                 border_mode='valid', subsample_length=1,
                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
-                 W_constraint=None, b_constraint=None):
+                 W_constraint=None, b_constraint=None, input_dim=None, input_length=None, **kwargs):

        if border_mode not in {'valid', 'full', 'same'}:
            raise Exception('Invalid border mode for Convolution1D:', border_mode)
-
-        super(Convolution1D, self).__init__()
        self.nb_filter = nb_filter
-        self.input_dim = input_dim
        self.filter_length = filter_length
-        self.subsample_length = subsample_length
        self.init = initializations.get(init)
        self.activation = activations.get(activation)
-        self.subsample = (1, subsample_length)
        self.border_mode = border_mode
+        self.subsample_length = subsample_length

-        self.input = T.tensor3()
-        self.W_shape = (nb_filter, input_dim, filter_length, 1)
-        self.W = self.init(self.W_shape)
-        self.b = shared_zeros((nb_filter,))
-
-        self.params = [self.W, self.b]
-
-        self.regularizers = []
+        self.subsample = (subsample_length, 1)

        self.W_regularizer = regularizers.get(W_regularizer)
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
        self.b_regularizer = regularizers.get(b_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        self.activity_regularizer = regularizers.get(activity_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.constraints = [self.W_constraint, self.b_constraint]

-        if weights is not None:
-            self.set_weights(weights)
+        self.initial_weights = weights

-    def get_output(self, train):
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(Convolution1D, self).__init__(**kwargs)
+
+    def build(self):  # create W and b from self.input_shape, register regularizers, apply initial weights
+        input_dim = self.input_shape[2]  # third entry of the layer's input shape
+        self.input = T.tensor3()
+        self.W_shape = (self.nb_filter, input_dim, self.filter_length, 1)
+        self.W = self.init(self.W_shape)
+        self.b = shared_zeros((self.nb_filter,))  # one bias entry per filter
+        self.params = [self.W, self.b]
+        self.regularizers = []
+        # Bind each configured regularizer to its target, then register it.
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+        # Weights passed to the constructor are applied only now that W and b exist.
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights  # drop the attribute once the weights have been applied
+
+    @property
+    def output_shape(self):
+        leading, steps = self.input_shape[0], self.input_shape[1]  # entry 0 passes through; entry 1 is the convolved length
+        return (leading, conv_output_length(steps, self.filter_length, self.border_mode, self.subsample[0]), self.nb_filter)
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        X = T.reshape(X, (X.shape[0], X.shape[1], X.shape[2], 1)).dimshuffle(0, 2, 1, 3)

        border_mode = self.border_mode
-        if border_mode == 'same':
-            border_mode = 'full'
+        if on_gpu() and dnn.dnn_available():
+            if border_mode == 'same':
+                assert(self.subsample_length == 1)
+                pad_x = (self.filter_length - self.subsample_length) // 2
+                conv_out = dnn.dnn_conv(img=X,
+                                        kerns=self.W,
+                                        border_mode=(pad_x, 0))
+            else:
+                conv_out = dnn.dnn_conv(img=X,
+                                        kerns=self.W,
+                                        border_mode=border_mode,
+                                        subsample=self.subsample)
+        else:
+            if border_mode == 'same':
+                assert(self.subsample_length == 1)
+                border_mode = 'full'

-        conv_out = T.nnet.conv.conv2d(X, self.W, border_mode=border_mode, subsample=self.subsample)
-        if self.border_mode == 'same':
-            shift_x = (self.filter_length - 1) // 2
-            conv_out = conv_out[:, :, shift_x:X.shape[2] + shift_x, :]
+            input_shape = self.input_shape
+            image_shape = (input_shape[0], input_shape[2], input_shape[1], 1)
+            conv_out = T.nnet.conv.conv2d(X, self.W,
+                                          border_mode=border_mode,
+                                          subsample=self.subsample,
+                                          image_shape=image_shape,
+                                          filter_shape=self.W_shape)
+            if self.border_mode == 'same':
+                shift_x = (self.filter_length - 1) // 2
+                conv_out = conv_out[:, :, shift_x:X.shape[2] + shift_x, :]

        output = self.activation(conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))
        output = T.reshape(output, (output.shape[0], output.shape[1], output.shape[2])).dimshuffle(0, 2, 1)
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "nb_filter": self.nb_filter,
-                "filter_length": self.filter_length,
-                "init": self.init.__name__,
-                "activation": self.activation.__name__,
-                "border_mode": self.border_mode,
-                "subsample_length": self.subsample_length,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
-                "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "nb_filter": self.nb_filter,
+                  "filter_length": self.filter_length,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__,
+                  "border_mode": self.border_mode,
+                  "subsample_length": self.subsample_length,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(Convolution1D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Convolution2D(Layer):
-    def __init__(self, nb_filter, stack_size, nb_row, nb_col,
+    input_ndim = 4
+
+    def __init__(self, nb_filter, nb_row, nb_col,
                 init='glorot_uniform', activation='linear', weights=None,
                 border_mode='valid', subsample=(1, 1),
                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
-                 W_constraint=None, b_constraint=None):
+                 W_constraint=None, b_constraint=None, **kwargs):

        if border_mode not in {'valid', 'full', 'same'}:
            raise Exception('Invalid border mode for Convolution2D:', border_mode)
-
-        super(Convolution2D, self).__init__()
-        self.init = initializations.get(init)
-        self.activation = activations.get(activation)
-        self.subsample = subsample
-        self.border_mode = border_mode
        self.nb_filter = nb_filter
-        self.stack_size = stack_size
-
        self.nb_row = nb_row
        self.nb_col = nb_col
-
-        self.input = T.tensor4()
-        self.W_shape = (nb_filter, stack_size, nb_row, nb_col)
-        self.W = self.init(self.W_shape)
-        self.b = shared_zeros((nb_filter,))
-
-        self.params = [self.W, self.b]
-
-        self.regularizers = []
+        self.init = initializations.get(init)
+        self.activation = activations.get(activation)
+        self.border_mode = border_mode
+        self.subsample = tuple(subsample)

        self.W_regularizer = regularizers.get(W_regularizer)
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
        self.b_regularizer = regularizers.get(b_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        self.activity_regularizer = regularizers.get(activity_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.constraints = [self.W_constraint, self.b_constraint]

-        if weights is not None:
-            self.set_weights(weights)
+        self.initial_weights = weights
+        super(Convolution2D, self).__init__(**kwargs)

-    def get_output(self, train):
+    def build(self):  # create W and b from self.input_shape, register regularizers, apply initial weights
+        stack_size = self.input_shape[1]  # second entry of the layer's input shape
+        self.input = T.tensor4()
+        self.W_shape = (self.nb_filter, stack_size, self.nb_row, self.nb_col)
+        self.W = self.init(self.W_shape)
+        self.b = shared_zeros((self.nb_filter,))  # one bias entry per filter
+        self.params = [self.W, self.b]
+        self.regularizers = []
+        # Bind each configured regularizer to its target, then register it.
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+        # Weights passed to the constructor are applied only now that W and b exist.
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights  # drop the attribute once the weights have been applied
+
+    @property
+    def output_shape(self):
+        # Entries 2 and 3 of the input shape are convolved; entry 0 is passed through unchanged.
+        shape = self.input_shape
+        stride_rows, stride_cols = self.subsample[0], self.subsample[1]
+        out_rows = conv_output_length(shape[2], self.nb_row, self.border_mode, stride_rows)
+        out_cols = conv_output_length(shape[3], self.nb_col, self.border_mode, stride_cols)
+        return (shape[0], self.nb_filter, out_rows, out_cols)
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        border_mode = self.border_mode
-        if dnn.dnn_available() and theano.config.device[:3] == 'gpu':
+        if on_gpu() and dnn.dnn_available():
            if border_mode == 'same':
                assert(self.subsample == (1, 1))
                pad_x = (self.nb_row - self.subsample[0]) // 2
@@ -165,10 +253,13 @@ class Convolution2D(Layer):
        else:
            if border_mode == 'same':
                border_mode = 'full'
+                assert(self.subsample == (1, 1))

            conv_out = T.nnet.conv.conv2d(X, self.W,
                                          border_mode=border_mode,
-                                          subsample=self.subsample)
+                                          subsample=self.subsample,
+                                          image_shape=self.input_shape,
+                                          filter_shape=self.W_shape)
            if self.border_mode == 'same':
                shift_x = (self.nb_row - 1) // 2
                shift_y = (self.nb_col - 1) // 2
@@ -177,118 +268,237 @@ class Convolution2D(Layer):
        return self.activation(conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "nb_filter": self.nb_filter,
-                "stack_size": self.stack_size,
-                "nb_row": self.nb_row,
-                "nb_col": self.nb_col,
-                "init": self.init.__name__,
-                "activation": self.activation.__name__,
-                "border_mode": self.border_mode,
-                "subsample": self.subsample,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
-                "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "nb_filter": self.nb_filter,
+                  "nb_row": self.nb_row,
+                  "nb_col": self.nb_col,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__,
+                  "border_mode": self.border_mode,
+                  "subsample": self.subsample,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        base_config = super(Convolution2D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class MaxPooling1D(Layer):
-    def __init__(self, pool_length=2, stride=None, ignore_border=True):
-        super(MaxPooling1D, self).__init__()
+    input_ndim = 3
+
+    def __init__(self, pool_length=2, stride=None, ignore_border=True, **kwargs):
+        super(MaxPooling1D, self).__init__(**kwargs)
+        if stride is None:
+            stride = pool_length
        self.pool_length = pool_length
        self.stride = stride
-        if self.stride:
-            self.st = (self.stride, 1)
-        else:
-            self.st = None
+        self.st = (self.stride, 1)

        self.input = T.tensor3()
-        self.poolsize = (pool_length, 1)
+        self.pool_size = (pool_length, 1)
        self.ignore_border = ignore_border

-    def get_output(self, train):
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        length = pool_output_length(input_shape[1], self.pool_length, self.ignore_border, self.stride)
+        return (input_shape[0], length, input_shape[2])
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        X = T.reshape(X, (X.shape[0], X.shape[1], X.shape[2], 1)).dimshuffle(0, 2, 1, 3)
-        output = T.signal.downsample.max_pool_2d(X, ds=self.poolsize, st=self.st, ignore_border=self.ignore_border)
+        output = downsample.max_pool_2d(X, ds=self.pool_size, st=self.st, ignore_border=self.ignore_border)
        output = output.dimshuffle(0, 2, 1, 3)
        return T.reshape(output, (output.shape[0], output.shape[1], output.shape[2]))

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "stride": self.stride,
-                "pool_length": self.pool_length,
-                "ignore_border": self.ignore_border}
+        config = {"name": self.__class__.__name__,
+                  "stride": self.stride,
+                  "pool_length": self.pool_length,
+                  "ignore_border": self.ignore_border}
+        base_config = super(MaxPooling1D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class MaxPooling2D(Layer):
-    def __init__(self, poolsize=(2, 2), stride=None, ignore_border=True):
-        super(MaxPooling2D, self).__init__()
+    input_ndim = 4
+
+    def __init__(self, pool_size=(2, 2), stride=None, ignore_border=True, **kwargs):
+        super(MaxPooling2D, self).__init__(**kwargs)
        self.input = T.tensor4()
-        self.poolsize = poolsize
-        self.stride = stride
+        self.pool_size = tuple(pool_size)
+        if stride is None:
+            stride = self.pool_size
+        self.stride = tuple(stride)
        self.ignore_border = ignore_border

-    def get_output(self, train):
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        rows = pool_output_length(input_shape[2], self.pool_size[0], self.ignore_border, self.stride[0])
+        cols = pool_output_length(input_shape[3], self.pool_size[1], self.ignore_border, self.stride[1])
+        return (input_shape[0], input_shape[1], rows, cols)
+
+    def get_output(self, train=False):
        X = self.get_input(train)
-        output = T.signal.downsample.max_pool_2d(X, ds=self.poolsize, st=self.stride, ignore_border=self.ignore_border)
+        output = downsample.max_pool_2d(X, ds=self.pool_size, st=self.stride, ignore_border=self.ignore_border)
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "poolsize": self.poolsize,
-                "ignore_border": self.ignore_border,
-                "stride": self.stride}
+        config = {"name": self.__class__.__name__,
+                  "pool_size": self.pool_size,
+                  "ignore_border": self.ignore_border,
+                  "stride": self.stride}
+        base_config = super(MaxPooling2D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class UpSample1D(Layer):
-    def __init__(self, length=2):
-        super(UpSample1D, self).__init__()
+    input_ndim = 3
+
+    def __init__(self, length=2, **kwargs):
+        super(UpSample1D, self).__init__(**kwargs)
        self.length = length
        self.input = T.tensor3()

-    def get_output(self, train):
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], self.length * input_shape[1], input_shape[2])
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        output = theano.tensor.extra_ops.repeat(X, self.length, axis=1)
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "length": self.length}
+        config = {"name": self.__class__.__name__,
+                  "length": self.length}
+        base_config = super(UpSample1D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class UpSample2D(Layer):
-    def __init__(self, size=(2, 2)):
-        super(UpSample2D, self).__init__()
-        self.input = T.tensor4()
-        self.size = size
+    input_ndim = 4

-    def get_output(self, train):
+    def __init__(self, size=(2, 2), **kwargs):
+        super(UpSample2D, self).__init__(**kwargs)
+        self.input = T.tensor4()
+        self.size = tuple(size)
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], input_shape[1], self.size[0] * input_shape[2], self.size[1] * input_shape[3])
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        Y = theano.tensor.extra_ops.repeat(X, self.size[0], axis=2)
        output = theano.tensor.extra_ops.repeat(Y, self.size[1], axis=3)
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "size": self.size}
+        config = {"name": self.__class__.__name__,
+                  "size": self.size}
+        base_config = super(UpSample2D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+class ZeroPadding1D(Layer):
+    """Zero-padding layer for 1D input (e.g. temporal sequence).
+
+    Input shape
+    -----------
+    3D tensor with shape (samples, axis_to_pad, features)
+
+    Output shape
+    ------------
+    3D tensor with shape (samples, padded_axis, features)
+
+    Arguments
+    ---------
+    padding: int
+        How many zeros to add at the beginning and end of
+        the padding dimension (axis 1).
+    """
+    input_ndim = 3
+
+    def __init__(self, padding=1, **kwargs):
+        super(ZeroPadding1D, self).__init__(**kwargs)
+        self.padding = padding
+        self.input = T.tensor3()
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], input_shape[1] + self.padding * 2, input_shape[2])
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        input_shape = X.shape
+        output_shape = (input_shape[0],
+                        input_shape[1] + 2 * self.padding,
+                        input_shape[2])
+        output = T.zeros(output_shape)
+        return T.set_subtensor(output[:, self.padding:X.shape[1] + self.padding, :], X)
+
+    def get_config(self):
+        config = {"name": self.__class__.__name__,
+                  "padding": self.padding}
+        base_config = super(ZeroPadding1D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class ZeroPadding2D(Layer):
-    def __init__(self, pad=(1, 1)):
-        super(ZeroPadding2D, self).__init__()
-        self.pad = pad
+    """Zero-padding layer for 2D input (e.g. picture).
+
+    Input shape
+    -----------
+    4D tensor with shape (samples, depth, first_axis_to_pad, second_axis_to_pad)
+
+    Output shape
+    ------------
+    4D tensor with shape (samples, depth, first_padded_axis, second_padded_axis)
+
+    Arguments
+    ---------
+    padding: tuple of int (length 2)
+        How many zeros to add at the beginning and end of
+        the 2 padding dimensions (axes 2 and 3).
+    """
+    input_ndim = 4
+
+    def __init__(self, padding=(1, 1), **kwargs):
+        super(ZeroPadding2D, self).__init__(**kwargs)
+        self.padding = tuple(padding)
        self.input = T.tensor4()

-    def get_output(self, train):
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0],
+                input_shape[1],
+                input_shape[2] + 2 * self.padding[0],
+                input_shape[3] + 2 * self.padding[1])
+
+    def get_output(self, train=False):
        X = self.get_input(train)
-        pad = self.pad
-        in_shape = X.shape
-        out_shape = (in_shape[0], in_shape[1], in_shape[2] + 2 * pad[0], in_shape[3] + 2 * pad[1])
-        out = T.zeros(out_shape)
-        indices = (slice(None), slice(None), slice(pad[0], in_shape[2] + pad[0]), slice(pad[1], in_shape[3] + pad[1]))
-        return T.set_subtensor(out[indices], X)
+        input_shape = X.shape
+        output_shape = (input_shape[0],
+                        input_shape[1],
+                        input_shape[2] + 2 * self.padding[0],
+                        input_shape[3] + 2 * self.padding[1])
+        output = T.zeros(output_shape)
+        indices = (slice(None),
+                   slice(None),
+                   slice(self.padding[0], input_shape[2] + self.padding[0]),
+                   slice(self.padding[1], input_shape[3] + self.padding[1]))
+        return T.set_subtensor(output[indices], X)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "pad": self.pad}
+        config = {"name": self.__class__.__name__,
+                  "padding": self.padding}
+        base_config = super(ZeroPadding2D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -5,8 +5,11 @@ import theano
 import theano.tensor as T
 import numpy as np

+from collections import OrderedDict
+import copy
+
 from .. import activations, initializations, regularizers, constraints
-from ..utils.theano_utils import shared_zeros, floatX
+from ..utils.theano_utils import shared_zeros, floatX, ndim_tensor
 from ..utils.generic_utils import make_tuple
 from ..regularizers import ActivityRegularizer, Regularizer

@@ -15,17 +18,32 @@ from six.moves import zip


 class Layer(object):
-    def __init__(self):
-        self.params = []
-
-    def init_updates(self):
-        self.updates = []
+    def __init__(self, **kwargs):
+        for kwarg in kwargs:
+            assert kwarg in {'input_shape'}, "Keyword argument not understood: " + kwarg
+        if 'input_shape' in kwargs:
+            self.set_input_shape(kwargs['input_shape'])
+        if not hasattr(self, 'params'):
+            self.params = []

    def set_previous(self, layer, connection_map={}):
        assert self.nb_input == layer.nb_output == 1, "Cannot connect layers: input count and output count should be 1."
-        if not self.supports_masked_input() and layer.get_output_mask() is not None:
-            raise Exception("Cannot connect non-masking layer to layer with masked output")
+        if hasattr(self, 'input_ndim'):
+            assert self.input_ndim == len(layer.output_shape), "Incompatible shapes: layer expected input with ndim=" +\
+                str(self.input_ndim) + " but previous layer has output_shape " + str(layer.output_shape)
+        if layer.get_output_mask() is not None:
+            assert self.supports_masked_input(), "Cannot connect non-masking layer to layer with masked output"
        self.previous = layer
+        self.build()
+
+    def build(self):
+        '''Instantiation of layer weights.
+
+        Called after `set_previous`, or after `set_input_shape`,
+        once the layer has a defined input shape.
+        Must be implemented on all layers that have weights.
+        '''
+        pass

    @property
    def nb_input(self):
@@ -35,14 +53,45 @@ class Layer(object):
    def nb_output(self):
        return 1

+    @property
+    def input_shape(self):
+        # if layer is not connected (e.g. input layer),
+        # input shape can be set manually via _input_shape attribute.
+        if hasattr(self, 'previous'):
+            return self.previous.output_shape
+        elif hasattr(self, '_input_shape'):
+            return self._input_shape
+        else:
+            raise Exception('Layer is not connected. Did you forget to set "input_shape"?')
+
+    def set_input_shape(self, input_shape):  # input_shape excludes the batch (samples) axis
+        if not isinstance(input_shape, (tuple, list)):  # also accepts tuple/list subclasses
+            raise Exception('Invalid input shape - input_shape should be a tuple of int.')
+        input_shape = (None,) + tuple(input_shape)  # prepend the unspecified batch dimension
+        if hasattr(self, 'input_ndim') and self.input_ndim:
+            if self.input_ndim != len(input_shape):
+                raise Exception('Invalid input shape - Layer expects input ndim=' +
+                                str(self.input_ndim) + ', was provided with input shape ' + str(input_shape))
+        self._input_shape = input_shape
+        self.input = ndim_tensor(len(self._input_shape))  # input placeholder of matching rank
+        self.build()  # the input shape is now defined, so weights can be instantiated
+
+    @property
+    def output_shape(self):
+        # default assumption: tensor shape unchanged.
+        return self.input_shape
+
    def get_output(self, train=False):
        return self.get_input(train)

    def get_input(self, train=False):
        if hasattr(self, 'previous'):
            return self.previous.get_output(train=train)
-        else:
+        elif hasattr(self, 'input'):  # unconnected layer that owns an input tensor
            return self.input
+        else:  # neither connected to a previous layer nor given an input
+            raise Exception('Layer is not connected '
+                            'and is not an input layer.')

    def supports_masked_input(self):
        ''' Whether or not this layer respects the output mask of its previous layer in its calculations. If you try
@@ -67,6 +116,8 @@ class Layer(object):
        return None

    def set_weights(self, weights):
+        assert len(self.params) == len(weights), 'Provided weight array does not match layer weights (' + \
+            str(len(self.params)) + ' layer params vs. ' + str(len(weights)) + ' provided weights)'
        for p, w in zip(self.params, weights):
            if p.eval().shape != w.shape:
                raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape))
@@ -79,7 +130,10 @@ class Layer(object):
        return weights

    def get_config(self):
-        return {"name": self.__class__.__name__}
+        config = {"name": self.__class__.__name__}
+        if hasattr(self, '_input_shape'):
+            config['input_shape'] = self._input_shape[1:]
+        return config

    def get_params(self):
        consts = []
@@ -110,6 +164,9 @@ class Layer(object):
        for i in range(len(self.params)):
            self.params[i].name = '%s_p%d' % (name, i)

+    def count_params(self):
+        return sum([np.prod(p.shape.eval()) for p in self.params])
+

 class MaskedLayer(Layer):
    '''
@@ -142,8 +199,8 @@ class Masking(MaskedLayer):
    otherwise it is 1.

    """
-    def __init__(self, mask_value=0.):
-        super(Masking, self).__init__()
+    def __init__(self, mask_value=0., **kwargs):
+        super(Masking, self).__init__(**kwargs)
        self.mask_value = mask_value
        self.input = T.tensor3()

@@ -156,18 +213,64 @@ class Masking(MaskedLayer):
        return X * T.shape_padright(T.any((1. - T.eq(X, self.mask_value)), axis=-1))

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "mask_value": self.mask_value}
+        config = {"name": self.__class__.__name__,
+                  "mask_value": self.mask_value}
+        base_config = super(Masking, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+class TimeDistributedMerge(Layer):
+    '''Sum/multiply/average over the outputs of a TimeDistributed layer.
+
+    mode: {'sum', 'mul', 'ave'} -- reduction applied along the time axis (axis 1)
+    Tensor input dimensions:   (nb_sample, time, features)
+    Tensor output dimensions:  (nb_sample, features)
+    '''
+    input_ndim = 3
+
+    def __init__(self, mode='sum', **kwargs):
+        super(TimeDistributedMerge, self).__init__(**kwargs)
+        self.mode = mode
+        self.params = []  # this layer has no trainable weights
+        self.regularizers = []
+        self.constraints = []
+        self.updates = []
+
+    @property
+    def output_shape(self):
+        return (None, self.input_shape[2])  # the time axis is reduced away
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        if self.mode == 'sum' or self.mode == 'ave':
+            s = theano.tensor.sum(X, axis=1)
+            if self.mode == 'ave':
+                s /= X.shape[1]  # sum -> mean over the time axis
+            return s
+        elif self.mode == 'mul':
+            s = theano.tensor.prod(X, axis=1)  # reduce over time; elementwise mul() takes no axis
+            return s
+        else:
+            raise Exception('Unknown merge mode')
+
+    def get_config(self):
+        config = {"name": self.__class__.__name__,
+                  "mode": self.mode}
+        base_config = super(TimeDistributedMerge, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))  # subclass keys override base keys


 class Merge(Layer):
-    def __init__(self, layers, mode='sum'):
+    def __init__(self, layers, mode='sum', concat_axis=-1):
        ''' Merge the output of a list of layers or containers into a single tensor.
-            mode: {'sum', 'concat'}
+            mode: {'sum', 'mul', 'concat', 'ave'}
        '''
        if len(layers) < 2:
            raise Exception("Please specify two or more input layers (or containers) to merge")
+        if mode not in {'sum', 'mul', 'concat', 'ave'}:
+            raise Exception("Invalid merge mode: " + str(mode))
        self.mode = mode
+        self.concat_axis = concat_axis
        self.layers = layers
        self.params = []
        self.regularizers = []
@@ -183,18 +286,45 @@ class Merge(Layer):
                    self.params.append(p)
                    self.constraints.append(c)

+    @property
+    def output_shape(self):
+        input_shapes = [layer.output_shape for layer in self.layers]
+        if self.mode in ['sum', 'mul', 'ave']:
+            return input_shapes[0]
+        elif self.mode == 'concat':
+            output_shape = list(input_shapes[0])
+            for shape in input_shapes[1:]:
+                output_shape[self.concat_axis] += shape[self.concat_axis]
+            return tuple(output_shape)
+
    def get_params(self):
        return self.params, self.regularizers, self.constraints, self.updates

    def get_output(self, train=False):
-        if self.mode == 'sum':
+        if self.mode == 'sum' or self.mode == 'ave':
            s = self.layers[0].get_output(train)
            for i in range(1, len(self.layers)):
                s += self.layers[i].get_output(train)
+            if self.mode == 'ave':
+                s /= len(self.layers)
            return s
        elif self.mode == 'concat':
            inputs = [self.layers[i].get_output(train) for i in range(len(self.layers))]
-            return T.concatenate(inputs, axis=-1)
+            return T.concatenate(inputs, axis=self.concat_axis)
+        elif self.mode == 'join':
+            inputs = OrderedDict()
+            for i in range(len(self.layers)):
+                X = self.layers[i].get_output(train)
+                if X.name is None:
+                    raise ValueError("merge_mode='join' only works with named inputs")
+                else:
+                    inputs[X.name] = self.layers[i].get_output(train)
+            return inputs
+        elif self.mode == 'mul':
+            s = self.layers[0].get_output(train)
+            for i in range(1, len(self.layers)):
+                s *= self.layers[i].get_output(train)
+            return s
        else:
            raise Exception('Unknown merge mode')

@@ -232,17 +362,20 @@ class Merge(Layer):
            weights = weights[nb_param:]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "layers": [l.get_config() for l in self.layers],
-                "mode": self.mode}
+        config = {"name": self.__class__.__name__,
+                  "layers": [l.get_config() for l in self.layers],
+                  "mode": self.mode,
+                  "concat_axis": self.concat_axis}
+        base_config = super(Merge, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Dropout(MaskedLayer):
    '''
        Hinton's dropout.
    '''
-    def __init__(self, p):
-        super(Dropout, self).__init__()
+    def __init__(self, p, **kwargs):
+        super(Dropout, self).__init__(**kwargs)
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

@@ -257,16 +390,18 @@ class Dropout(MaskedLayer):
        return X

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "p": self.p}
+        config = {"name": self.__class__.__name__,
+                  "p": self.p}
+        base_config = super(Dropout, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Activation(MaskedLayer):
    '''
        Apply an activation function to an output.
    '''
-    def __init__(self, activation, target=0, beta=0.1):
-        super(Activation, self).__init__()
+    def __init__(self, activation, target=0, beta=0.1, **kwargs):
+        super(Activation, self).__init__(**kwargs)
        self.activation = activations.get(activation)
        self.target = target
        self.beta = beta
@@ -276,10 +411,12 @@ class Activation(MaskedLayer):
        return self.activation(X)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "activation": self.activation.__name__,
-                "target": self.target,
-                "beta": self.beta}
+        config = {"name": self.__class__.__name__,
+                  "activation": self.activation.__name__,
+                  "target": self.target,
+                  "beta": self.beta}
+        base_config = super(Activation, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Reshape(Layer):
@@ -288,35 +425,52 @@ class Reshape(Layer):
        Can't be used as first layer in a model (no fixed input!)
        First dimension is assumed to be nb_samples.
    '''
-    def __init__(self, *dims):
-        super(Reshape, self).__init__()
-        self.dims = dims
+    def __init__(self, dims, **kwargs):
+        super(Reshape, self).__init__(**kwargs)
+        self.dims = tuple(dims)
+
+    @property
+    def output_shape(self):
+        return (self.input_shape[0],) + self.dims

    def get_output(self, train=False):
        X = self.get_input(train)
-        nshape = make_tuple(X.shape[0], *self.dims)
-        return theano.tensor.reshape(X, nshape)
+        new_shape = (X.shape[0],) + self.dims
+        return theano.tensor.reshape(X, new_shape)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "dims": self.dims}
+        config = {"name": self.__class__.__name__,
+                  "dims": self.dims}
+        base_config = super(Reshape, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Permute(Layer):
    '''
-        Permute the dimensions of the data according to the given tuple
+        Permute the dimensions of the input according to the given tuple.
    '''
-    def __init__(self, dims):
-        super(Permute, self).__init__()
-        self.dims = dims
+    def __init__(self, dims, **kwargs):
+        super(Permute, self).__init__(**kwargs)
+        self.dims = tuple(dims)

-    def get_output(self, train):
+    @property
+    def output_shape(self):
+        input_shape = list(self.input_shape)
+        output_shape = copy.copy(input_shape)
+        for i, dim in enumerate(self.dims):
+            target_dim = input_shape[dim]
+            output_shape[i+1] = target_dim
+        return tuple(output_shape)
+
+    def get_output(self, train=False):
        X = self.get_input(train)
        return X.dimshuffle((0,) + self.dims)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "dims": self.dims}
+        config = {"name": self.__class__.__name__,
+                  "dims": self.dims}
+        base_config = super(Permute, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Flatten(Layer):
@@ -324,8 +478,13 @@ class Flatten(Layer):
        Reshape input to flat shape.
        First dimension is assumed to be nb_samples.
    '''
-    def __init__(self):
-        super(Flatten, self).__init__()
+    def __init__(self, **kwargs):
+        super(Flatten, self).__init__(**kwargs)
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], np.prod(input_shape[1:]))

    def get_output(self, train=False):
        X = self.get_input(train)
@@ -341,10 +500,15 @@ class RepeatVector(Layer):
        Dimensions of input are assumed to be (nb_samples, dim).
        Return tensor of shape (nb_samples, n, dim).
    '''
-    def __init__(self, n):
-        super(RepeatVector, self).__init__()
+    def __init__(self, n, **kwargs):
+        super(RepeatVector, self).__init__(**kwargs)
        self.n = n

+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], self.n, input_shape[1])
+
    def get_output(self, train=False):
        X = self.get_input(train)
        tensors = [X]*self.n
@@ -352,59 +516,69 @@ class RepeatVector(Layer):
        return stacked.dimshuffle((1, 0, 2))

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "n": self.n}
+        config = {"name": self.__class__.__name__,
+                  "n": self.n}
+        base_config = super(RepeatVector, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class Dense(Layer):
    '''
        Just your regular fully connected NN layer.
    '''
-    def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None, name=None,
-                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
-                 W_constraint=None, b_constraint=None):
+    input_ndim = 2

-        super(Dense, self).__init__()
+    def __init__(self, output_dim, init='glorot_uniform', activation='linear', weights=None,
+                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+                 W_constraint=None, b_constraint=None, input_dim=None, **kwargs):
        self.init = initializations.get(init)
        self.activation = activations.get(activation)
-        self.input_dim = input_dim
        self.output_dim = output_dim

-        self.input = T.matrix()
-        self.W = self.init((self.input_dim, self.output_dim))
-        self.b = shared_zeros((self.output_dim))
-
-        self.params = [self.W, self.b]
-
-        self.regularizers = []
        self.W_regularizer = regularizers.get(W_regularizer)
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
        self.b_regularizer = regularizers.get(b_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        self.activity_regularizer = regularizers.get(activity_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.constraints = [self.W_constraint, self.b_constraint]

-        if weights is not None:
-            self.set_weights(weights)
+        self.initial_weights = weights

-        if name is not None:
-            self.set_name(name)
+        self.input_dim = input_dim
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_dim,)
+        super(Dense, self).__init__(**kwargs)

-    def set_name(self, name):
-        self.W.name = '%s_W' % name
-        self.b.name = '%s_b' % name
+    def build(self):
+        input_dim = self.input_shape[1]
+
+        self.input = T.matrix()
+        self.W = self.init((input_dim, self.output_dim))
+        self.b = shared_zeros((self.output_dim,))
+
+        self.params = [self.W, self.b]
+
+        self.regularizers = []
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights
+
+    @property
+    def output_shape(self):
+        return (self.input_shape[0], self.output_dim)

    def get_output(self, train=False):
        X = self.get_input(train)
@@ -412,16 +586,18 @@ class Dense(Layer):
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "activation": self.activation.__name__,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
-                "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None,
+                  "input_dim": self.input_dim}
+        base_config = super(Dense, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class ActivityRegularization(Layer):
@@ -429,8 +605,8 @@ class ActivityRegularization(Layer):
        Layer that passes through its input unchanged, but applies an update
        to the cost function based on the activity.
    '''
-    def __init__(self, l1=0., l2=0.):
-        super(ActivityRegularization, self).__init__()
+    def __init__(self, l1=0., l2=0., **kwargs):
+        super(ActivityRegularization, self).__init__(**kwargs)
        self.l1 = l1
        self.l2 = l2

@@ -442,58 +618,76 @@ class ActivityRegularization(Layer):
        return self.get_input(train)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "l1": self.l1,
-                "l2": self.l2}
+        config = {"name": self.__class__.__name__,
+                  "l1": self.l1,
+                  "l2": self.l2}
+        base_config = super(ActivityRegularization, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class TimeDistributedDense(MaskedLayer):
    '''
-       Apply a same DenseLayer for each dimension[1] (shared_dimension) input
-       Especially useful after a recurrent network with 'return_sequence=True'
-       Tensor input dimensions:   (nb_sample, shared_dimension, input_dim)
-       Tensor output dimensions:  (nb_sample, shared_dimension, output_dim)
+       Apply the same Dense layer to each dimension[1] (time_dimension) slice of the input.
+       Especially useful after a recurrent network with 'return_sequences=True'.
+       Tensor input dimensions:   (nb_sample, time_dimension, input_dim)
+       Tensor output dimensions:  (nb_sample, time_dimension, output_dim)

    '''
-    def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='linear', weights=None,
-                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
-                 W_constraint=None, b_constraint=None):
+    input_ndim = 3

-        super(TimeDistributedDense, self).__init__()
+    def __init__(self, output_dim, init='glorot_uniform', activation='linear', weights=None,
+                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+                 W_constraint=None, b_constraint=None, input_dim=None, input_length=None, **kwargs):
+        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.activation = activations.get(activation)
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-
-        self.input = T.tensor3()
-        self.W = self.init((self.input_dim, self.output_dim))
-        self.b = shared_zeros((self.output_dim))
-
-        self.params = [self.W, self.b]
-
-        self.regularizers = []

        self.W_regularizer = regularizers.get(W_regularizer)
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
        self.b_regularizer = regularizers.get(b_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        self.activity_regularizer = regularizers.get(activity_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.constraints = [self.W_constraint, self.b_constraint]

-        if weights is not None:
-            self.set_weights(weights)
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(TimeDistributedDense, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
+
+        self.input = T.tensor3()
+        self.W = self.init((input_dim, self.output_dim))
+        self.b = shared_zeros((self.output_dim))
+
+        self.params = [self.W, self.b]
+        self.regularizers = []
+
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        return (input_shape[0], input_shape[1], self.output_dim)

    def get_output(self, train=False):
        X = self.get_input(train)
@@ -501,27 +695,33 @@ class TimeDistributedDense(MaskedLayer):
        return output.dimshuffle(1, 0, 2)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "activation": self.activation.__name__,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
-                "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(TimeDistributedDense, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class AutoEncoder(Layer):
-    '''
-        A customizable autoencoder model.
-        If output_reconstruction then dim(input) = dim(output)
-        else dim(output) = dim(hidden)
-    '''
-    def __init__(self, encoder, decoder, output_reconstruction=True, weights=None):
+    '''A customizable autoencoder model.

-        super(AutoEncoder, self).__init__()
+    Tensor input dimensions: same as encoder input
+    Tensor output dimensions:
+        if output_reconstruction:
+            same as encoder input
+        else:
+            same as encoder output
+    '''
+    def __init__(self, encoder, decoder, output_reconstruction=True, weights=None, **kwargs):
+        super(AutoEncoder, self).__init__(**kwargs)

        self.output_reconstruction = output_reconstruction
        self.encoder = encoder
@@ -569,6 +769,17 @@ class AutoEncoder(Layer):
    def _get_hidden(self, train=False):
        return self.encoder.get_output(train)

+    @property
+    def input_shape(self):  # shape of the tensor fed into the encoder (the autoencoder's own input)
+        return self.encoder.previous.output_shape  # the bare expression used to be discarded, so this returned None
+
+    @property
+    def output_shape(self):
+        if self.output_reconstruction:
+            return self.encoder.previous.output_shape
+        else:
+            return self.decoder.previous.output_shape
+
    def get_output(self, train=False):
        if not train and not self.output_reconstruction:
            return self.encoder.get_output(train)
@@ -587,45 +798,58 @@ class MaxoutDense(Layer):
        Max-out layer, nb_feature is the number of pieces in the piecewise linear approx.
        Refer to http://arxiv.org/pdf/1302.4389.pdf
    '''
-    def __init__(self, input_dim, output_dim, nb_feature=4, init='glorot_uniform', weights=None,
-                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
-                 W_constraint=None, b_constraint=None):
+    input_ndim = 2

-        super(MaxoutDense, self).__init__()
-        self.init = initializations.get(init)
-        self.input_dim = input_dim
+    def __init__(self, output_dim, nb_feature=4, init='glorot_uniform', weights=None,
+                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+                 W_constraint=None, b_constraint=None, input_dim=None, **kwargs):
        self.output_dim = output_dim
        self.nb_feature = nb_feature
-
-        self.input = T.matrix()
-        self.W = self.init((self.nb_feature, self.input_dim, self.output_dim))
-        self.b = shared_zeros((self.nb_feature, self.output_dim))
-
-        self.params = [self.W, self.b]
-
-        self.regularizers = []
+        self.init = initializations.get(init)

        self.W_regularizer = regularizers.get(W_regularizer)
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
        self.b_regularizer = regularizers.get(b_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        self.activity_regularizer = regularizers.get(activity_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.constraints = [self.W_constraint, self.b_constraint]

-        if weights is not None:
-            self.set_weights(weights)
+        self.initial_weights = weights
+        self.input_dim = input_dim
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_dim,)
+        super(MaxoutDense, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[1]
+
+        self.input = T.matrix()
+        self.W = self.init((self.nb_feature, input_dim, self.output_dim))
+        self.b = shared_zeros((self.nb_feature, self.output_dim))
+
+        self.params = [self.W, self.b]
+        self.regularizers = []
+
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights
+
+    @property
+    def output_shape(self):
+        return (self.input_shape[0], self.output_dim)

    def get_output(self, train=False):
        X = self.get_input(train)
@@ -634,13 +858,15 @@ class MaxoutDense(Layer):
        return output

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "nb_feature": self.nb_feature,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
-                "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "nb_feature": self.nb_feature,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None,
+                  "input_dim": self.input_dim}
+        base_config = super(MaxoutDense, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -17,38 +17,42 @@ class Embedding(Layer):
        @input_dim: size of vocabulary (highest input integer + 1)
        @out_dim: size of dense representation
    '''
-    def __init__(self, input_dim, output_dim, init='uniform',
-                 W_regularizer=None, activity_regularizer=None, W_constraint=None,
-                 mask_zero=False, weights=None):
+    input_ndim = 2

-        super(Embedding, self).__init__()
-        self.init = initializations.get(init)
+    def __init__(self, input_dim, output_dim, init='uniform', input_length=None,
+                 W_regularizer=None, activity_regularizer=None, W_constraint=None,
+                 mask_zero=False, weights=None, **kwargs):
        self.input_dim = input_dim
        self.output_dim = output_dim
-
-        self.input = T.imatrix()
-        self.W = self.init((self.input_dim, self.output_dim))
+        self.init = initializations.get(init)
+        self.input_length = input_length
        self.mask_zero = mask_zero

-        self.params = [self.W]
-
        self.W_constraint = constraints.get(W_constraint)
        self.constraints = [self.W_constraint]

-        self.regularizers = []
-
        self.W_regularizer = regularizers.get(W_regularizer)
+        self.activity_regularizer = regularizers.get(activity_regularizer)
+
+        self.initial_weights = weights
+        kwargs['input_shape'] = (self.input_length,)  # per-sample shape is the sequence length; input_dim is the vocabulary size
+        super(Embedding, self).__init__(**kwargs)
+
+    def build(self):
+        self.input = T.imatrix()
+        self.W = self.init((self.input_dim, self.output_dim))
+        self.params = [self.W]
+        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

-        self.activity_regularizer = regularizers.get(activity_regularizer)
        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)

    def get_output_mask(self, train=None):
        X = self.get_input(train)
@@ -57,19 +61,27 @@ class Embedding(Layer):
        else:
            return T.ones_like(X) * (1 - T.eq(X, 0))

+    @property
+    def output_shape(self):
+        return (self.input_shape[0], self.input_length, self.output_dim)
+
    def get_output(self, train=False):
        X = self.get_input(train)
        out = self.W[X]
        return out

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
-                "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                "W_constraint": self.W_constraint.get_config() if self.W_constraint else None}
+        config = {"name": self.__class__.__name__,
+                  "input_dim": self.input_dim,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "input_length": self.input_length,
+                  "mask_zero": self.mask_zero,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None}
+        base_config = super(Embedding, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class WordContextProduct(Layer):
@@ -96,10 +108,12 @@ class WordContextProduct(Layer):
            Efficient Estimation of Word reprensentations in Vector Space
            http://arxiv.org/pdf/1301.3781v3.pdf
    '''
-    def __init__(self, input_dim, proj_dim=128,
-                 init='uniform', activation='sigmoid', weights=None):
+    input_ndim = 2

-        super(WordContextProduct, self).__init__()
+    def __init__(self, input_dim, proj_dim=128,
+                 init='uniform', activation='sigmoid', weights=None, **kwargs):
+
+        super(WordContextProduct, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.proj_dim = proj_dim
        self.init = initializations.get(init)
@@ -116,6 +130,10 @@ class WordContextProduct(Layer):
        if weights is not None:
            self.set_weights(weights)

+    @property
+    def output_shape(self):
+        return (self.input_shape[0], 1)
+
    def get_output(self, train=False):
        X = self.get_input(train)
        w = self.W_w[X[:, 0]]  # nb_samples, proj_dim
@@ -126,8 +144,10 @@ class WordContextProduct(Layer):
        return self.activation(dot)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "proj_dim": self.proj_dim,
-                "init": self.init.__name__,
-                "activation": self.activation.__name__}
+        config = {"name": self.__class__.__name__,
+                  "input_dim": self.input_dim,
+                  "proj_dim": self.proj_dim,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__}
+        base_config = super(WordContextProduct, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -10,8 +10,8 @@ class GaussianNoise(MaskedLayer):
    '''
        Corruption process with GaussianNoise
    '''
-    def __init__(self, sigma):
-        super(GaussianNoise, self).__init__()
+    def __init__(self, sigma, **kwargs):
+        super(GaussianNoise, self).__init__(**kwargs)
        self.sigma = sigma
        self.srng = RandomStreams(seed=np.random.randint(10e6))

@@ -24,8 +24,10 @@ class GaussianNoise(MaskedLayer):
                                        dtype=theano.config.floatX)

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "sigma": self.sigma}
+        config = {"name": self.__class__.__name__,
+                  "sigma": self.sigma}
+        base_config = super(GaussianNoise, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class GaussianDropout(MaskedLayer):
@@ -36,8 +38,8 @@ class GaussianDropout(MaskedLayer):
            Srivastava, Hinton, et al. 2014
            http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
    '''
-    def __init__(self, p):
-        super(GaussianDropout, self).__init__()
+    def __init__(self, p, **kwargs):
+        super(GaussianDropout, self).__init__(**kwargs)
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

@@ -49,5 +51,7 @@ class GaussianDropout(MaskedLayer):
        return X

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "p": self.p}
+        config = {"name": self.__class__.__name__,
+                  "p": self.p}
+        base_config = super(GaussianDropout, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -1,5 +1,5 @@
 from ..layers.core import Layer
-from ..utils.theano_utils import shared_zeros, shared_ones, ndim_tensor
+from ..utils.theano_utils import shared_zeros, shared_ones, ndim_tensor, floatX
 from .. import initializations

 import theano.tensor as T
@@ -16,25 +16,27 @@ class BatchNormalization(Layer):

            momentum: momentum term in the computation of a running estimate of the mean and std of the data
    '''
-    def __init__(self, input_shape, epsilon=1e-6, mode=0, momentum=0.9, weights=None):
-        super(BatchNormalization, self).__init__()
+    def __init__(self, epsilon=1e-6, mode=0, momentum=0.9, weights=None, **kwargs):
        self.init = initializations.get("uniform")
-        self.input_shape = input_shape
        self.epsilon = epsilon
        self.mode = mode
        self.momentum = momentum
-        self.input = ndim_tensor(len(self.input_shape) + 1)
+        self.initial_weights = weights
+        super(BatchNormalization, self).__init__(**kwargs)

-        self.gamma = self.init((self.input_shape))
-        self.beta = shared_zeros(self.input_shape)
+    def build(self):
+        input_shape = self.input_shape  # starts with samples axis
+        input_shape = input_shape[1:]
+        self.input = ndim_tensor(len(input_shape) + 1)
+
+        self.gamma = self.init((input_shape))
+        self.beta = shared_zeros(input_shape)

        self.params = [self.gamma, self.beta]
-        if weights is not None:
-            self.set_weights(weights)
+        self.running_mean = shared_zeros(input_shape)
+        self.running_std = shared_ones((input_shape))

-    def init_updates(self):
-        self.running_mean = shared_zeros(self.input_shape)
-        self.running_std = shared_ones((self.input_shape))
+        # initialize self.updates: batch mean/std computation
        X = self.get_input(train=True)
        m = X.mean(axis=0)
        std = T.mean((X - m) ** 2 + self.epsilon, axis=0) ** 0.5
@@ -42,6 +44,18 @@ class BatchNormalization(Layer):
        std_update = self.momentum * self.running_std + (1-self.momentum) * std
        self.updates = [(self.running_mean, mean_update), (self.running_std, std_update)]

+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights
+
+    def get_weights(self):
+        return super(BatchNormalization, self).get_weights() + [self.running_mean.get_value(), self.running_std.get_value()]
+
+    def set_weights(self, weights):
+        self.running_mean.set_value(floatX(weights[-2]))
+        self.running_std.set_value(floatX(weights[-1]))
+        super(BatchNormalization, self).set_weights(weights[:-2])
+
    def get_output(self, train):
        X = self.get_input(train)

@@ -57,10 +71,12 @@ class BatchNormalization(Layer):
        return out

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_shape": self.input_shape,
-                "epsilon": self.epsilon,
-                "mode": self.mode}
+        config = {"name": self.__class__.__name__,
+                  "epsilon": self.epsilon,
+                  "mode": self.mode,
+                  "momentum": self.momentum}
+        base_config = super(BatchNormalization, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class LRN2D(Layer):
@@ -69,10 +85,10 @@ class LRN2D(Layer):
    License at: https://github.com/lisa-lab/pylearn2/blob/master/LICENSE.txt
    """

-    def __init__(self, alpha=1e-4, k=2, beta=0.75, n=5):
+    def __init__(self, alpha=1e-4, k=2, beta=0.75, n=5, **kwargs):
        if n % 2 == 0:
            raise NotImplementedError("LRN2D only works with odd n. n provided: " + str(n))
-        super(LRN2D, self).__init__()
+        super(LRN2D, self).__init__(**kwargs)
        self.alpha = alpha
        self.k = k
        self.beta = beta
@@ -92,8 +108,10 @@ class LRN2D(Layer):
        return X / scale

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "alpha": self.alpha,
-                "k": self.k,
-                "beta": self.beta,
-                "n": self.n}
+        config = {"name": self.__class__.__name__,
+                  "alpha": self.alpha,
+                  "k": self.k,
+                  "beta": self.beta,
+                  "n": self.n}
+        base_config = super(LRN2D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -11,6 +11,8 @@ from six.moves import range


 class Recurrent(MaskedLayer):
+    input_ndim = 3
+
    def get_output_mask(self, train=None):
        if self.return_sequences:
            return super(Recurrent, self).get_output_mask(train)
@@ -24,7 +26,8 @@ class Recurrent(MaskedLayer):

        # mask is (nb_samples, time)
        mask = T.shape_padright(mask)  # (nb_samples, time, 1)
-        mask = T.addbroadcast(mask, -1)  # (time, nb_samples, 1) matrix.
+        mask = T.addbroadcast(mask, -1)  # the new dimension (the '1') is made broadcastable
+        # see http://deeplearning.net/software/theano/library/tensor/basic.html#broadcasting-in-theano-vs-numpy
        mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

        if pad > 0:
@@ -33,6 +36,14 @@ class Recurrent(MaskedLayer):
            mask = T.concatenate([padding, mask], axis=0)
        return mask.astype('int8')

+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        if self.return_sequences:
+            return (input_shape[0], input_shape[1], self.output_dim)
+        else:
+            return (input_shape[0], self.output_dim)
+

 class SimpleRNN(Recurrent):
    '''
@@ -42,27 +53,35 @@ class SimpleRNN(Recurrent):
        included for demonstration purposes
        (demonstrates how to use theano.scan to build a basic RNN).
    '''
-    def __init__(self, input_dim, output_dim,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal', activation='sigmoid', weights=None,
-                 truncate_gradient=-1, return_sequences=False):
-
-        super(SimpleRNN, self).__init__()
+                 truncate_gradient=-1, return_sequences=False, input_dim=None, input_length=None, **kwargs):
+        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
-        self.input_dim = input_dim
-        self.output_dim = output_dim
        self.truncate_gradient = truncate_gradient
        self.activation = activations.get(activation)
        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(SimpleRNN, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W = self.init((self.input_dim, self.output_dim))
+        self.W = self.init((input_dim, self.output_dim))
        self.U = self.inner_init((self.output_dim, self.output_dim))
        self.b = shared_zeros((self.output_dim))
        self.params = [self.W, self.U, self.b]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self, x_t, mask_tm1, h_tm1, u):
        '''
@@ -95,14 +114,17 @@ class SimpleRNN(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(SimpleRNN, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class SimpleDeepRNN(Recurrent):
@@ -115,30 +137,38 @@ class SimpleDeepRNN(Recurrent):
        This demonstrates how to build RNNs with arbitrary lookback.
        Also (probably) not a super useful model.
    '''
-    def __init__(self, input_dim, output_dim, depth=3,
+    def __init__(self, output_dim, depth=3,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='sigmoid', inner_activation='hard_sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(SimpleDeepRNN, self).__init__()
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
+        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
-        self.input_dim = input_dim
-        self.output_dim = output_dim
        self.truncate_gradient = truncate_gradient
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
        self.depth = depth
        self.return_sequences = return_sequences
-        self.input = T.tensor3()
+        self.initial_weights = weights

-        self.W = self.init((self.input_dim, self.output_dim))
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(SimpleDeepRNN, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
+        self.input = T.tensor3()
+        self.W = self.init((input_dim, self.output_dim))
        self.Us = [self.inner_init((self.output_dim, self.output_dim)) for _ in range(self.depth)]
        self.b = shared_zeros((self.output_dim))
        self.params = [self.W] + self.Us + [self.b]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self, x_t, *args):
        o = x_t
@@ -180,16 +210,19 @@ class SimpleDeepRNN(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "depth": self.depth,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "depth": self.depth,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(SimpleDeepRNN, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class GRU(Recurrent):
@@ -214,32 +247,39 @@ class GRU(Recurrent):
            Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
                http://arxiv.org/pdf/1412.3555v1.pdf
    '''
-    def __init__(self, input_dim, output_dim=128,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='sigmoid', inner_activation='hard_sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(GRU, self).__init__()
-        self.input_dim = input_dim
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
-        self.truncate_gradient = truncate_gradient
-        self.return_sequences = return_sequences
-
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        self.truncate_gradient = truncate_gradient
+        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(GRU, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W_z = self.init((self.input_dim, self.output_dim))
+        self.W_z = self.init((input_dim, self.output_dim))
        self.U_z = self.inner_init((self.output_dim, self.output_dim))
        self.b_z = shared_zeros((self.output_dim))

-        self.W_r = self.init((self.input_dim, self.output_dim))
+        self.W_r = self.init((input_dim, self.output_dim))
        self.U_r = self.inner_init((self.output_dim, self.output_dim))
        self.b_r = shared_zeros((self.output_dim))

-        self.W_h = self.init((self.input_dim, self.output_dim))
+        self.W_h = self.init((input_dim, self.output_dim))
        self.U_h = self.inner_init((self.output_dim, self.output_dim))
        self.b_h = shared_zeros((self.output_dim))

@@ -249,8 +289,9 @@ class GRU(Recurrent):
            self.W_h, self.U_h, self.b_h,
        ]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self,
              xz_t, xr_t, xh_t, mask_tm1,
@@ -283,15 +324,18 @@ class GRU(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(GRU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class LSTM(Recurrent):
@@ -319,37 +363,44 @@ class LSTM(Recurrent):
            Supervised sequence labelling with recurrent neural networks
                http://www.cs.toronto.edu/~graves/preprint.pdf
    '''
-    def __init__(self, input_dim, output_dim=128,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal', forget_bias_init='one',
                 activation='tanh', inner_activation='hard_sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(LSTM, self).__init__()
-        self.input_dim = input_dim
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
-        self.truncate_gradient = truncate_gradient
-        self.return_sequences = return_sequences
-
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.forget_bias_init = initializations.get(forget_bias_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        self.truncate_gradient = truncate_gradient
+        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(LSTM, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W_i = self.init((self.input_dim, self.output_dim))
+        self.W_i = self.init((input_dim, self.output_dim))
        self.U_i = self.inner_init((self.output_dim, self.output_dim))
        self.b_i = shared_zeros((self.output_dim))

-        self.W_f = self.init((self.input_dim, self.output_dim))
+        self.W_f = self.init((input_dim, self.output_dim))
        self.U_f = self.inner_init((self.output_dim, self.output_dim))
        self.b_f = self.forget_bias_init((self.output_dim))

-        self.W_c = self.init((self.input_dim, self.output_dim))
+        self.W_c = self.init((input_dim, self.output_dim))
        self.U_c = self.inner_init((self.output_dim, self.output_dim))
        self.b_c = shared_zeros((self.output_dim))

-        self.W_o = self.init((self.input_dim, self.output_dim))
+        self.W_o = self.init((input_dim, self.output_dim))
        self.U_o = self.inner_init((self.output_dim, self.output_dim))
        self.b_o = shared_zeros((self.output_dim))

@@ -360,8 +411,9 @@ class LSTM(Recurrent):
            self.W_o, self.U_o, self.b_o,
        ]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self,
              xi_t, xf_t, xo_t, xc_t, mask_tm1,
@@ -402,16 +454,19 @@ class LSTM(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "forget_bias_init": self.forget_bias_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "forget_bias_init": self.forget_bias_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(LSTM, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class JZS1(Recurrent):
@@ -434,27 +489,34 @@ class JZS1(Recurrent):
            An Empirical Exploration of Recurrent Network Architectures
                http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf
    '''
-    def __init__(self, input_dim, output_dim=128,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='tanh', inner_activation='sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(JZS1, self).__init__()
-        self.input_dim = input_dim
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
-        self.truncate_gradient = truncate_gradient
-        self.return_sequences = return_sequences
-
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        self.truncate_gradient = truncate_gradient
+        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(JZS1, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W_z = self.init((self.input_dim, self.output_dim))
+        self.W_z = self.init((input_dim, self.output_dim))
        self.b_z = shared_zeros((self.output_dim))

-        self.W_r = self.init((self.input_dim, self.output_dim))
+        self.W_r = self.init((input_dim, self.output_dim))
        self.U_r = self.inner_init((self.output_dim, self.output_dim))
        self.b_r = shared_zeros((self.output_dim))

@@ -462,21 +524,23 @@ class JZS1(Recurrent):
        self.b_h = shared_zeros((self.output_dim))

        # P_h used to project X onto different dimension, using sparse random projections
-        if self.input_dim == self.output_dim:
+        if input_dim == self.output_dim:
            self.Pmat = theano.shared(np.identity(self.output_dim, dtype=theano.config.floatX), name=None)
        else:
-            P = np.random.binomial(1, 0.5, size=(self.input_dim, self.output_dim)).astype(theano.config.floatX) * 2 - 1
-            P = 1 / np.sqrt(self.input_dim) * P
+            P = np.random.binomial(1, 0.5, size=(input_dim, self.output_dim)).astype(theano.config.floatX) * 2 - 1
+            P = 1 / np.sqrt(input_dim) * P
            self.Pmat = theano.shared(P, name=None)

        self.params = [
            self.W_z, self.b_z,
            self.W_r, self.U_r, self.b_r,
            self.U_h, self.b_h,
+            self.Pmat
        ]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self,
              xz_t, xr_t, xh_t, mask_tm1,
@@ -508,15 +572,18 @@ class JZS1(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(JZS1, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class JZS2(Recurrent):
@@ -539,50 +606,59 @@ class JZS2(Recurrent):
            An Empirical Exploration of Recurrent Network Architectures
                http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf
    '''
-    def __init__(self, input_dim, output_dim=128,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='tanh', inner_activation='sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(JZS2, self).__init__()
-        self.input_dim = input_dim
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
-        self.truncate_gradient = truncate_gradient
-        self.return_sequences = return_sequences
-
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        self.truncate_gradient = truncate_gradient
+        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(JZS2, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W_z = self.init((self.input_dim, self.output_dim))
+        self.W_z = self.init((input_dim, self.output_dim))
        self.U_z = self.inner_init((self.output_dim, self.output_dim))
        self.b_z = shared_zeros((self.output_dim))

        self.U_r = self.inner_init((self.output_dim, self.output_dim))
        self.b_r = shared_zeros((self.output_dim))

-        self.W_h = self.init((self.input_dim, self.output_dim))
+        self.W_h = self.init((input_dim, self.output_dim))
        self.U_h = self.inner_init((self.output_dim, self.output_dim))
        self.b_h = shared_zeros((self.output_dim))

        # P_h used to project X onto different dimension, using sparse random projections
-        if self.input_dim == self.output_dim:
+        if input_dim == self.output_dim:
            self.Pmat = theano.shared(np.identity(self.output_dim, dtype=theano.config.floatX), name=None)
        else:
-            P = np.random.binomial(1, 0.5, size=(self.input_dim, self.output_dim)).astype(theano.config.floatX) * 2 - 1
-            P = 1 / np.sqrt(self.input_dim) * P
+            P = np.random.binomial(1, 0.5, size=(input_dim, self.output_dim)).astype(theano.config.floatX) * 2 - 1
+            P = 1 / np.sqrt(input_dim) * P
            self.Pmat = theano.shared(P, name=None)

        self.params = [
            self.W_z, self.U_z, self.b_z,
            self.U_r, self.b_r,
            self.W_h, self.U_h, self.b_h,
+            self.Pmat
        ]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self,
              xz_t, xr_t, xh_t, mask_tm1,
@@ -614,15 +690,18 @@ class JZS2(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(JZS2, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class JZS3(Recurrent):
@@ -645,32 +724,39 @@ class JZS3(Recurrent):
            An Empirical Exploration of Recurrent Network Architectures
                http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf
    '''
-    def __init__(self, input_dim, output_dim=128,
+    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='tanh', inner_activation='sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
-
-        super(JZS3, self).__init__()
-        self.input_dim = input_dim
+                 weights=None, truncate_gradient=-1, return_sequences=False,
+                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
-        self.truncate_gradient = truncate_gradient
-        self.return_sequences = return_sequences
-
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        self.truncate_gradient = truncate_gradient
+        self.return_sequences = return_sequences
+        self.initial_weights = weights
+
+        self.input_dim = input_dim
+        self.input_length = input_length
+        if self.input_dim:
+            kwargs['input_shape'] = (self.input_length, self.input_dim)
+        super(JZS3, self).__init__(**kwargs)
+
+    def build(self):
+        input_dim = self.input_shape[2]
        self.input = T.tensor3()

-        self.W_z = self.init((self.input_dim, self.output_dim))
+        self.W_z = self.init((input_dim, self.output_dim))
        self.U_z = self.inner_init((self.output_dim, self.output_dim))
        self.b_z = shared_zeros((self.output_dim))

-        self.W_r = self.init((self.input_dim, self.output_dim))
+        self.W_r = self.init((input_dim, self.output_dim))
        self.U_r = self.inner_init((self.output_dim, self.output_dim))
        self.b_r = shared_zeros((self.output_dim))

-        self.W_h = self.init((self.input_dim, self.output_dim))
+        self.W_h = self.init((input_dim, self.output_dim))
        self.U_h = self.inner_init((self.output_dim, self.output_dim))
        self.b_h = shared_zeros((self.output_dim))

@@ -680,8 +766,9 @@ class JZS3(Recurrent):
            self.W_h, self.U_h, self.b_h,
        ]

-        if weights is not None:
-            self.set_weights(weights)
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights

    def _step(self,
              xz_t, xr_t, xh_t, mask_tm1,
@@ -714,12 +801,15 @@ class JZS3(Recurrent):
        return outputs[-1]

    def get_config(self):
-        return {"name": self.__class__.__name__,
-                "input_dim": self.input_dim,
-                "output_dim": self.output_dim,
-                "init": self.init.__name__,
-                "inner_init": self.inner_init.__name__,
-                "activation": self.activation.__name__,
-                "inner_activation": self.inner_activation.__name__,
-                "truncate_gradient": self.truncate_gradient,
-                "return_sequences": self.return_sequences}
+        config = {"name": self.__class__.__name__,
+                  "output_dim": self.output_dim,
+                  "init": self.init.__name__,
+                  "inner_init": self.inner_init.__name__,
+                  "activation": self.activation.__name__,
+                  "inner_activation": self.inner_activation.__name__,
+                  "truncate_gradient": self.truncate_gradient,
+                  "return_sequences": self.return_sequences,
+                  "input_dim": self.input_dim,
+                  "input_length": self.input_length}
+        base_config = super(JZS3, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -100,29 +100,29 @@ def standardize_weights(y, sample_weight=None, class_weight=None):
        return np.ones(y.shape[:-1] + (1,))


-def model_from_yaml(yaml_string):
+def model_from_yaml(yaml_string, custom_layers={}):
    '''
        Returns a model generated from a local yaml file,
        which is either created by hand or from to_yaml method of Sequential or Graph
    '''
    import yaml
    config = yaml.load(yaml_string)
-    return model_from_config(config)
+    return model_from_config(config, custom_layers=custom_layers)


-def model_from_json(json_string):
+def model_from_json(json_string, custom_layers={}):
    import json
    config = json.loads(json_string)
-    return model_from_config(config)
+    return model_from_config(config, custom_layers=custom_layers)


-def model_from_config(config):
+def model_from_config(config, custom_layers={}):
    model_name = config.get('name')
    if model_name not in {'Graph', 'Sequential'}:
        raise Exception('Unrecognized model:', model_name)

    # Create a container then set class to appropriate model
-    model = container_from_config(config)
+    model = container_from_config(config, custom_layers=custom_layers)
    if model_name == 'Graph':
        model.__class__ = Graph
    elif model_name == 'Sequential':
@@ -200,9 +200,8 @@ class Model(object):
                try:
                    ins_batch = slice_X(ins, batch_ids)
                except TypeError as err:
-                    print('TypeError while preparing batch. \
+                    raise Exception('TypeError while preparing batch. \
                        If using HDF5 input data, pass shuffle="batch".\n')
-                    raise

                batch_logs = {}
                batch_logs['batch'] = batch_index
@@ -313,17 +312,17 @@ class Model(object):
            pp.pprint(config)
        return config

-    def to_yaml(self):
+    def to_yaml(self, **kwargs):
        # dump model configuration to yaml string
        import yaml
        config = self.get_config()
-        return yaml.dump(config)
+        return yaml.dump(config, **kwargs)

-    def to_json(self):
+    def to_json(self, **kwargs):
        # dump model configuration to json string
        import json
        config = self.get_config()
-        return json.dumps(config)
+        return json.dumps(config, **kwargs)


 class Sequential(Model, containers.Sequential):
@@ -643,8 +642,9 @@ class Graph(Model, containers.Graph):
            validation_split=0., validation_data=None, shuffle=True, class_weight={}, sample_weight={}):
        X = [data[name] for name in self.input_order]
        y = [standardize_y(data[name]) for name in self.output_order]
-        sample_weight_list = [standardize_weights(data[name],
-                                                  sample_weight=sample_weight.get(name)) for name in self.output_order]
+
+        sample_weight_list = [standardize_weights(y[i],
+                                                  sample_weight=sample_weight.get(self.output_order[i])) for i in range(len(self.output_order))]
        class_weight_list = [class_weight.get(name) for name in self.output_order]

        val_f = None
@@ -671,7 +671,6 @@ class Graph(Model, containers.Graph):
                                                  sample_weight=sample_weight_list[i],
                                                  class_weight=class_weight_list[i]) for i in range(len(self.output_order))]
        ins = X + y + sample_weight_list
-
        history = self._fit(f, ins, out_labels=out_labels, batch_size=batch_size, nb_epoch=nb_epoch,
                            verbose=verbose, callbacks=callbacks,
                            val_f=val_f, val_ins=val_ins,
@@ -41,6 +41,9 @@ class Optimizer(object):
            norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]

+        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
+            grads = [T.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
+
        return grads

    def get_config(self):
@@ -55,6 +58,7 @@ class SGD(Optimizer):
        self.iterations = shared_scalar(0)
        self.lr = shared_scalar(lr)
        self.momentum = shared_scalar(momentum)
+        self.decay = shared_scalar(decay)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
@@ -1,6 +1,7 @@
 from __future__ import absolute_import

 import numpy as np
+import re
 from scipy import ndimage
 from scipy import linalg

@@ -103,7 +104,7 @@ def img_to_array(img):

 def load_img(path, grayscale=False):
    from PIL import Image
-    img = Image.open(open(path))
+    img = Image.open(path)
    if grayscale:
        img = img.convert('L')
    else: # Assure 3 channel even when loaded image is grayscale
@@ -7,7 +7,7 @@ from six.moves import range
 def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
    """
        Pad each sequence to the same length: 
-        the length of the longuest sequence.
+        the length of the longest sequence.

        If maxlen is provided, any sequence longer
        than maxlen is truncated to maxlen. Truncation happens off either the beginning (default) or
@@ -107,7 +107,10 @@ class Progbar(object):
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
-                info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
+                if type(self.sum_values[k]) is list:
+                    info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
+                else:
+                    info += ' - %s: %s' % (k, self.sum_values[k])

            self.total_width += len(info)
            if prev_total_width > self.total_width:
@@ -5,7 +5,7 @@ import theano
 import copy

 from ..layers.advanced_activations import LeakyReLU, PReLU
-from ..layers.core import Dense, Merge, Dropout, Activation, Reshape, Flatten, RepeatVector, Layer, AutoEncoder
+from ..layers.core import Dense, Merge, Dropout, Activation, Reshape, Flatten, RepeatVector, Layer, AutoEncoder, Masking, Permute
 from ..layers.core import ActivityRegularization, TimeDistributedDense, AutoEncoder, MaxoutDense
 from ..layers.convolutional import Convolution1D, Convolution2D, MaxPooling1D, MaxPooling2D, ZeroPadding2D
 from ..layers.embeddings import Embedding, WordContextProduct
@@ -17,7 +17,7 @@ from .. import regularizers
 from .. import constraints


-def container_from_config(original_layer_dict):
+def container_from_config(original_layer_dict, custom_layers={}):
    layer_dict = copy.deepcopy(original_layer_dict)
    name = layer_dict.get('name')

@@ -26,7 +26,7 @@ def container_from_config(original_layer_dict):
        layers = layer_dict.get('layers')
        layer_list = []
        for layer in layers:
-            init_layer = container_from_config(layer)
+            init_layer = container_from_config(layer, custom_layers=custom_layers)
            layer_list.append(init_layer)
        merge_layer = Merge(layer_list, mode)
        return merge_layer
@@ -35,7 +35,7 @@ def container_from_config(original_layer_dict):
        layers = layer_dict.get('layers')
        layer_list = []
        for layer in layers:
-            init_layer = container_from_config(layer)
+            init_layer = container_from_config(layer, custom_layers=custom_layers)
            layer_list.append(init_layer)
        seq_layer = containers.Sequential(layer_list)
        return seq_layer
@@ -49,7 +49,8 @@ def container_from_config(original_layer_dict):

        nodes = layer_dict.get('node_config')
        for node in nodes:
-            layer = container_from_config(layer_dict['nodes'].get(node['name']))
+            layer = container_from_config(layer_dict['nodes'].get(node['name']),
+                                          custom_layers=custom_layers)
            node['layer'] = layer
            graph_layer.add_node(**node)

@@ -59,8 +60,10 @@ def container_from_config(original_layer_dict):
        return graph_layer

    elif name == 'AutoEncoder':
-        kwargs = {'encoder': container_from_config(layer_dict.get('encoder_config')),
-                  'decoder': container_from_config(layer_dict.get('decoder_config'))}
+        kwargs = {'encoder': container_from_config(layer_dict.get('encoder_config'),
+                                                   custom_layers=custom_layers),
+                  'decoder': container_from_config(layer_dict.get('decoder_config'),
+                                                   custom_layers=custom_layers)}
        for kwarg in ['output_reconstruction', 'weights']:
            if kwarg in layer_dict:
                kwargs[kwarg] = layer_dict[kwarg]
@@ -79,7 +82,7 @@ def container_from_config(original_layer_dict):
                if vname in [x for x, y in inspect.getmembers(regularizers, predicate=inspect.isclass)]:
                    layer_dict[k] = regularizers.get(vname, v)

-        base_layer = get_layer(name, layer_dict)
+        base_layer = get_layer(name, layer_dict, custom_layers=custom_layers)
        return base_layer


@@ -121,5 +124,8 @@ def print_layer_shapes(model, input_shapes):


 from .generic_utils import get_from_module
-def get_layer(identifier, kwargs=None):
+def get_layer(identifier, kwargs=None, custom_layers={}):
+    '''Instantiate the layer named `identifier`; `custom_layers` maps extra class names to classes.'''
+    # Look names up in a merged copy so custom classes never leak into, or overwrite, this module's globals().
+    layer_classes = dict(globals(), **custom_layers)
-    return get_from_module(identifier, globals(), 'layer', instantiate=True, kwargs=kwargs)
+    return get_from_module(identifier, layer_classes, 'layer', instantiate=True, kwargs=kwargs)
@@ -38,3 +38,7 @@ def ndim_tensor(ndim):
    elif ndim == 4:
        return T.tensor4()
    return T.matrix()
+
+
+def on_gpu():
+    return theano.config.device.startswith('gpu')  # True when the device name begins with 'gpu'
@@ -0,0 +1,41 @@
+import pydot
+# old pydot will not work with python3, must use one
+# that works with python3 such as pydot2 or pydot
+
+
+def plot(model, to_file='model.png'):
+    '''Draw the layers of a Sequential or Graph `model` with pydot and write the PNG to `to_file`.'''
+    # Local import: this module never imported the model classes, so the checks below hit NameError.
+    from keras.models import Sequential, Graph
+
+    graph = pydot.Dot(graph_type='digraph')
+    if isinstance(model, Sequential):
+        previous_node = None
+        written_nodes = []
+        n = 1
+        for node in model.get_config()['layers']:
+            # append number in case layers have same name to differentiate
+            if (node['name'] + str(n)) in written_nodes:
+                n += 1
+            current_node = pydot.Node(node['name'] + str(n))
+            written_nodes.append(node['name'] + str(n))
+            graph.add_node(current_node)
+            if previous_node:
+                graph.add_edge(pydot.Edge(previous_node, current_node))
+            previous_node = current_node
+        graph.write_png(to_file)
+    elif isinstance(model, Graph):
+        # don't need to append number for names since all nodes labeled
+        for input_node in model.input_config:
+            graph.add_node(pydot.Node(input_node['name']))
+        # intermediate and output nodes have input defined
+        for layer_config in [model.node_config, model.output_config]:
+            for node in layer_config:
+                graph.add_node(pydot.Node(node['name']))
+                # possible to have multiple 'inputs' vs 1 'input'
+                if node['inputs']:
+                    for e in node['inputs']:
+                        graph.add_edge(pydot.Edge(e, node['name']))
+                else:
+                    graph.add_edge(pydot.Edge(node['input'], node['name']))
+        graph.write_png(to_file)
@@ -3,12 +3,15 @@ from setuptools import find_packages


 setup(name='Keras',
-      version='0.1.2',
+      version='0.2.0',
      description='Theano-based Deep Learning library',
      author='Francois Chollet',
      author_email='francois.chollet@gmail.com',
      url='https://github.com/fchollet/keras',
-      download_url='https://github.com/fchollet/keras/tarball/0.1.2',
+      download_url='https://github.com/fchollet/keras/tarball/0.2.0',
      license='MIT',
-      install_requires=['theano', 'pyyaml', 'h5py'],
+      install_requires=['theano', 'pyyaml'],
+      extras_require={
+          'h5py': ['h5py'],
+      },
      packages=find_packages())
@@ -0,0 +1,163 @@
+import unittest
+import numpy as np
+from numpy.testing import assert_allclose
+import theano
+
+from keras.layers import convolutional
+
+
+class TestConvolutions(unittest.TestCase):
+    '''Smoke tests that build each convolutional layer and evaluate its output.'''
+
+    def test_convolution_1d(self):
+        '''Evaluate Convolution1D for every weight, border, subsample and regularizer combination.'''
+        nb_samples = 9
+        nb_steps = 7
+        input_dim = 10
+        filter_length = 6
+        nb_filter = 5
+
+        weights_in = [np.ones((nb_filter, input_dim, filter_length, 1)), np.ones(nb_filter)]
+        input = np.ones((nb_samples, nb_steps, input_dim))
+        for weight in [None, weights_in]:
+            for border_mode in ['valid', 'full', 'same']:
+                for subsample_length in [1, 3]:
+                    if border_mode == 'same' and subsample_length != 1:
+                        continue
+                    for W_regularizer in [None, 'l2']:
+                        for b_regularizer in [None, 'l2']:
+                            for act_regularizer in [None, 'l2']:
+                                layer = convolutional.Convolution1D(
+                                    nb_filter, filter_length, weights=weight,
+                                    border_mode=border_mode, W_regularizer=W_regularizer,
+                                    b_regularizer=b_regularizer, activity_regularizer=act_regularizer,
+                                    subsample_length=subsample_length, input_shape=(None, input_dim))
+
+                                layer.input = theano.shared(value=input)
+                                for train in [True, False]:
+                                    out = layer.get_output(train).eval()
+                                    assert input.shape[0] == out.shape[0]
+                                    if border_mode == 'same' and subsample_length == 1:
+                                        assert input.shape[1] == out.shape[1]
+
+                                config = layer.get_config()
+
+    def test_maxpooling_1d(self):
+        '''Evaluate MaxPooling1D for each stride / ignore_border setting.'''
+        nb_samples = 9
+        nb_steps = 7
+        input_dim = 10
+        input = np.ones((nb_samples, nb_steps, input_dim))
+        for ignore_border in [True, False]:
+            for stride in [1, 2]:
+                layer = convolutional.MaxPooling1D(stride=stride, ignore_border=ignore_border)
+                layer.input = theano.shared(value=input)
+                for train in [True, False]:
+                    layer.get_output(train).eval()
+
+                config = layer.get_config()
+
+    def test_convolution_2d(self):
+        '''Evaluate Convolution2D for every weight, border, subsample and regularizer combination.'''
+        nb_samples = 8
+        nb_filter = 9
+        stack_size = 7
+        nb_row = 10
+        nb_col = 6
+        input_nb_row = 11
+        input_nb_col = 12
+
+        weights_in = [np.ones((nb_filter, stack_size, nb_row, nb_col)), np.ones(nb_filter)]
+        input = np.ones((nb_samples, stack_size, input_nb_row, input_nb_col))
+        for weight in [None, weights_in]:
+            for border_mode in ['valid', 'full', 'same']:
+                for subsample in [(1, 1), (2, 3)]:
+                    if border_mode == 'same' and subsample != (1, 1):
+                        continue
+                    for W_regularizer in [None, 'l2']:
+                        for b_regularizer in [None, 'l2']:
+                            for act_regularizer in [None, 'l2']:
+                                layer = convolutional.Convolution2D(
+                                    nb_filter, nb_row, nb_col, weights=weight,
+                                    border_mode=border_mode, W_regularizer=W_regularizer,
+                                    b_regularizer=b_regularizer, activity_regularizer=act_regularizer,
+                                    subsample=subsample, input_shape=(stack_size, None, None))
+
+                                layer.input = theano.shared(value=input)
+                                for train in [True, False]:
+                                    out = layer.get_output(train).eval()
+                                    if border_mode == 'same' and subsample == (1, 1):
+                                        assert out.shape[2:] == input.shape[2:]
+
+                                config = layer.get_config()
+
+    def test_maxpooling_2d(self):
+        '''Evaluate MaxPooling2D for each stride / ignore_border setting.'''
+        nb_samples = 9
+        stack_size = 7
+        input_nb_row = 11
+        input_nb_col = 12
+        pool_size = (3, 3)
+        input = np.ones((nb_samples, stack_size, input_nb_row, input_nb_col))
+        for ignore_border in [True, False]:
+            for stride in [(1, 1), (2, 2)]:
+                layer = convolutional.MaxPooling2D(stride=stride, ignore_border=ignore_border, pool_size=pool_size)
+                layer.input = theano.shared(value=input)
+                for train in [True, False]:
+                    layer.get_output(train).eval()
+
+                config = layer.get_config()
+
+    def test_zero_padding_2d(self):
+        '''ZeroPadding2D(padding=(2, 2)) must add a zero border and leave the interior untouched.'''
+        nb_samples = 9
+        stack_size = 7
+        input_nb_row = 11
+        input_nb_col = 12
+        input = np.ones((nb_samples, stack_size, input_nb_row, input_nb_col))
+        layer = convolutional.ZeroPadding2D(padding=(2, 2))
+        layer.input = theano.shared(value=input)
+        for train in [True, False]:
+            out = layer.get_output(train).eval()
+            for offset in [0, 1, -1, -2]:
+                assert_allclose(out[:, :, offset, :], 0.)
+                assert_allclose(out[:, :, :, offset], 0.)
+            assert_allclose(out[:, :, 2:-2, 2:-2], 1.)
+
+        config = layer.get_config()
+
+    def test_upsample_1d(self):
+        '''UpSample1D must multiply the number of steps by `length`.'''
+        nb_samples = 9
+        nb_steps = 7
+        input_dim = 10
+        input = np.ones((nb_samples, nb_steps, input_dim))
+        for length in [2, 3, 9]:
+            layer = convolutional.UpSample1D(length=length)
+            layer.input = theano.shared(value=input)
+            for train in [True, False]:
+                out = layer.get_output(train).eval()
+                assert out.shape[1] == length*nb_steps
+
+            config = layer.get_config()
+
+    def test_upsample_2d(self):
+        '''UpSample2D must scale rows and columns by the corresponding `size` factor.'''
+        nb_samples = 9
+        stack_size = 7
+        input_nb_row = 11
+        input_nb_col = 12
+        input = np.ones((nb_samples, stack_size, input_nb_row, input_nb_col))
+        for length_row in [2, 3, 9]:
+            for length_col in [2, 3, 9]:
+                layer = convolutional.UpSample2D(size=(length_row, length_col))
+                layer.input = theano.shared(value=input)
+                for train in [True, False]:
+                    out = layer.get_output(train).eval()
+                    assert out.shape[2] == length_row*input_nb_row
+                    assert out.shape[3] == length_col*input_nb_col
+
+                config = layer.get_config()
+
+if __name__ == '__main__':
+    unittest.main()
@@ -12,11 +12,6 @@ class TestLayerBase(unittest.TestCase):
        input_dim = 5
        layer = core.Layer()

-        # As long as there is no input, an error should be raised.
-        for train in [True, False]:
-            self.assertRaises(AttributeError, layer.get_input, train)
-            self.assertRaises(AttributeError, layer.get_output, train)
-
        # Once an input is provided, it should be reachable through the
        # appropriate getters
        input = np.ones((nb_samples, input_dim))
@@ -34,10 +29,6 @@ class TestLayerBase(unittest.TestCase):
        input = np.ones((nb_samples, input_dim))
        layer1.input = theano.shared(value=input)

-        # As long as there is no previous layer, an error should be raised.
-        for train in [True, False]:
-            self.assertRaises(AttributeError, layer2.get_input, train)
-
        # After connecting, input of layer1 should be passed through
        layer2.set_previous(layer1)
        for train in [True, False]:
@@ -81,7 +72,7 @@ class TestConfigParams(unittest.TestCase):
        self._runner(layer)

    def test_reshape(self):
-        layer = core.Reshape(10, 10)
+        layer = core.Reshape(dims=(10, 10))
        self._runner(layer)

    def test_flatten(self):
@@ -93,7 +84,7 @@ class TestConfigParams(unittest.TestCase):
        self._runner(layer)

    def test_dense(self):
-        layer = core.Dense(10, 10)
+        layer = core.Dense(10, input_shape=(10,))
        self._runner(layer)

    def test_act_reg(self):
@@ -101,7 +92,11 @@ class TestConfigParams(unittest.TestCase):
        self._runner(layer)

    def test_time_dist_dense(self):
-        layer = core.TimeDistributedDense(10, 10)
+        layer = core.TimeDistributedDense(10, input_shape=(None, 10))
+        self._runner(layer)
+
+    def test_time_dist_merge(self):
+        layer = core.TimeDistributedMerge()
        self._runner(layer)

    def test_autoencoder(self):
@@ -126,9 +121,8 @@ class TestMasking(unittest.TestCase):
        func = theano.function([layer.input], layer.get_output_mask())
        self.assertTrue(np.all(
            # get mask for this input
-            func(np.array(
-            [[[1], [2], [3], [0]],
-             [[0], [4], [5], [0]]], dtype=np.int32)) ==
+            func(np.array([[[1], [2], [3], [0]],
+                          [[0], [4], [5], [0]]], dtype=np.int32)) ==
            # This is the expected output mask, one dimension less
            np.array([[1, 1, 1, 0], [0, 1, 1, 0]])))

@@ -138,9 +132,8 @@ class TestMasking(unittest.TestCase):
        func = theano.function([layer.input], layer.get_output_mask())
        self.assertTrue(np.all(
            # get mask for this input, if not all the values are 5, shouldn't masked
-            func(np.array(
-            [[[1, 1], [2, 1], [3, 1], [5, 5]],
-             [[1, 5], [5, 0], [0, 0], [0, 0]]], dtype=np.int32)) ==
+            func(np.array([[[1, 1], [2, 1], [3, 1], [5, 5]],
+                          [[1, 5], [5, 0], [0, 0], [0, 0]]], dtype=np.int32)) ==
            # This is the expected output mask, one dimension less
            np.array([[1, 1, 1, 0], [1, 1, 1, 1]])))

@@ -150,12 +143,11 @@ class TestMasking(unittest.TestCase):
        func = theano.function([layer.input], layer.get_output())
        self.assertTrue(np.all(
            # get output for this input, replace padding with 0
-            func(np.array(
-            [[[1, 1], [2, 1], [3, 1], [5, 5]],
-             [[1, 5], [5, 0], [0, 0], [0, 0]]], dtype=np.int32)) ==
+            func(np.array([[[1, 1], [2, 1], [3, 1], [5, 5]],
+                          [[1, 5], [5, 0], [0, 0], [0, 0]]], dtype=np.int32)) ==
            # This is the expected output
            np.array([[[1, 1], [2, 1], [3, 1], [0, 0]],
-             [[1, 5], [5, 0], [0, 0], [0, 0]]])))
+                     [[1, 5], [5, 0], [0, 0], [0, 0]]])))


 if __name__ == '__main__':
@@ -12,21 +12,20 @@ def _runner(layer_class):
    All the recurrent layers share the same interface, so we can run through them with a single
    function.
    """
-    for weights in [None, [np.ones((input_dim, output_dim))]]:
-        for ret_seq in [True, False]:
-            layer = layer_class(input_dim, output_dim, return_sequences=ret_seq, weights=weights)
-            layer.input = theano.shared(value=np.ones((nb_samples, timesteps, input_dim)))
-            config = layer.get_config()
+    for ret_seq in [True, False]:
+        layer = layer_class(output_dim, return_sequences=ret_seq, weights=None, input_shape=(None, input_dim))
+        layer.input = theano.shared(value=np.ones((nb_samples, timesteps, input_dim)))
+        config = layer.get_config()

-            for train in [True, False]:
-                out = layer.get_output(train).eval()
-                # Make sure the output has the desired shape
-                if ret_seq:
-                    assert(out.shape == (nb_samples, timesteps, output_dim))
-                else:
-                    assert(out.shape == (nb_samples, output_dim))
+        for train in [True, False]:
+            out = layer.get_output(train).eval()
+            # Make sure the output has the desired shape
+            if ret_seq:
+                assert(out.shape == (nb_samples, timesteps, output_dim))
+            else:
+                assert(out.shape == (nb_samples, output_dim))

-                mask = layer.get_output_mask(train)
+            mask = layer.get_output_mask(train)


 class TestRNNS(unittest.TestCase):
@@ -6,6 +6,7 @@ import theano.tensor as T

 import numpy

+
 def list_assert_equal(a, b, round_to=7):
    '''
    This will do a pairwise, rounded equality test across two lists of
@@ -15,13 +16,14 @@ def list_assert_equal(a, b, round_to=7):
    for i, j in pairs:
        assert round(i, round_to) == round(j, round_to)

+
 def get_standard_values():
    '''
    These are just a set of floats used for testing the activation
    functions, and are useful in multiple tests.
    '''
+    return [0, 0.1, 0.5, 0.9, 1.0]

-    return [0,0.1,0.5,0.9,1.0]

 def test_softmax():

@@ -39,7 +41,7 @@ def test_softmax():
    x = T.vector()
    exp = s(x)
    f = theano.function([x], exp)
-    test_values=get_standard_values()
+    test_values = get_standard_values()

    result = f(test_values)
    expected = softmax(test_values)
@@ -49,6 +51,7 @@ def test_softmax():

    list_assert_equal(result, expected)

+
 def test_relu():
    '''
    Relu implementation doesn't depend on the value being
@@ -69,11 +72,10 @@ def test_relu():
    test_values = get_standard_values()
    result = f(test_values)

-    list_assert_equal(result, test_values) # because no negatives in test values
+    list_assert_equal(result, test_values)  # because no negatives in test values


 def test_tanh():
-
    from keras.activations import tanh as t
    test_values = get_standard_values()

@@ -61,7 +61,7 @@ class TestConstraints(unittest.TestCase):
        normalized = unitnorm_instance(self.example_array)

        norm_of_normalized = np.sqrt(np.sum(normalized.eval()**2, axis=1))
-        difference = norm_of_normalized - 1. #in the unit norm constraint, it should be equal to 1.
+        difference = norm_of_normalized - 1.  # in the unit norm constraint, it should be equal to 1.
        largest_difference = np.max(np.abs(difference))
        self.assertAlmostEqual(largest_difference, 0.)

@@ -15,15 +15,15 @@ class TestBatchNormalization(unittest.TestCase):
        self.input_shapes = [np.ones((10, 10)), np.ones((10, 10, 10))]

    def test_setup(self):
-        norm_m0 = normalization.BatchNormalization((10, 10))
-        norm_m1 = normalization.BatchNormalization((10, 10), mode=1)
+        norm_m0 = normalization.BatchNormalization(input_shape=(10, 10))
+        norm_m1 = normalization.BatchNormalization(input_shape=(10, 10), mode=1)

        # mode 3 does not exist
-        self.assertRaises(Exception, normalization.BatchNormalization((10, 10), mode=3))
+        self.assertRaises(Exception, normalization.BatchNormalization(input_shape=(10, 10), mode=3))  # NOTE(review): the layer is built before assertRaises runs and the instance (not the class) is passed as the callable - confirm this checks what is intended

    def test_mode_0(self):
        model = Sequential()
-        norm_m0 = normalization.BatchNormalization((10,))
+        norm_m0 = normalization.BatchNormalization(input_shape=(10,))
        model.add(norm_m0)
        model.compile(loss='mse', optimizer='sgd')

@@ -37,8 +37,7 @@ class TestBatchNormalization(unittest.TestCase):
        self.assertAlmostEqual(out.std().eval(), 1.0, places=1)

    def test_mode_1(self):
-        norm_m1 = normalization.BatchNormalization((10,), mode=1)
-        norm_m1.init_updates()
+        norm_m1 = normalization.BatchNormalization(input_shape=(10,), mode=1)

        for inp in [self.input_1, self.input_2, self.input_3]:
            norm_m1.input = inp
@@ -54,12 +53,11 @@ class TestBatchNormalization(unittest.TestCase):
        Test batch normalization with various input shapes
        """
        for inp in self.input_shapes:
-            norm_m0 = normalization.BatchNormalization(inp.shape, mode=0)
-            norm_m0.init_updates()
+            norm_m0 = normalization.BatchNormalization(input_shape=inp.shape, mode=0)
            norm_m0.input = inp
            out = (norm_m0.get_output(train=True) - norm_m0.beta) / norm_m0.gamma

-            norm_m1 = normalization.BatchNormalization(inp.shape, mode=1)
+            norm_m1 = normalization.BatchNormalization(input_shape=inp.shape, mode=1)
            norm_m1.input = inp
            out = (norm_m1.get_output(train=True) - norm_m1.beta) / norm_m1.gamma

@@ -67,9 +65,8 @@ class TestBatchNormalization(unittest.TestCase):
        """
        Test weight initialization
        """
-
-        norm_m1 = normalization.BatchNormalization((10,), mode=1, weights=[np.ones(10), np.ones(10)])
-        norm_m1.init_updates()
+        norm_m1 = normalization.BatchNormalization(input_shape=(10,), mode=1,
+                                                   weights=[np.ones(10), np.ones(10), np.zeros(10), np.zeros(10)])

        for inp in [self.input_1, self.input_2, self.input_3]:
            norm_m1.input = inp
@@ -83,17 +80,19 @@ class TestBatchNormalization(unittest.TestCase):
        assert_allclose(norm_m1.gamma.eval(), np.ones(10))
        assert_allclose(norm_m1.beta.eval(), np.ones(10))

-        # Weights must be an iterable of gamma AND beta.
-        self.assertRaises(Exception, normalization.BatchNormalization((10,)), weights=np.ones(10))
-
    def test_config(self):
-        norm = normalization.BatchNormalization((10, 10), mode=1, epsilon=0.1)
+        norm = normalization.BatchNormalization(input_shape=(10, 10), mode=1, epsilon=0.1, momentum=0.9)
        conf = norm.get_config()
        conf_target = {"input_shape": (10, 10), "name": normalization.BatchNormalization.__name__,
-                       "epsilon": 0.1, "mode": 1}
-
+                       "epsilon": 0.1, "mode": 1, "momentum": 0.9}
        self.assertDictEqual(conf, conf_target)

+    def test_save_weights(self):
+        '''get_weights() must return four entries that set_weights() accepts back.'''
+        norm = normalization.BatchNormalization(input_shape=(10, 10), mode=1, epsilon=0.1)
+        self.assertEqual(len(norm.get_weights()), 4)
+        norm.set_weights(norm.get_weights())
+

 if __name__ == '__main__':
    unittest.main()
@@ -14,9 +14,9 @@ class TestEmbedding(unittest.TestCase):

    def test_unitnorm_constraint(self):
        lookup = Sequential()
-        lookup.add(Embedding(3, 2, weights=[self.W1], W_constraint=unitnorm()))
+        lookup.add(Embedding(3, 2, weights=[self.W1], W_constraint=unitnorm(), input_length=1))
        lookup.add(Flatten())
-        lookup.add(Dense(2, 1))
+        lookup.add(Dense(1))
        lookup.add(Activation('sigmoid'))
        lookup.compile(loss='binary_crossentropy', optimizer='sgd', class_mode='binary')
        lookup.train_on_batch(self.X1, np.array([[1], [0]], dtype='int32'))
@@ -23,11 +23,11 @@ class TestGraph(unittest.TestCase):
    def test_1o_1i(self):
        print('test a non-sequential graph with 1 input and 1 output')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 4), name='dense3', input='dense1')
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(4), name='dense3', input='dense1')

        graph.add_output(name='output1', inputs=['dense2', 'dense3'], merge_mode='sum')
        graph.compile('rmsprop', {'output1': 'mse'})
@@ -45,14 +45,14 @@ class TestGraph(unittest.TestCase):
    def test_1o_1i_2(self):
        print('test a more complex non-sequential graph with 1 input and 1 output')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2-0', input='input1')
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2-0', input='input1')
        graph.add_node(Activation('relu'), name='dense2', input='dense2-0')

-        graph.add_node(Dense(4, 16), name='dense3', input='dense2')
-        graph.add_node(Dense(16, 4), name='dense4', inputs=['dense1', 'dense3'], merge_mode='sum')
+        graph.add_node(Dense(16), name='dense3', input='dense2')
+        graph.add_node(Dense(4), name='dense4', inputs=['dense1', 'dense3'], merge_mode='sum')

        graph.add_output(name='output1', inputs=['dense2', 'dense4'], merge_mode='sum')
        graph.compile('rmsprop', {'output1': 'mse'})
@@ -71,12 +71,12 @@ class TestGraph(unittest.TestCase):
    def test_1o_2i(self):
        print('test a non-sequential graph with 2 inputs and 1 output')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
-        graph.add_input(name='input2', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))
+        graph.add_input(name='input2', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input2')
-        graph.add_node(Dense(16, 4), name='dense3', input='dense1')
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input2')
+        graph.add_node(Dense(4), name='dense3', input='dense1')

        graph.add_output(name='output1', inputs=['dense2', 'dense3'], merge_mode='sum')
        graph.compile('rmsprop', {'output1': 'mse'})
@@ -95,11 +95,11 @@ class TestGraph(unittest.TestCase):
    def test_2o_1i_weights(self):
        print('test a non-sequential graph with 1 input and 2 outputs')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 1), name='dense3', input='dense1')
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(1), name='dense3', input='dense1')

        graph.add_output(name='output1', input='dense2')
        graph.add_output(name='output2', input='dense3')
@@ -118,10 +118,10 @@ class TestGraph(unittest.TestCase):
        print('test weight saving')
        graph.save_weights('temp.h5', overwrite=True)
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 1), name='dense3', input='dense1')
+        graph.add_input(name='input1', input_shape=(32,))
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(1), name='dense3', input='dense1')
        graph.add_output(name='output1', input='dense2')
        graph.add_output(name='output2', input='dense3')
        graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
@@ -133,11 +133,11 @@ class TestGraph(unittest.TestCase):
    def test_2o_1i_sample_weights(self):
        print('test a non-sequential graph with 1 input and 2 outputs with sample weights')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 1), name='dense3', input='dense1')
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(1), name='dense3', input='dense1')

        graph.add_output(name='output1', input='dense2')
        graph.add_output(name='output2', input='dense3')
@@ -166,16 +166,16 @@ class TestGraph(unittest.TestCase):
        print('test layer-like API')

        graph = containers.Graph()
-        graph.add_input(name='input1', ndim=2)
-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 4), name='dense3', input='dense1')
+        graph.add_input(name='input1', input_shape=(32,))
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(4), name='dense3', input='dense1')
        graph.add_output(name='output1', inputs=['dense2', 'dense3'], merge_mode='sum')

        seq = Sequential()
-        seq.add(Dense(32, 32, name='first_seq_dense'))
+        seq.add(Dense(32, input_shape=(32,)))
        seq.add(graph)
-        seq.add(Dense(4, 4, name='last_seq_dense'))
+        seq.add(Dense(4))

        seq.compile('rmsprop', 'mse')

@@ -191,12 +191,12 @@ class TestGraph(unittest.TestCase):
    def test_create_output(self):
        print('test create_output argument')
        graph = Graph()
-        graph.add_input(name='input1', ndim=2)
+        graph.add_input(name='input1', input_shape=(32,))

-        graph.add_node(Dense(32, 16), name='dense1', input='input1')
-        graph.add_node(Dense(32, 4), name='dense2', input='input1')
-        graph.add_node(Dense(16, 4), name='dense3', input='dense1')
-        graph.add_node(Dense(4, 4), name='output1', inputs=['dense2', 'dense3'], merge_mode='sum', create_output=True)
+        graph.add_node(Dense(16), name='dense1', input='input1')
+        graph.add_node(Dense(4), name='dense2', input='input1')
+        graph.add_node(Dense(4), name='dense3', input='dense1')
+        graph.add_node(Dense(4), name='output1', inputs=['dense2', 'dense3'], merge_mode='sum', create_output=True)
        graph.compile('rmsprop', {'output1': 'mse'})

        history = graph.fit({'input1': X_train, 'output1': y_train}, nb_epoch=10)
@@ -209,6 +209,34 @@ class TestGraph(unittest.TestCase):
        print(loss)
        assert(loss < 2.5)

+    def test_count_params(self):
+        '''count_params() must match the hand-computed total before and after compile.'''
+        print('test count params')
+
+        nb_units = 100
+        nb_classes = 2
+
+        graph = Graph()
+        graph.add_input(name='input1', input_shape=(32,))
+        graph.add_input(name='input2', input_shape=(32,))
+        graph.add_node(Dense(nb_units), name='dense1', input='input1')
+        graph.add_node(Dense(nb_classes), name='dense2', input='input2')
+        graph.add_node(Dense(nb_classes), name='dense3', input='dense1')
+        graph.add_output(name='output', inputs=['dense2', 'dense3'], merge_mode='sum')
+
+        # Expected total: for each Dense node, inputs * outputs plus outputs.
+        expected = sum([
+            32 * nb_units + nb_units,            # dense1
+            32 * nb_classes + nb_classes,        # dense2
+            nb_units * nb_classes + nb_classes,  # dense3
+        ])
+
+        self.assertEqual(expected, graph.count_params())
+
+        # The same total is expected once the model has been compiled.
+        graph.compile('rmsprop', {'output': 'binary_crossentropy'})
+        self.assertEqual(expected, graph.count_params())
+

 if __name__ == '__main__':
    print('Test graph model')
@@ -14,8 +14,8 @@ class TestLossMasking(unittest.TestCase):
            [[[1, 1], [2, 1], [3, 1], [5, 5]],
             [[1, 5], [5, 0], [0, 0], [0, 0]]], dtype=np.int32)
        model = Sequential()
-        model.add(Masking(mask_value=0))
-        model.add(TimeDistributedDense(2, 1, init='one'))
+        model.add(Masking(mask_value=0, input_shape=(None, 2)))
+        model.add(TimeDistributedDense(1, init='one'))
        model.compile(loss='mse', optimizer='sgd')
        y = model.predict(X)
        loss = model.fit(X, 4*y, nb_epoch=1, batch_size=2, verbose=1).history['loss'][0]
@@ -42,18 +42,18 @@ sample_weight[y_train == weighted_class] = high_weight

 def create_sequential_model():
    model = Sequential()
-    model.add(Dense(784, 50))
+    model.add(Dense(50, input_shape=(784,)))
    model.add(Activation('relu'))
-    model.add(Dense(50, 10))
+    model.add(Dense(10))
    model.add(Activation('softmax'))
    return model


 def create_graph_model():
    model = Graph()
-    model.add_input(name='input')
-    model.add_node(Dense(784, 50, activation='relu'), name='d1', input='input')
-    model.add_node(Dense(50, 10, activation='softmax'), name='d2', input='d1')
+    model.add_input(name='input', input_shape=(784,))
+    model.add_node(Dense(50, activation='relu'), name='d1', input='input')
+    model.add_node(Dense(10, activation='softmax'), name='d2', input='d1')
    model.add_output(name='output', input='d2')
    return model

@@ -18,9 +18,9 @@ y_test = to_categorical(y_test)

 def get_model(input_dim, nb_hidden, output_dim):
    model = Sequential()
-    model.add(Dense(input_dim, nb_hidden))
+    model.add(Dense(nb_hidden, input_shape=(input_dim,)))
    model.add(Activation('relu'))
-    model.add(Dense(nb_hidden, output_dim))
+    model.add(Dense(output_dim))
    model.add(Activation('softmax'))
    return model

@@ -35,9 +35,9 @@ test_ids = np.where(y_test == np.array(weighted_class))[0]

 def create_model(weight_reg=None, activity_reg=None):
    model = Sequential()
-    model.add(Dense(784, 50))
+    model.add(Dense(50, input_shape=(784,)))
    model.add(Activation('relu'))
-    model.add(Dense(50, 10, W_regularizer=weight_reg, activity_regularizer=activity_reg))
+    model.add(Dense(10, W_regularizer=weight_reg, activity_regularizer=activity_reg))
    model.add(Activation('softmax'))
    return model

@@ -30,9 +30,9 @@ class TestSequential(unittest.TestCase):
    def test_sequential(self):
        print('Test sequential')
        model = Sequential()
-        model.add(Dense(input_dim, nb_hidden))
+        model.add(Dense(nb_hidden, input_shape=(input_dim,)))
        model.add(Activation('relu'))
-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

@@ -57,9 +57,9 @@ class TestSequential(unittest.TestCase):
        print('test weight saving')
        model.save_weights('temp.h5', overwrite=True)
        model = Sequential()
-        model.add(Dense(input_dim, nb_hidden))
+        model.add(Dense(nb_hidden, input_shape=(input_dim,)))
        model.add(Activation('relu'))
-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
        model.load_weights('temp.h5')
@@ -79,17 +79,17 @@ class TestSequential(unittest.TestCase):
    def test_merge_sum(self):
        print('Test merge: sum')
        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))

        right = Sequential()
-        right.add(Dense(input_dim, nb_hidden))
+        right.add(Dense(nb_hidden, input_shape=(input_dim,)))
        right.add(Activation('relu'))

        model = Sequential()
        model.add(Merge([left, right], mode='sum'))

-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -113,14 +113,14 @@ class TestSequential(unittest.TestCase):
        print('test weight saving')
        model.save_weights('temp.h5', overwrite=True)
        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))
        right = Sequential()
-        right.add(Dense(input_dim, nb_hidden))
+        right.add(Dense(nb_hidden, input_shape=(input_dim,)))
        right.add(Activation('relu'))
        model = Sequential()
        model.add(Merge([left, right], mode='sum'))
-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))
        model.load_weights('temp.h5')
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -132,17 +132,17 @@ class TestSequential(unittest.TestCase):
    def test_merge_concat(self):
        print('Test merge: concat')
        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))

        right = Sequential()
-        right.add(Dense(input_dim, nb_hidden))
+        right.add(Dense(nb_hidden, input_shape=(input_dim,)))
        right.add(Activation('relu'))

        model = Sequential()
        model.add(Merge([left, right], mode='concat'))

-        model.add(Dense(nb_hidden * 2, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -166,17 +166,17 @@ class TestSequential(unittest.TestCase):
        print('test weight saving')
        model.save_weights('temp.h5', overwrite=True)
        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))

        right = Sequential()
-        right.add(Dense(input_dim, nb_hidden))
+        right.add(Dense(nb_hidden, input_shape=(input_dim,)))
        right.add(Activation('relu'))

        model = Sequential()
        model.add(Merge([left, right], mode='concat'))

-        model.add(Dense(nb_hidden * 2, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -190,26 +190,26 @@ class TestSequential(unittest.TestCase):
        print('Test merge recursivity')

        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))

        right = Sequential()
-        right.add(Dense(input_dim, nb_hidden))
+        right.add(Dense(nb_hidden, input_shape=(input_dim,)))
        right.add(Activation('relu'))

        righter = Sequential()
-        righter.add(Dense(input_dim, nb_hidden))
+        righter.add(Dense(nb_hidden, input_shape=(input_dim,)))
        righter.add(Activation('relu'))

        intermediate = Sequential()
        intermediate.add(Merge([left, right], mode='sum'))
-        intermediate.add(Dense(nb_hidden, nb_hidden))
+        intermediate.add(Dense(nb_hidden))
        intermediate.add(Activation('relu'))

        model = Sequential()
        model.add(Merge([intermediate, righter], mode='sum'))

-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -240,13 +240,13 @@ class TestSequential(unittest.TestCase):
    def test_merge_overlap(self):
        print('Test merge overlap')
        left = Sequential()
-        left.add(Dense(input_dim, nb_hidden))
+        left.add(Dense(nb_hidden, input_shape=(input_dim,)))
        left.add(Activation('relu'))

        model = Sequential()
        model.add(Merge([left, left], mode='sum'))

-        model.add(Dense(nb_hidden, nb_class))
+        model.add(Dense(nb_class))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
@@ -276,6 +276,28 @@ class TestSequential(unittest.TestCase):
        print(nloss)
        assert(loss == nloss)

+    def test_count_params(self):
+        print('test count params')
+        input_dim = 20
+        nb_units = 10
+        nb_classes = 2
+
+        n = input_dim * nb_units + nb_units
+        n += nb_units * nb_units + nb_units
+        n += nb_units * nb_classes + nb_classes
+
+        model = Sequential()
+        model.add(Dense(nb_units, input_shape=(input_dim,)))
+        model.add(Dense(nb_units))
+        model.add(Dense(nb_classes))
+        model.add(Activation('softmax'))
+
+        self.assertEqual(n, model.count_params())
+
+        model.compile('sgd', 'binary_crossentropy')
+
+        self.assertEqual(n, model.count_params())
+

 if __name__ == '__main__':
    print('Test Sequential model')
@@ -0,0 +1,132 @@
+import unittest
+import numpy as np
+import theano
+from keras.utils.theano_utils import ndim_tensor
+from keras.layers.core import *
+from keras.layers.convolutional import *
+from keras.layers.recurrent import SimpleRNN
+
+
+def check_layer_output_shape(layer, input_data):
+    ndim = len(input_data.shape)
+    layer.input = ndim_tensor(ndim)
+    layer.set_input_shape(input_data.shape[1:])
+    expected_output_shape = layer.output_shape[1:]
+
+    function = theano.function([layer.input], [layer.get_output()])
+    output = function(input_data)[0]
+    assert output.shape[1:] == expected_output_shape
+
+
+class TestShapeInference(unittest.TestCase):
+    # ########
+    # # Core #
+    # ########
+    def test_Reshape(self):
+        layer = Reshape(dims=(2, 3))
+        input_data = np.random.random((2, 6))
+        check_layer_output_shape(layer, input_data)
+
+    def test_Permute(self):
+        layer = Permute(dims=(1, 3, 2))
+        input_data = np.random.random((2, 2, 4, 3))
+        check_layer_output_shape(layer, input_data)
+
+    def test_Flatten(self):
+        layer = Flatten()
+        input_data = np.random.random((2, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+    def test_RepeatVector(self):
+        layer = RepeatVector(2)
+        input_data = np.random.random((2, 2))
+        check_layer_output_shape(layer, input_data)
+
+    def test_Dense(self):
+        layer = Dense(3)
+        input_data = np.random.random((2, 2))
+        check_layer_output_shape(layer, input_data)
+
+    def test_TimeDistributedDense(self):
+        layer = TimeDistributedDense(2)
+        input_data = np.random.random((2, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+    #################
+    # Convolutional #
+    #################
+    def test_Convolution1D(self):
+        for border_mode in ['same', 'full', 'valid']:
+            for filter_length in [2, 3]:
+                for subsample_length in [1, 2]:
+                    if subsample_length > 1 and border_mode == 'same':
+                        continue
+                    for input_data_shape in [(2, 3, 2), (2, 4, 2)]:
+                        layer = Convolution1D(nb_filter=1, filter_length=filter_length,
+                                              border_mode=border_mode, subsample_length=subsample_length)
+                        input_data = np.random.random(input_data_shape)
+                        check_layer_output_shape(layer, input_data)
+
+    def test_Convolution2D(self):  # compare inferred vs. actual output shape over a parameter sweep
+        for border_mode in ['same', 'full', 'valid']:
+            for nb_row, nb_col in [(2, 1), (3, 2)]:  # non-square kernels: rows and cols differ
+                for subsample in [(1, 1), (2, 2)]:
+                    if (subsample[0] > 1 or subsample[1] > 1) and border_mode == 'same':
+                        continue  # strided 'same' convolutions are not covered by this sweep
+                    for input_data_shape in [(2, 1, 3, 3), (2, 1, 4, 4)]:
+                        layer = Convolution2D(nb_filter=1, nb_row=nb_row, nb_col=nb_col,
+                                              border_mode=border_mode, subsample=subsample)
+                        input_data = np.random.random(input_data_shape)
+                        check_layer_output_shape(layer, input_data)
+
+    def test_MaxPooling1D(self):
+        for ignore_border in [True, False]:
+            for stride in [1, 2]:
+                for pool_length in [1, 2]:
+                    for input_data_shape in [(2, 1, 3), (2, 1, 4)]:
+                        layer = MaxPooling1D(pool_length=pool_length, stride=stride, ignore_border=ignore_border)
+                        input_data = np.random.random(input_data_shape)
+                        check_layer_output_shape(layer, input_data)
+
+    def test_MaxPooling2D(self):
+        for ignore_border in [True, False]:
+            for stride in [(1, 1), (2, 2)]:
+                for pool_size in [(2, 2), (3, 3), (4, 4)]:
+                    for input_data_shape in [(2, 1, 3, 3), (2, 1, 4, 4), (2, 1, 5, 5), (2, 1, 6, 6)]:
+                        layer = MaxPooling2D(pool_size=pool_size, stride=stride, ignore_border=ignore_border)
+                        input_data = np.random.random(input_data_shape)
+                        check_layer_output_shape(layer, input_data)
+
+    def test_UpSample1D(self):
+        layer = UpSample1D(length=2)
+        input_data = np.random.random((2, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+    def test_UpSample2D(self):
+        layer = UpSample2D(size=(2, 2))
+        input_data = np.random.random((2, 1, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+    def test_ZeroPadding1D(self):
+        layer = ZeroPadding1D(1)
+        input_data = np.random.random((2, 2, 1))
+        check_layer_output_shape(layer, input_data)
+
+    def test_ZeroPadding2D(self):
+        layer = ZeroPadding2D((1, 2))
+        input_data = np.random.random((2, 1, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+    # #############
+    # # Recurrent #
+    # #############
+    def test_SimpleRNN(self):
+        # all recurrent layers inherit output_shape
+        # from the same base recurrent layer
+        layer = SimpleRNN(2)
+        input_data = np.random.random((2, 2, 3))
+        check_layer_output_shape(layer, input_data)
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -11,7 +11,7 @@ from keras.utils.np_utils import to_categorical
 import unittest


-class TestRegularizers(unittest.TestCase):
+class TestTasks(unittest.TestCase):
    def test_vector_clf(self):
        nb_hidden = 10

@@ -27,9 +27,9 @@ class TestRegularizers(unittest.TestCase):
        y_test = to_categorical(y_test)

        model = Sequential()
-        model.add(Dense(X_train.shape[-1], nb_hidden))
+        model.add(Dense(nb_hidden, input_shape=(X_train.shape[-1],)))
        model.add(Activation('relu'))
-        model.add(Dense(nb_hidden, y_train.shape[-1]))
+        model.add(Dense(y_train.shape[-1]))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), show_accuracy=True, verbose=2)
@@ -47,16 +47,16 @@ class TestRegularizers(unittest.TestCase):
        print('y_test:', y_test.shape)

        model = Sequential()
-        model.add(Dense(X_train.shape[-1], nb_hidden))
+        model.add(Dense(nb_hidden, input_shape=(X_train.shape[-1],)))
        model.add(Activation('tanh'))
-        model.add(Dense(nb_hidden, y_train.shape[-1]))
+        model.add(Dense(y_train.shape[-1]))
        model.compile(loss='hinge', optimizer='adagrad')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), verbose=2)
        self.assertTrue(history.history['val_loss'][-1] < 0.9)

    def test_temporal_clf(self):
        print('temporal classification data:')
-        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(5,10), 
+        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(3, 5),
                                                             classification=True, nb_class=2)
        print('X_train:', X_train.shape)
        print('X_test:', X_test.shape)
@@ -67,7 +67,7 @@ class TestRegularizers(unittest.TestCase):
        y_test = to_categorical(y_test)

        model = Sequential()
-        model.add(GRU(X_train.shape[-1], y_train.shape[-1]))
+        model.add(GRU(y_train.shape[-1], input_shape=(None, X_train.shape[-1])))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adadelta')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), show_accuracy=True, verbose=2)
@@ -75,7 +75,7 @@ class TestRegularizers(unittest.TestCase):

    def test_temporal_reg(self):
        print('temporal regression data:')
-        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(5, 10), output_shape=(2,),
+        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(3, 5), output_shape=(2,),
                                                             classification=False)
        print('X_train:', X_train.shape)
        print('X_test:', X_test.shape)
@@ -83,14 +83,14 @@ class TestRegularizers(unittest.TestCase):
        print('y_test:', y_test.shape)

        model = Sequential()
-        model.add(GRU(X_train.shape[-1], y_train.shape[-1]))
+        model.add(GRU(y_train.shape[-1], input_shape=(None, X_train.shape[-1])))
        model.compile(loss='hinge', optimizer='adam')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), verbose=2)
        self.assertTrue(history.history['val_loss'][-1] < 0.8)

    def test_seq_to_seq(self):
        print('sequence to sequence data:')
-        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(5, 10), output_shape=(5, 10),
+        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(3, 5), output_shape=(3, 5),
                                                             classification=False)
        print('X_train:', X_train.shape)
        print('X_test:', X_test.shape)
@@ -98,14 +98,14 @@ class TestRegularizers(unittest.TestCase):
        print('y_test:', y_test.shape)

        model = Sequential()
-        model.add(TimeDistributedDense(X_train.shape[-1], y_train.shape[-1]))
+        model.add(TimeDistributedDense(y_train.shape[-1], input_shape=(None, X_train.shape[-1])))
        model.compile(loss='hinge', optimizer='rmsprop')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), verbose=2)
-        self.assertTrue(history.history['val_loss'][-1] < 0.75)
+        self.assertTrue(history.history['val_loss'][-1] < 0.8)

    def test_img_clf(self):
        print('image classification data:')
-        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(3, 32, 32),
+        (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200, input_shape=(3, 8, 8),
                                                             classification=True, nb_class=2)
        print('X_train:', X_train.shape)
        print('X_test:', X_test.shape)
@@ -116,13 +116,14 @@ class TestRegularizers(unittest.TestCase):
        y_test = to_categorical(y_test)

        model = Sequential()
-        model.add(Convolution2D(32, 3, 32, 32))
+        model.add(Convolution2D(8, 8, 8, input_shape=(3, 8, 8)))
        model.add(Activation('sigmoid'))
        model.add(Flatten())
-        model.add(Dense(32, y_test.shape[-1]))
+        model.add(Dense(y_test.shape[-1]))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='sgd')
        history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16, validation_data=(X_test, y_test), show_accuracy=True, verbose=2)
+        print(history.history['val_acc'][-1])
        self.assertTrue(history.history['val_acc'][-1] > 0.9)


@@ -133,14 +133,14 @@ class DrawActivations(Callback):
 # model.add(Activation('softmax'))

 model = Sequential()
-model.add(Convolution2D(32, 1, 3, 3, border_mode='full')) 
+model.add(Convolution2D(32, 1, 3, 3, border_mode='full'))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

-model.add(Convolution2D(64, 32, 3, 3, border_mode='full')) 
+model.add(Convolution2D(64, 32, 3, 3, border_mode='full'))
 model.add(Activation('relu'))
-model.add(MaxPooling2D(poolsize=(2, 2)))
+model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))

 model.add(Flatten())
@@ -215,12 +215,22 @@ print("Test model checkpointer without validation data")
 import warnings
 warnings.filterwarnings('error')
 try:
+    passed = False
    # this should issue a warning
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True, verbose=0, callbacks =[checkpointer])
 except:
-    print("Tests passed")
-    import sys
-    sys.exit(0)
+    passed = True
+if not passed:
+    raise Exception("Modelcheckpoint tests did not pass")

-raise Exception("Modelcheckpoint tests did not pass")
+print("Test model checkpointer with pattern")
+filename = "model_weights.{epoch:04d}.hdf5"
+f = os.path.join(path, filename)
+nb_epoch = 3
+checkpointer = cbks.ModelCheckpoint(f)
+model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=0, callbacks=[checkpointer])
+for i in range(nb_epoch):
+    if not os.path.isfile(f.format(epoch=i)):
+        raise Exception("Model weights were not saved separately for each epoch")

+print("Tests passed")
@@ -8,7 +8,7 @@ import keras.utils.layer_utils as layer_utils
 print('-- Sequential model')
 left = Sequential()
 left.add(Convolution2D(32, 1, 3, 3, border_mode='valid'))
-left.add(MaxPooling2D(poolsize=(2, 2)))
+left.add(MaxPooling2D(pool_size=(2, 2)))
 left.add(Flatten())
 left.add(Dense(32 * 13 * 13, 50))
 left.add(Activation('relu'))
Autor	SHA1	Mensagem	Data
Francois Chollet	d2189fef32	Update version number: now 0.2.0	2015-10-10 17:51:59 -07:00
François Chollet	0a35173b33	Merge pull request #814 from matsuyamax/shapeinfer Fix variable sharing issue with NonNeg constraint.	2015-10-10 17:17:41 -07:00
Makoto Matsuyama	5d2f3101ae	Fix variable sharing issue with NonNeg constraint.	2015-10-10 16:50:33 -07:00
François Chollet	73aac1c7c9	Merge pull request #813 from hedeon/keras_hq Fix: added "Permute" into imports of line 8: keras/keras/utils/layer_utils.py	2015-10-10 13:09:37 -07:00
Haizhou Qu	815f7064a2	Fix: added "Permute" into imports	2015-10-10 20:50:26 +01:00
François Chollet	63f81cafbd	Merge pull request #808 from transcranial/typo-fixes fixes for new API: Embedding layer, additional examples	2015-10-09 22:47:43 -07:00
Leon Chen	4c1a6fc27e	fix Embedding config property to include new input_length property, add input_length to all examples with Embedding layer	2015-10-09 16:35:48 -04:00
Leon Chen	9dbf04b699	poolsize -> pool_size	2015-10-09 16:32:34 -04:00
François Chollet	856a99de6c	Merge pull request #804 from transcranial/conv2d-bugfix correct image_shape and filter_shape parameters in Convolution2D	2015-10-08 16:11:44 -07:00
Leon Chen	f463d23b38	correct image_shape and filter_shape parameters in Convolution2D	2015-10-08 16:54:15 -04:00
François Chollet	775719983f	Merge pull request #791 from matsuyamax/shapeinfer Add automatic shape inference.	2015-10-06 20:26:20 -07:00
Makoto Matsuyama	e839a4bdac	Py3 compatibility	2015-10-05 17:42:35 -07:00
Makoto Matsuyama	cfd2763514	Simplify test_tasks	2015-10-05 17:04:50 -07:00
Makoto Matsuyama	0b8a52e463	Update documentation to new API.	2015-10-05 16:28:17 -07:00
Makoto Matsuyama	cb77f7d7e2	Incorporate image_shape and filter_shape in convs	2015-10-05 13:02:31 -07:00
Makoto Matsuyama	e8e56d9013	Add shape inference to Graph containers	2015-10-05 12:01:24 -07:00
Makoto Matsuyama	c60e2dfbdb	Update (most) automated tests.	2015-10-05 07:09:44 -07:00
François Chollet	9807dcd69b	Merge pull request #775 from stephenbalaban/feature/customlayers First crack at threading `custom_layers` through.	2015-10-05 05:55:27 -07:00
Makoto Matsuyama	2bd4c295d6	Update all examples with new API	2015-10-04 18:44:49 -07:00
Makoto Matsuyama	35d66d672b	Add shape inference to existing layers	2015-10-04 16:45:01 -07:00
Stephen A. Balaban	11eaaeb695	Rm `custom_layers` argument from `get_from_module` Removed trailing whitespace in generic_utils. Shoved variables from `custom_layers` into globals().	2015-10-04 15:57:43 -07:00
Stephen A. Balaban	cc1251b307	Merge branch 'master' of github.com:fchollet/keras	2015-10-04 15:44:36 -07:00
Makoto Matsuyama	4564dab62a	Allow any layer to accept a 'input_shape' kwarg.	2015-10-04 14:26:12 -07:00
Makoto Matsuyama	0e62ae4eaa	fix merge conflict	2015-10-04 13:41:45 -07:00
François Chollet	876bca046f	Merge pull request #780 from matsuyamax/master Fix compatibility with old MaxPooling interface	2015-10-04 13:00:01 -07:00
Makoto Matsuyama	d7e0ba1c39	Fix	2015-10-04 12:49:50 -07:00
Makoto Matsuyama	e0bcee4963	Revert default stride in MaxPooling to None	2015-10-04 12:28:22 -07:00
Makoto Matsuyama	ca4fc2e72f	Fix compatibility with old MaxPooling interface	2015-10-04 12:26:48 -07:00
François Chollet	83544cdb41	Merge pull request #777 from matsuyamax/shapeinfer Add .output_shape attribute in all layers (+tests)	2015-10-04 10:46:56 -07:00
Makoto Matsuyama	37978fcda6	fix merge conflict	2015-10-04 06:57:52 -07:00
Makoto Matsuyama	61d76d4a07	fix merge conflict	2015-10-04 06:39:40 -07:00
Makoto Matsuyama	cc6280f34d	Fix tests	2015-10-04 06:30:45 -07:00
François Chollet	5bab11eec7	Merge pull request #778 from transcranial/conv1d-cudnn Use cuDNN in Convolution1D layer if available	2015-10-04 06:29:44 -07:00
Leon Chen	65b048455b	use cuDNN in Convolution1D layer if available	2015-10-04 01:52:39 -04:00
Makoto Matsuyama	9be4480eab	Add ZeroPadding1D, refactor ZeroPadding2D	2015-10-03 22:16:14 -07:00
Makoto Matsuyama	7219bb4b96	Change API of Reshape layer	2015-10-03 22:15:53 -07:00
Makoto Matsuyama	fd2c6dbafd	in MaxPooling layers: poolsize -> pool_size	2015-10-03 21:52:35 -07:00
Makoto Matsuyama	19c736a4ca	Remove print statement, py3 compatibility.	2015-10-03 19:54:46 -07:00
François Chollet	b9bf954f24	Merge pull request #732 from matsuyamax/master Update relu to use Theano's implementation	2015-10-03 19:27:18 -07:00
Makoto Matsuyama	0bc7b25f59	Upgrade Theano in Travis config	2015-10-03 18:20:00 -07:00
Makoto Matsuyama	9f47903daf	Merge remote-tracking branch 'upstream/master'	2015-10-03 18:18:18 -07:00
Makoto Matsuyama	7f1eb97000	Remove whitespace, useless comment	2015-10-03 18:15:46 -07:00
Makoto Matsuyama	c506fbda4a	Add .output_shape attribute in all layers (+tests)	2015-10-03 17:08:28 -07:00
Stephen A. Balaban	aa05c44145	Merge branch 'master' of github.com:fchollet/keras	2015-10-03 12:58:09 -07:00
Stephen A. Balaban	2e0d96d1a2	Merge branch 'bugfix/reshape'	2015-10-03 12:56:26 -07:00
François Chollet	6b62678e90	Merge pull request #619 from amitbeka/non-overwrite-checkpoint support for multiple files in ModelCheckpoint	2015-10-03 12:10:55 -07:00
François Chollet	cc8a901c31	Merge pull request #706 from nehz/nehz-subsample Should subsample Convolution1D on correct axis	2015-10-03 12:07:06 -07:00
François Chollet	7c44d16a77	Merge pull request #771 from stephenbalaban/bugfix/reshape Bugfix/reshape	2015-10-03 10:10:03 -07:00
Stephen A. Balaban	ee07e6ef74	Added kwargs to Reshape.	2015-10-02 17:00:23 -07:00
Stephen A. Balaban	88a0ab5e93	First crack at threading `custom_layers` through. A bit surprised that keras was using globals() to access layers (doesn't work across modules.) Hacky solution was to pass a dict mapping name -> class. I called this dict `custom_layers`. Is there a better way of doing this that I'm not seeing?	2015-10-02 13:31:19 -07:00
Stephen A. Balaban	57bb9e2613	Added **kwargs to to_json and to_yaml. This allows you to do nice things like save JSON models so that they're human readable & editable. For example: >>> with open('output.json', 'w') as f: ... f.write(model.to_json(indent=4, sort_keys=True)) ...	2015-10-02 10:59:13 -07:00
François Chollet	c1857cfa66	Merge pull request #757 from matsuyamax/cnn_fix Fix Reshape and Permute deserialization	2015-09-30 22:18:31 -07:00
Makoto Matsuyama	2d8307622d	Fix Reshape and Permute deserialization	2015-09-30 21:16:59 -07:00
François Chollet	af932d3480	Merge pull request #752 from jfsantos/patch-4 Fix typo in docstring	2015-09-30 09:07:18 -07:00
João Felipe Santos	4ed53ae5a4	Fix typo in docstring longuest -> longest	2015-09-30 11:47:38 -04:00
François Chollet	0d798c662b	Merge pull request #749 from nebw/fix-sample-weight-doc add class_weight/sample_weight parameters to doc #736	2015-09-29 20:56:29 -07:00
Benjamin Wild	5f4675bd7f	add class_weight/sample_weight parameters to doc #736	2015-09-29 16:51:32 +02:00
François Chollet	14b175c9b0	Merge pull request #745 from blackyang/doc_sample_weight change sample_weight doc	2015-09-28 22:19:04 -07:00
Xiao Yang	8d4e75894a	change sample_weight doc	2015-09-29 00:53:13 -04:00
François Chollet	3d888cbf7e	Merge pull request #739 from matsuyamax/cnn_fix Fix bug in deserialization of convolutional layers	2015-09-28 08:22:14 -07:00
Makoto Matsuyama	3b76158c49	Fix bug in deserialization of convolutional layers	2015-09-27 21:49:20 -07:00
François Chollet	788d838160	Merge pull request #738 from matsuyamax/graph_fix Fix bug with Graph sample_weights	2015-09-27 21:39:16 -07:00
Makoto Matsuyama	56ae624f12	Fix bug with Graph sample_weights	2015-09-27 20:50:32 -07:00
François Chollet	ef43a271ee	Merge pull request #714 from eulerreich/patch-1 fixed incorrect comment	2015-09-27 10:48:18 -07:00
François Chollet	0b1a1e9761	Merge pull request #734 from EderSantana/master Fix order of signs of clipvalue	2015-09-26 14:48:49 -07:00
EderSantana	52e3e2623a	Merge branch 'master' of https://github.com/fchollet/keras	2015-09-26 17:43:26 -04:00
EderSantana	46a2fb6fd8	Fix sign order for clipvalue	2015-09-26 17:42:56 -04:00
Makoto Matsuyama	b0f2446370	Fix relu	2015-09-25 23:55:25 -07:00
Makoto Matsuyama	7a2e8ce8a2	Update relu to use Theano's implementation	2015-09-25 23:21:35 -07:00
François Chollet	200948c3be	Merge pull request #730 from eulerreich/patch-2 minor typo	2015-09-25 20:45:32 -07:00
François Chollet	35612d698a	Merge pull request #731 from eulerreich/patch-3 minor typos	2015-09-25 20:45:15 -07:00
eulerreich	8a5767a53e	minor typos	2015-09-25 22:18:53 -05:00
eulerreich	f4ca4026a3	minor typo	2015-09-25 22:11:57 -05:00
François Chollet	e4d0ed5992	Merge pull request #719 from farizrahman4u/patch-1 Update skipgram_word_embeddings.py	2015-09-25 19:52:26 -07:00
François Chollet	1325e73a59	Merge pull request #729 from EderSantana/master Clip value as in Neural Turing Machines paper	2015-09-25 19:49:23 -07:00
EderSantana	b6d8e9dd4e	Fix clip value logic	2015-09-25 22:12:15 -04:00
EderSantana	69afdd7ec4	Add clip value as in Neural Turing Machines Instead of norm clipping they do an elementwise clip. I believe others may want to try that out too.	2015-09-25 22:10:27 -04:00
farizrahman4u	d5cd2687ed	Update skipgram_word_embeddings.py Redundant code line 159 and 161	2015-09-25 11:14:53 +05:30
François Chollet	ca60201fe5	Merge pull request #690 from EderSantana/master Add merge_mode join	2015-09-24 21:16:22 -07:00
EderSantana	dd6697738b	Raise error if using merge_mode= with unnamed input	2015-09-24 18:14:04 -04:00
EderSantana	cccc118225	Raise error if using merge_mode= with unnamed input	2015-09-24 18:12:21 -04:00
eulerreich	36578f8569	fixed incorrect comment	2015-09-23 12:30:48 -05:00
François Chollet	c18a9cd405	Merge pull request #684 from jmhessel/mergefixes Added averaging support in merge and a TimeDistributedMerge layer	2015-09-22 20:12:26 -07:00
Jack Hessel	cba5cfa597	Added a very quick config unit test	2015-09-22 16:50:02 -04:00
EderSantana	b2048d1d88	Merge branch 'master' of https://github.com/fchollet/keras	2015-09-22 11:43:42 -04:00
EderSantana	8bfafd6d7f	Merge join returns OrderedDict instead of list This makes merge_mode='join' compliant with keras API. Also, the OrderedDict allows the user to simply call .values() and use it as a list if he knows in which order the inputs were merged.	2015-09-22 11:37:49 -04:00
Zhen Wang	a6521de3e3	Should subsample Convolution1D on correct axis	2015-09-21 11:59:40 +08:00
Zhen Wang	02ddc11858	Merge pull request #1 from fchollet/master Update	2015-09-21 11:58:23 +08:00
François Chollet	588261acfc	Merge pull request #704 from rodrigob/patch-4 Add a bit of flexibility in Progbar.update	2015-09-20 12:33:31 -07:00
François Chollet	61a48d487f	Merge pull request #696 from rodrigob/patch-3 "Epoch %d out of %d"	2015-09-20 12:26:49 -07:00
Rodrigo Benenson	eee20b4614	Update callbacks.py fixed +1	2015-09-20 21:23:43 +02:00
Rodrigo Benenson	9827db2c85	Update callbacks.py following suggestions	2015-09-20 21:18:42 +02:00
Rodrigo Benenson	b9403cb262	Add a bit of flexibility in Progbar.update By allowing sum_values[k] to be other things than lists, it makes it easier for children classes to print "any value" (in my case, a timedelta object).	2015-09-20 15:26:27 +02:00
François Chollet	e379fff425	Merge pull request #697 from ndronen/count-params Parameter counting method for models.	2015-09-18 07:45:44 -07:00
Nicholas Dronen	80c0c762fd	Add count_params method to keras.layers.core.Layer and the Sequential and Graph container classes.	2015-09-17 09:19:21 -06:00
Rodrigo Benenson	51818e5b7b	"Epoch %d out of %d" Print "Epoch %d out of %d" instead of just "Epoch %d"	2015-09-17 15:39:56 +02:00
François Chollet	393642df55	Merge pull request #691 from grahamannett/master added visualization tools to view Sequential and Graph models	2015-09-16 20:52:41 -07:00
graham	6bb9eecd0c	added functioning visualization	2015-09-16 00:58:44 -07:00
graham	f026bb2f5a	added functioning visualization	2015-09-16 00:10:38 -07:00
EderSantana	5c3db2fea6	Add merge_mode join	2015-09-15 21:52:54 -04:00
Jack Hessel	1a953feaf7	added averaging support in merge and a TimeDistributedMerge layer	2015-09-14 13:27:59 -04:00
François Chollet	0733a80297	Merge pull request #677 from jnphilipp/master Fixed Python 3 Image loading. Closed #676	2015-09-11 19:23:45 -07:00
jnphilipp	a5653c245a	Fixed Python 3 Image loading. Closed #676	2015-09-11 22:39:07 +02:00
François Chollet	1724fe5882	Merge pull request #662 from gw0/feat-optional-h5py Remove h5py requirement and made it optional.	2015-09-09 07:58:50 -07:00
gw0 [http://gw.tnode.com/]	a582b184c9	Remove h5py requirement and made it optional.	2015-09-08 17:56:44 +02:00
François Chollet	36ef1ca7b4	Merge pull request #661 from jfsantos/patch-3 Updated documentation of Merge layer	2015-09-08 08:42:38 -07:00
João Felipe Santos	27edefe48c	Updated documentation of Merge layer Added 'mul' mode documentation to Merge.	2015-09-08 10:50:11 -04:00
François Chollet	4b1b86783f	Merge pull request #659 from amitbeka/support-saving-masking add Masking layer to utils.layer_utils for saving/loading	2015-09-08 07:46:45 -07:00
François Chollet	7009e80b74	Merge pull request #660 from amitbeka/fix-saving-sgd fix SGD get_config	2015-09-08 07:46:22 -07:00
Amit Beka	cd82deb152	fix SGD get_config Signed-off-by: Amit Beka <amit.beka@gmail.com>	2015-09-08 17:04:24 +03:00
Amit Beka	65b794957f	add Masking layer to utils.layer_utils for saving/loading Signed-off-by: Amit Beka <amit.beka@gmail.com>	2015-09-08 16:39:06 +03:00
Francois Chollet	7b4e6ef50c	Fix typo in FAQ	2015-09-07 20:50:40 -07:00
Francois Chollet	f804b19fdc	Fix typos in FAQ	2015-09-07 20:48:37 -07:00
Francois Chollet	eff8731db4	Fixes in doc FAQ	2015-09-07 18:20:39 -07:00
Francois Chollet	43ddbf4a4f	Add Keras FAQ	2015-09-07 17:11:38 -07:00
Francois Chollet	c5b3959b42	Fix test_tasks	2015-09-07 15:36:38 -07:00
Francois Chollet	289804c67c	Fix theano.tensor.signal import issue	2015-09-07 15:16:06 -07:00
Francois Chollet	c6825eb343	Style fixes	2015-09-07 15:06:37 -07:00
Francois Chollet	92b8ad9d02	Merge branch 'master' of https://github.com/Mofef/keras into Mofef-master	2015-09-07 15:02:30 -07:00
Francois Chollet	3dfba0504b	Merge branch 'master' of https://github.com/fchollet/keras	2015-09-07 13:59:56 -07:00
François Chollet	4bdb43f244	Merge pull request #639 from rodrigob/patch-1 Added reference for orthogonal initialization	2015-09-07 13:12:46 -07:00
Francois Chollet	83e285fd00	Add on_gpu() check	2015-09-07 13:05:45 -07:00
Francois Chollet	4e1ec93c2f	Fix weight saving in BatchNormalization	2015-09-07 12:50:34 -07:00
François Chollet	2224c4cc1e	Merge pull request #654 from phreeza/travis-coveralls-support add travis CI and coveralls support	2015-09-07 10:40:55 -07:00
François Chollet	9f6f206ccd	Merge pull request #647 from phreeza/test_convolutional Add tests for convolutional layers	2015-09-07 10:40:35 -07:00
Francois Chollet	f3eeb982d0	Avoid dnn import when not running on GPU	2015-09-07 10:32:13 -07:00
Moritz Münst	2be651dc39	concatenation axis as param for Merge() and Graph.add_output/node()	2015-09-07 19:36:51 +03:00
Thomas McColgan	c77ded2eb6	add travis CI and coveralls support	2015-09-07 16:28:03 +02:00
François Chollet	06ab8dbd34	Merge pull request #650 from rodrigob/patch-2 Fix cPickle import for python3 support	2015-09-06 17:27:33 -07:00
François Chollet	8e293db9b5	Merge pull request #644 from anjishnu/issue_643 Fixed import errors with six.moves.cPickle and model.train typo in th…	2015-09-06 12:42:11 -07:00
Rodrigo Benenson	5040aa386d	Fix cPickle import for python3 support	2015-09-06 15:58:30 +02:00
Thomas McColgan	8e67b040e8	add tests for border_mode == same	2015-09-06 13:03:18 +02:00
Thomas McColgan	84909a49c2	add upsampling layer tests	2015-09-06 12:55:50 +02:00
Thomas McColgan	0969c569a6	add convolutional layer tests	2015-09-06 12:17:24 +02:00
Francois Chollet	cb8f0a83e6	Merge branch 'jfsantos-merge_mul'	2015-09-05 17:49:55 -07:00
Francois Chollet	5648119b66	Remove outdated Merge exception	2015-09-05 17:49:27 -07:00
Francois Chollet	25e9b90550	Merge branch 'merge_mul' of https://github.com/jfsantos/keras into jfsantos-merge_mul	2015-09-05 17:45:13 -07:00
Anjishnu Kumar	e98b24a767	changed 'fit' to 'train_on_batch'	2015-09-05 14:19:40 -07:00
Anjishnu Kumar	034822359d	Fixed import errors with six.moves.cPickle and model.train typo in the skipgram embeddings example	2015-09-05 13:36:52 -07:00
François Chollet	2e60c99924	Merge pull request #642 from wuaalb/lr-scheduler Fix typo LearningRateScheduler	2015-09-05 04:38:26 -07:00
wuaalb	4bb6ac0b04	Fix typo LearningRateScheduler	2015-09-05 11:59:29 +02:00
François Chollet	c368b86d11	Merge pull request #640 from Smerity/master Removing magic numbers from MNIST and CIFAR10	2015-09-04 17:30:57 -07:00
Stephen Merity	49335d4345	Remove magic numbers from `cifar10_cnn.py` (fixes #469 )	2015-09-04 16:34:00 -07:00
Stephen Merity	93c1a8c675	Remove magic numbers from `mnist_cnn.py` (re: #469 )	2015-09-04 16:24:47 -07:00
Rodrigo Benenson	5f3bdeb0a3	Added reference for orthogonal initialization	2015-09-05 00:54:24 +02:00
François Chollet	ddf908359c	Merge pull request #637 from jnphilipp/master Fix for issue #636	2015-09-04 10:28:41 -07:00
jnphilipp	37f4d11ea9	Merge branch 'master' of github.com:jnphilipp/keras	2015-09-04 13:49:52 +02:00
jnphilipp	94fbbd1c7e	Fixed missing import. Closed #636	2015-09-04 13:44:26 +02:00
Francois Chollet	332d43e023	Make Pmat a param of JSZ1-2	2015-09-02 20:18:28 -07:00
Francois Chollet	f84fe7ce17	Change cPickle import pattern in datasets	2015-09-02 20:15:14 -07:00
Joao Felipe Santos	16d0e40560	Updated 'mul' mode to support multiple layers	2015-08-31 21:17:09 -04:00
Amit Beka	da24be79ab	support for multiple files in ModelCheckpoint enable string formatted filenames (e.g. weights.{epoch:02d}.hdf5), so every epoch will be saved to a different file without overwriting. Signed-off-by: Amit Beka <amit.beka@gmail.com>	2015-08-31 11:25:06 +03:00
Joao Felipe Santos	ab8642e0ff	Added element-wise multiplication as merge mode	2015-08-29 13:24:54 -04:00