Prepare 0.3.3 release (last release before 1.0).

Merge pull request #2109 from the-moliver/sign
add sign operation to backends
2016-03-31 11:15:44 -07:00 · 2016-03-31 11:07:02 -07:00 · 2016-03-31 11:04:11 -07:00 · 2016-03-29 20:13:45 +02:00 · 2016-03-28 19:43:49 +00:00 · 2016-03-28 09:49:57 -07:00
@@ -0,0 +1,9 @@
+Please make sure that the boxes below are checked before you submit your issue. Thank you!
+
+- [ ] Check that you are up-to-date with the master branch of Keras. You can update with:
+pip install git+git://github.com/fchollet/keras.git --upgrade --no-deps
+
+- [ ] If running on Theano, check that you are up-to-date with the master branch of Theano. You can update with:
+pip install git+git://github.com/Theano/Theano.git --upgrade --no-deps
+
+- [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short).
@@ -124,19 +124,23 @@ def process_class_docstring(docstring):
    docstring = re.sub(r'    ([^\s\\]+):(.*)\n',
                       r'    - __\1__:\2\n',
                       docstring)
+
+    docstring = docstring.replace('    ' * 5, '\t\t')
    docstring = docstring.replace('    ' * 3, '\t')
    docstring = docstring.replace('    ', '')
    return docstring


 def process_method_docstring(docstring):
-    docstring = re.sub(r'    # (.*)\n',
-                       r'    __\1__\n\n',
+    docstring = re.sub(r'\n        # (.*)\n',
+                       r'\n        __\1__\n\n',
                       docstring)

    docstring = re.sub(r'    ([^\s\\]+):(.*)\n',
                       r'    - __\1__:\2\n',
                       docstring)
+
+    docstring = docstring.replace('    ' * 6, '\t\t')
    docstring = docstring.replace('    ' * 4, '\t')
    docstring = docstring.replace('    ', '')
    return docstring
@@ -14,11 +14,13 @@ is equivalent to:
 model.add(Dense(64, activation='tanh'))
 ```

-You can also pass an element-wise Theano function as an activation:
+You can also pass an element-wise Theano/TensorFlow function as an activation:

 ```python
+from keras import backend as K
+
 def tanh(x):
-    return theano.tensor.tanh(x)
+    return K.tanh(x)

 model.add(Dense(64, activation=tanh))
 model.add(Activation(tanh))
@@ -36,4 +38,4 @@ model.add(Activation(tanh))

 ## On Advanced Activations

-Activations that are more complex than a simple Theano function (eg. learnable activations, configurable activations, etc.) are available as [Advanced Activation layers](layers/advanced_activations.md), and can be found in the module `keras.layers.advanced_activations`. These include PReLU and LeakyReLU.
+Activations that are more complex than a simple Theano/TensorFlow function (e.g. learnable activations, configurable activations, etc.) are available as [Advanced Activation layers](layers/advanced_activations.md), and can be found in the module `keras.layers.advanced_activations`. These include PReLU and LeakyReLU.
@@ -1,6 +1,18 @@

 Here are a few examples to get you started!

+In the examples folder, you will also find example models for real datasets:
+
+- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
+- IMDB movie review sentiment classification: LSTM over sequences of words
+- Reuters newswires topic classification: Multilayer Perceptron (MLP)
+- MNIST handwritten digits classification: MLP & CNN
+- Character-level text generation with LSTM
+
+...and more.
+
+------------------
+
 ### Multilayer Perceptron (MLP) for multi-class softmax classification:

 ```python
@@ -32,6 +44,8 @@ model.fit(X_train, y_train,
 score = model.evaluate(X_test, y_test, batch_size=16)
 ```

+------------------
+
 ### Alternative implementation of a similar MLP:

 ```python
@@ -45,6 +59,7 @@ model.add(Dense(10, activation='softmax'))
 model.compile(loss='categorical_crossentropy', optimizer='adadelta')
 ```

+------------------

 ### MLP for binary classification:
 ```python
@@ -55,13 +70,12 @@ model.add(Dense(64, activation='relu'))
 model.add(Dropout(0.5))
 model.add(Dense(1, activation='sigmoid'))

-# "class_mode" defaults to "categorical". For correctly displaying accuracy
-# in a binary classification problem, it should be set to "binary".
 model.compile(loss='binary_crossentropy',
-              optimizer='rmsprop',
-              class_mode='binary')
+              optimizer='rmsprop')
 ```

+------------------
+
 ### VGG-like convnet:

 ```python
@@ -103,6 +117,8 @@ model.fit(X_train, Y_train, batch_size=32, nb_epoch=1)

 ```

+------------------
+
 ### Sequence classification with LSTM:

 ```python
@@ -167,9 +183,10 @@ image_model.add(RepeatVector(max_caption_len))

 # the output of both models will be tensors of shape (samples, max_caption_len, 128).
 # let's concatenate these 2 vector sequences.
-model = Merge([image_model, language_model], mode='concat', concat_axis=-1)
+model = Sequential()
+model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
 # let's encode this vector sequence into a single vector
-model.add(GRU(256, 256, return_sequences=False))
+model.add(GRU(256, return_sequences=False))
 # which will be used to compute a probability
 # distribution over what the next word in the caption should be!
 model.add(Dense(vocab_size))
@@ -186,12 +203,188 @@ model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
 model.fit([images, partial_captions], next_words, batch_size=16, nb_epoch=100)
 ```

-In the examples folder, you will find example models for real datasets:
+------------------

- CIFAR10 small images classification: Convolutional Neural Network (CNN) with realtime data augmentation
- IMDB movie review sentiment classification: LSTM over sequences of words
- Reuters newswires topic classification: Multilayer Perceptron (MLP)
- MNIST handwritten digits classification: MLP & CNN
- Character-level text generation with LSTM
+### Stacked LSTM for sequence classification

-...and more.
+In this model, we stack 3 LSTM layers on top of each other,
+making the model capable of learning higher-level temporal representations.
+
+The first two LSTMs return their full output sequences, but the last one only returns
+the last step in its output sequence, thus dropping the temporal dimension
+(i.e. converting the input sequence into a single vector).
+
+<img src="http://keras.io/img/regular_stacked_lstm.png" alt="stacked LSTM" style="width: 300px;"/>
+
+(N.B.: in Keras, "None" in an input shape indicates a variable dimension. In the graph above, the batch size is "None",
+meaning that any batch size is allowed for the input data).
+
+```python
+from keras.models import Sequential
+from keras.layers import LSTM, Dense
+import numpy as np
+
+data_dim = 16
+timesteps = 8
+nb_classes = 10
+
+# expected input data shape: (batch_size, timesteps, data_dim)
+model = Sequential()
+model.add(LSTM(32, return_sequences=True,
+               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
+model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
+model.add(LSTM(32))  # returns a single vector of dimension 32
+model.add(Dense(10, activation='softmax'))
+
+model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+# generate dummy training data
+x_train = np.random.random((1000, timesteps, data_dim))
+y_train = np.random.random((1000, nb_classes))
+
+# generate dummy validation data
+x_val = np.random.random((100, timesteps, data_dim))
+y_val = np.random.random((100, nb_classes))
+
+model.fit(x_train, y_train,
+          batch_size=64, nb_epoch=5, show_accuracy=True,
+          validation_data=(x_val, y_val))
+```
+
+------------------
+
+### Same stacked LSTM model, rendered "stateful"
+
+A stateful recurrent model is one for which the internal states (memories) obtained after processing a batch
+of samples are reused as initial states for the samples of the next batch. This allows processing longer sequences
+while keeping computational complexity manageable.
+
+[You can read more about stateful RNNs in the FAQ.](/faq/#how-can-i-use-stateful-rnns)
+
+```python
+from keras.models import Sequential
+from keras.layers import LSTM, Dense
+import numpy as np
+
+data_dim = 16
+timesteps = 8
+nb_classes = 10
+batch_size = 32
+
+# expected input batch shape: (batch_size, timesteps, data_dim)
+# note that we have to provide the full batch_input_shape since the network is stateful.
+# the sample of index i in batch k is the follow-up for the sample i in batch k-1.
+model = Sequential()
+model.add(LSTM(32, return_sequences=True, stateful=True,
+               batch_input_shape=(batch_size, timesteps, data_dim)))
+model.add(LSTM(32, return_sequences=True, stateful=True))
+model.add(LSTM(32, stateful=True))
+model.add(Dense(10, activation='softmax'))
+
+model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+# generate dummy training data
+x_train = np.random.random((batch_size * 10, timesteps, data_dim))
+y_train = np.random.random((batch_size * 10, nb_classes))
+
+# generate dummy validation data
+x_val = np.random.random((batch_size * 3, timesteps, data_dim))
+y_val = np.random.random((batch_size * 3, nb_classes))
+
+model.fit(x_train, y_train,
+          batch_size=batch_size, nb_epoch=5, show_accuracy=True,
+          validation_data=(x_val, y_val))
+```
+
+------------------
+
+### Two merged LSTM encoders for classification over two parallel sequences
+
+In this model, two input sequences are encoded into vectors by two separate LSTM modules.
+
+These two vectors are then concatenated, and a fully connected network is trained on top of the concatenated representations.
+
+![Dual LSTM](http://keras.io/img/dual_lstm.png)
+
+```python
+from keras.models import Sequential
+from keras.layers import Merge, LSTM, Dense
+import numpy as np
+
+data_dim = 16
+timesteps = 8
+nb_classes = 10
+
+encoder_a = Sequential()
+encoder_a.add(LSTM(32, input_shape=(timesteps, data_dim)))
+
+encoder_b = Sequential()
+encoder_b.add(LSTM(32, input_shape=(timesteps, data_dim)))
+
+decoder = Sequential()
+decoder.add(Merge([encoder_a, encoder_b], mode='concat'))
+decoder.add(Dense(32, activation='relu'))
+decoder.add(Dense(nb_classes, activation='softmax'))
+
+decoder.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+# generate dummy training data
+x_train_a = np.random.random((1000, timesteps, data_dim))
+x_train_b = np.random.random((1000, timesteps, data_dim))
+y_train = np.random.random((1000, nb_classes))
+
+# generate dummy validation data
+x_val_a = np.random.random((100, timesteps, data_dim))
+x_val_b = np.random.random((100, timesteps, data_dim))
+y_val = np.random.random((100, nb_classes))
+
+decoder.fit([x_train_a, x_train_b], y_train,
+            batch_size=64, nb_epoch=5, show_accuracy=True,
+            validation_data=([x_val_a, x_val_b], y_val))
+```
+
+------------------
+
+### Single shared LSTM over two parallel sequences, for classification
+
+This setup is similar to the one above, but now a single LSTM encoder is used for both input sequences.
+Such a setup makes sense if the two input sequences are the same type of object.
+
+<img src="http://keras.io/img/shared_lstm.png" alt="Shared LSTM" style="width: 500px;"/>
+
+```python
+from keras.models import Sequential, Graph
+from keras.layers import LSTM, Dense
+import numpy as np
+
+data_dim = 16
+timesteps = 8
+nb_classes = 10
+
+encoder = Sequential()
+encoder.add(LSTM(32, input_shape=(timesteps, data_dim)))
+
+model = Graph()
+model.add_input(name='input_a', input_shape=(timesteps, data_dim))
+model.add_input(name='input_b', input_shape=(timesteps, data_dim))
+model.add_shared_node(encoder, name='shared_encoder', inputs=['input_a', 'input_b'],
+                      merge_mode='concat')
+model.add_node(Dense(64, activation='relu'), name='fc1', input='shared_encoder')
+model.add_node(Dense(3, activation='softmax'), name='output', input='fc1', create_output=True)
+
+model.compile(optimizer='adam', loss={'output': 'categorical_crossentropy'})
+
+# generate dummy training data
+x_train_a = np.random.random((1000, timesteps, data_dim))
+x_train_b = np.random.random((1000, timesteps, data_dim))
+y_train = np.random.random((1000, 3))
+
+# generate dummy validation data
+x_val_a = np.random.random((100, timesteps, data_dim))
+x_val_b = np.random.random((100, timesteps, data_dim))
+y_val = np.random.random((100, 3))
+
+model.fit({'input_a': x_train_a, 'input_b': x_train_b, 'output': y_train},
+          batch_size=64, nb_epoch=5,
+          validation_data={'input_a': x_val_a, 'input_b': x_val_b, 'output': y_val})
+```
@@ -1,5 +1,7 @@
 # Keras FAQ: Frequently Asked Keras Questions

+[How should I cite Keras?](#how-should-i-cite-keras)
+
 [How can I run Keras on GPU?](#how-can-i-run-keras-on-gpu)

 [How can I save a Keras model?](#how-can-i-save-a-keras-model)
@@ -22,8 +24,26 @@

 ---

+### How should I cite Keras?
+
+Please cite Keras in your publications if it helps your research. Here is an example BibTeX entry:
+
+```
+@misc{chollet2015keras,
+  author = {Chollet, François},
+  title = {Keras},
+  year = {2015},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/fchollet/keras}}
+}
+```
+
 ### How can I run Keras on GPU?

+If you are running on the TensorFlow backend, your code will automatically run on GPU if any available GPU is detected.
+If you are running on the Theano backend, you can use one of the following methods:
+
 Method 1: use Theano flags.
 ```bash
 THEANO_FLAGS=device=gpu,floatX=float32 python my_keras_script.py
@@ -67,7 +87,10 @@ model = model_from_json(json_string)
 model = model_from_yaml(yaml_string)
 ```

-If you need to save the weights of a model, you can do so in HDF5:
+If you need to save the weights of a model, you can do so in HDF5 with the code below.
+
+Note that you will first need to install HDF5 and the Python library h5py, which do not come bundled with Keras.
+
 ```python
 model.save_weights('my_model_weights.h5')
 ```
@@ -7,10 +7,10 @@ An objective function (or loss function, or optimization score function) is one
 model.compile(loss='mean_squared_error', optimizer='sgd')
 ```

-You can either pass the name of an existing objective, or pass a Theano symbolic function that returns a scalar for each data-point and takes the following two arguments:
+You can either pass the name of an existing objective, or pass a Theano/TensorFlow symbolic function that returns a scalar for each data-point and takes the following two arguments:

- __y_true__: True labels. Theano tensor.
- __y_pred__: Predictions. Theano tensor of the same shape as y_true.
+- __y_true__: True labels. Theano/TensorFlow tensor.
+- __y_pred__: Predictions. Theano/TensorFlow tensor of the same shape as y_true.

 The actual optimized objective is the mean of the output array across all datapoints.

@@ -10,6 +10,11 @@ from keras.utils.visualize_util import plot
 plot(model, to_file='model.png')
 ```

+`plot` takes two optional arguments:
+
+- `recursive` (defaults to True) controls whether we recursively explore container layers.
+- `show_shape` (defaults to False) controls whether output shapes are shown in the graph.
+
 You can also directly obtain the `pydot.Graph` object and render it yourself,
 for example to show it in an ipython notebook :
 ```python
@@ -66,7 +66,7 @@ batch_size = 128
 nb_classes = 10
 nb_epoch = 40

-# the data, shuffled and split between tran and test sets
+# the data, shuffled and split between train and test sets
 (X_train, y_train), (X_test, y_test) = mnist.load_data()

 X_train = X_train.reshape(60000, 784)
@@ -18,7 +18,7 @@ from keras.models import Sequential
 from keras.layers.embeddings import Embedding
 from keras.layers.core import Activation, Dense, Merge, Permute, Dropout
 from keras.layers.recurrent import LSTM
-from keras.datasets.data_utils import get_file
+from keras.utils.data_utils import get_file
 from keras.preprocessing.sequence import pad_sequences
 from functools import reduce
 import tarfile
@@ -7,8 +7,8 @@ http://arxiv.org/abs/1502.05698

 Task Number                  | FB LSTM Baseline | Keras QA
 ---                          | ---              | ---
-QA1 - Single Supporting Fact | 50               | 52.1
-QA2 - Two Supporting Facts   | 20               | 37.0
+QA1 - Single Supporting Fact | 50               | 100.0
+QA2 - Two Supporting Facts   | 20               | 50.0
 QA3 - Three Supporting Facts | 20               | 20.5
 QA4 - Two Arg. Relations     | 61               | 62.9
 QA5 - Three Arg. Relations   | 70               | 61.9
@@ -34,8 +34,8 @@ https://research.facebook.com/researchers/1543934539189348
 Notes:

 - With default word, sentence, and query vector sizes, the GRU model achieves:
-  - 52.1% test accuracy on QA1 in 20 epochs (2 seconds per epoch on CPU)
-  - 37.0% test accuracy on QA2 in 20 epochs (16 seconds per epoch on CPU)
+  - 100% test accuracy on QA1 in 20 epochs (2 seconds per epoch on CPU)
+  - 50% test accuracy on QA2 in 20 epochs (16 seconds per epoch on CPU)
 In comparison, the Facebook paper achieves 50% and 20% for the LSTM baseline.

 - The task does not traditionally parse the question separately. This likely
@@ -64,9 +64,9 @@ import tarfile
 import numpy as np
 np.random.seed(1337)  # for reproducibility

-from keras.datasets.data_utils import get_file
+from keras.utils.data_utils import get_file
 from keras.layers.embeddings import Embedding
-from keras.layers.core import Dense, Merge
+from keras.layers.core import Dense, Merge, Dropout, RepeatVector
 from keras.layers import recurrent
 from keras.models import Sequential
 from keras.preprocessing.sequence import pad_sequences
@@ -138,12 +138,12 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
        Y.append(y)
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)

-RNN = recurrent.GRU
+RNN = recurrent.LSTM
 EMBED_HIDDEN_SIZE = 50
 SENT_HIDDEN_SIZE = 100
 QUERY_HIDDEN_SIZE = 100
 BATCH_SIZE = 32
-EPOCHS = 20
+EPOCHS = 40
 print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

 path = get_file('babi-tasks-v1-2.tar.gz', origin='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz')
@@ -178,15 +178,19 @@ print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))
 print('Build model...')

 sentrnn = Sequential()
-sentrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, mask_zero=True))
-sentrnn.add(RNN(SENT_HIDDEN_SIZE, return_sequences=False))
+sentrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, input_length=story_maxlen, mask_zero=True))
+sentrnn.add(Dropout(0.3))

 qrnn = Sequential()
-qrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE))
-qrnn.add(RNN(QUERY_HIDDEN_SIZE, return_sequences=False))
+qrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, input_length=query_maxlen))
+qrnn.add(Dropout(0.3))
+qrnn.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
+qrnn.add(RepeatVector(story_maxlen))

 model = Sequential()
-model.add(Merge([sentrnn, qrnn], mode='concat'))
+model.add(Merge([sentrnn, qrnn], mode='sum'))
+model.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
+model.add(Dropout(0.3))
 model.add(Dense(vocab_size, activation='softmax'))

 model.compile(optimizer='adam', loss='categorical_crossentropy', class_mode='categorical')
@@ -21,6 +21,7 @@ from scipy.optimize import fmin_l_bfgs_b
 import time
 import argparse
 import h5py
+import os

 from keras.models import Sequential
 from keras.layers.convolutional import Convolution2D, ZeroPadding2D, MaxPooling2D
@@ -215,9 +216,9 @@ for i in range(5):
    print('Start of iteration', i)
    start_time = time.time()

-    # add a random jitter to the initial image. This will be reverted at decoding time
-    random_jitter = (settings['jitter'] * 2) * (np.random.random((3, img_width, img_height)) - 0.5)
-    x += random_jitter
+    # add a random offset jitter to the initial image. This will be reverted at decoding time
+    ox, oy = np.random.randint(-settings['jitter'], settings['jitter']+1, 2)
+    x = np.roll(np.roll(x, ox, -1), oy, -2)

    # run L-BFGS for 7 steps
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
@@ -225,7 +226,7 @@ for i in range(5):
    print('Current loss value:', min_val)
    # decode the dream and save it
    x = x.reshape((3, img_width, img_height))
-    x -= random_jitter
+    x = np.roll(np.roll(x, -ox, -1), -oy, -2) # unshift image
    img = deprocess_image(x)
    fname = result_prefix + '_at_iteration_%d.png' % i
    imsave(fname, img)
@@ -55,7 +55,7 @@ model.compile('adam', {'output': 'binary_crossentropy'})
 print('Train...')
 model.fit({'input': X_train, 'output': y_train},
          batch_size=batch_size,
-          nb_epoch=4)
+          nb_epoch=4, show_accuracy=True)
 acc = accuracy(y_test,
               np.round(np.array(model.predict({'input': X_test},
                                               batch_size=batch_size)['output'])))
@@ -71,8 +71,7 @@ model.add(Dense(1))
 model.add(Activation('sigmoid'))

 model.compile(loss='binary_crossentropy',
-              optimizer='rmsprop',
-              class_mode='binary')
+              optimizer='rmsprop')
 model.fit(X_train, y_train, batch_size=batch_size,
          nb_epoch=nb_epoch, show_accuracy=True,
          validation_data=(X_test, y_test))
@@ -38,7 +38,7 @@ print('Loading data...')
 print(len(X_train), 'train sequences')
 print(len(X_test), 'test sequences')

-print("Pad sequences (samples x time)")
+print('Pad sequences (samples x time)')
 X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
 X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
 print('X_train shape:', X_train.shape)
@@ -46,19 +46,18 @@ print('X_test shape:', X_test.shape)

 print('Build model...')
 model = Sequential()
-model.add(Embedding(max_features, 128, input_length=maxlen))
-model.add(LSTM(128))  # try using a GRU instead, for fun
+model.add(Embedding(max_features, 128, input_length=maxlen, dropout=0.5))
+model.add(LSTM(128, dropout_W=0.5, dropout_U=0.1))  # try using a GRU instead, for fun
 model.add(Dropout(0.5))
 model.add(Dense(1))
 model.add(Activation('sigmoid'))

 # try using different optimizers and different optimizer configs
 model.compile(loss='binary_crossentropy',
-              optimizer='adam',
-              class_mode="binary")
+              optimizer='adam')

-print("Train...")
-model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
+print('Train...')
+model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, y_test), show_accuracy=True)
 score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size,
@@ -14,13 +14,19 @@ from __future__ import print_function
 from keras.models import Sequential
 from keras.layers.core import Dense, Activation, Dropout
 from keras.layers.recurrent import LSTM
-from keras.datasets.data_utils import get_file
+from keras.utils.data_utils import get_file
 import numpy as np
 import random
 import sys

 path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
-text = open(path).read().lower()
+
+try: 
+    text = open(path).read().lower()
+except UnicodeDecodeError:
+    import codecs
+    text = codecs.open(path, encoding='utf-8').read().lower()
+
 print('corpus length:', len(text))

 chars = set(text)
@@ -29,7 +29,7 @@ nb_pool = 2
 # convolution kernel size
 nb_conv = 3

-# the data, shuffled and split between tran and test sets
+# the data, shuffled and split between train and test sets
 (X_train, y_train), (X_test, y_test) = mnist.load_data()

 X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
@@ -20,7 +20,7 @@ batch_size = 128
 nb_classes = 10
 nb_epoch = 20

-# the data, shuffled and split between tran and test sets
+# the data, shuffled and split between train and test sets
 (X_train, y_train), (X_test, y_test) = mnist.load_data()

 X_train = X_train.reshape(60000, 784)
@@ -77,7 +77,7 @@ def compute_accuracy(predictions, labels):
    return labels[predictions.ravel() < 0.5].mean()


-# the data, shuffled and split between tran and test sets
+# the data, shuffled and split between train and test sets
 (X_train, y_train), (X_test, y_test) = mnist.load_data()
 X_train = X_train.reshape(60000, 784)
 X_test = X_test.reshape(10000, 784)
@@ -7,11 +7,11 @@ and make sure the variable `weights_path` in this script matches the location of

 Run the script with:
 ```
-python neural_style.py path_to_your_base_image.jpg path_to_your_reference.jpg prefix_for_results
+python neural_style_transfer.py path_to_your_base_image.jpg path_to_your_reference.jpg prefix_for_results
 ```
 e.g.:
 ```
-python neural_style.py img/tuebingen.jpg img/starry_night.jpg results/my_result
+python neural_style_transfer.py img/tuebingen.jpg img/starry_night.jpg results/my_result
 ```

 It is preferrable to run this script on GPU, for speed.
@@ -89,17 +89,11 @@ assert img_height == img_width, 'Due to the use of the Gram matrix, width and he
 def preprocess_image(image_path):
    img = imresize(imread(image_path), (img_width, img_height))
    img = img.transpose((2, 0, 1)).astype('float64')
-    img[:, :, 0] -= 103.939
-    img[:, :, 1] -= 116.779
-    img[:, :, 2] -= 123.68
    img = np.expand_dims(img, axis=0)
    return img

 # util function to convert a tensor into a valid image
 def deprocess_image(x):
-    x[:, :, 0] += 103.939
-    x[:, :, 1] += 116.779
-    x[:, :, 2] += 123.68
    x = x.transpose((1, 2, 0))
    x = np.clip(x, 0, 255).astype('uint8')
    return x
@@ -59,7 +59,7 @@ model.add(LSTM(50,
               return_sequences=False,
               stateful=True))
 model.add(Dense(1))
-model.compile(loss='rmse', optimizer='rmsprop')
+model.compile(loss='mse', optimizer='rmsprop')

 print('Training')
 for i in range(epochs):
@@ -68,7 +68,8 @@ for i in range(epochs):
              expected_output,
              batch_size=batch_size,
              verbose=1,
-              nb_epoch=1)
+              nb_epoch=1,
+              shuffle=False)
    model.reset_states()

 print('Predicting')
@@ -1 +1 @@
-__version__ = '0.3.1'
+__version__ = '0.3.3'
@@ -7,13 +7,9 @@ def softmax(x):
    if ndim == 2:
        return K.softmax(x)
    elif ndim == 3:
-        # apply softmax to each timestep
-        def step(x, states):
-            return K.softmax(x), []
-        last_output, outputs, states = K.rnn(step, x,
-                                             [],
-                                             mask=None)
-        return outputs
+        e = K.exp(x - K.max(x, axis=-1, keepdims=True))
+        s = K.sum(e, axis=-1, keepdims=True)
+        return e / s
    else:
        raise Exception('Cannot apply softmax to a tensor that is not 2D or 3D. ' +
                        'Here, ndim=' + str(ndim))
@@ -18,7 +18,7 @@ _config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json'))
 if os.path.exists(_config_path):
    _config = json.load(open(_config_path))
    _floatx = _config.get('floatx', floatx())
-    assert _floatx in {'float32', 'float64'}
+    assert _floatx in {'float16', 'float32', 'float64'}
    _epsilon = _config.get('epsilon', epsilon())
    assert type(_epsilon) == float
    _backend = _config.get('backend', _BACKEND)
@@ -20,7 +20,7 @@ def floatx():

 def set_floatx(floatx):
    global _FLOATX
-    if floatx not in {'float32', 'float64'}:
+    if floatx not in {'float16', 'float32', 'float64'}:
        raise Exception('Unknown floatx type: ' + str(floatx))
    floatx = str(floatx)
    _FLOATX = floatx
@@ -1,5 +1,7 @@
 import tensorflow as tf
 import numpy as np
+import os
+import warnings
 from .common import _FLOATX, _EPSILON

 # INTERNAL UTILS
@@ -7,14 +9,18 @@ from .common import _FLOATX, _EPSILON
 _SESSION = None


-def _get_session():
+def get_session():
    global _SESSION
    if _SESSION is None:
-        _SESSION = tf.Session('')
+        if not os.environ.get('OMP_NUM_THREADS'):
+            _SESSION = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+        else:
+            nb_thread = int(os.environ.get('OMP_NUM_THREADS'))
+            _SESSION = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=nb_thread, allow_soft_placement=True))
    return _SESSION


-def _set_session(session):
+def set_session(session):
    global _SESSION
    _SESSION = session

@@ -23,7 +29,7 @@ def _set_session(session):

 def variable(value, dtype=_FLOATX, name=None):
    v = tf.Variable(np.asarray(value, dtype=dtype), name=name)
-    _get_session().run(v.initializer)
+    get_session().run(v.initializer)
    return v


@@ -35,7 +41,13 @@ def placeholder(shape=None, ndim=None, dtype=_FLOATX, name=None):


 def shape(x):
-    return x.get_shape()
+    # symbolic shape
+    return tf.shape(x)
+
+
+def int_shape(x):
+    shape = x.get_shape()
+    return tuple([i.__int__() for i in shape])


 def ndim(x):
@@ -45,7 +57,7 @@ def ndim(x):
 def eval(x):
    '''Run a graph.
    '''
-    return x.eval(session=_get_session())
+    return x.eval(session=get_session())


 def zeros(shape, dtype=_FLOATX, name=None):
@@ -81,15 +93,28 @@ def dot(x, y):
    return tf.matmul(x, y)


+def batch_dot(x, y, axes=None):
+    if axes:
+        adj_x = None if axes[0][0] == ndim(x)-1 else True
+        adj_y = True if axes[1][0] == ndim(y)-1 else None
+    else:
+        adj_x = None
+        adj_y = None
+    return tf.batch_matmul(x, y, adj_x=adj_x, adj_y=adj_y)
+
+
 def transpose(x):
    return tf.transpose(x)


 def gather(reference, indices):
-    '''reference: a tensor.
-    indices: an int tensor of indices.
+    '''
+    # Arguments
+        reference: a tensor.
+        indices: an int tensor of indices.

-    Return: a tensor of same type as reference.
+    # Returns
+        a tensor of same type as `reference`.
    '''
    return tf.gather(reference, indices)

@@ -200,6 +225,10 @@ def round(x):
    return tf.round(x)


+def sign(x):
+    return tf.sign(x)
+
+
 def pow(x, a):
    return tf.pow(x, a)

@@ -231,7 +260,10 @@ def minimum(x, y):

 def concatenate(tensors, axis=-1):
    if axis < 0:
-        axis = axis % len(tensors[0].get_shape())
+        if len(tensors[0].get_shape()):
+            axis = axis % len(tensors[0].get_shape())
+        else:
+            axis = 0
    return tf.concat(axis, tensors)


@@ -242,8 +274,9 @@ def reshape(x, shape):
 def permute_dimensions(x, pattern):
    '''Transpose dimensions.

-    pattern should be a tuple or list of
-    dimension indices, e.g. [0, 2, 1].
+    # Arguments
+        pattern: should be a tuple or list of
+            dimension indices, e.g. [0, 2, 1].
    '''
    return tf.transpose(x, perm=pattern)

@@ -256,15 +289,15 @@ def resize_images(X, height_factor, width_factor, dim_ordering):
    positive integers.
    '''
    if dim_ordering == 'th':
-        new_height = shape(X)[2].value * height_factor
-        new_width = shape(X)[3].value * width_factor
+        new_shape = tf.shape(X)[2:]
+        new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32'))
        X = permute_dimensions(X, [0, 2, 3, 1])
-        X = tf.image.resize_nearest_neighbor(X, (new_height, new_width))
+        X = tf.image.resize_nearest_neighbor(X, new_shape)
        return permute_dimensions(X, [0, 3, 1, 2])
    elif dim_ordering == 'tf':
-        new_height = shape(X)[1].value * height_factor
-        new_width = shape(X)[2].value * width_factor
-        return tf.image.resize_nearest_neighbor(X, (new_height, new_width))
+        new_shape = tf.shape(X)[1:3]
+        new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32'))
+        return tf.image.resize_nearest_neighbor(X, new_shape)
    else:
        raise Exception('Invalid dim_ordering: ' + dim_ordering)

@@ -345,16 +378,21 @@ def spatial_2d_padding(x, padding=(1, 1), dim_ordering='th'):
    return tf.pad(x, pattern)


+def pack(x):
+    return tf.pack(x)
+
+
 # VALUE MANIPULATION

+
 def get_value(x):
    '''Technically the same as eval() for TF.
    '''
-    return x.eval(session=_get_session())
+    return x.eval(session=get_session())


 def set_value(x, value):
-    tf.assign(x, np.asarray(value)).op.run(session=_get_session())
+    tf.assign(x, np.asarray(value)).op.run(session=get_session())


 # GRAPH MANIPULATION
@@ -362,9 +400,9 @@ def set_value(x, value):
 class Function(object):

    def __init__(self, inputs, outputs, updates=[]):
-        assert type(inputs) in {list, tuple}
-        assert type(outputs) in {list, tuple}
-        assert type(updates) in {list, tuple}
+        assert type(inputs) in {list, tuple}, 'Input to a TensorFlow backend function should be a list or tuple.'
+        assert type(outputs) in {list, tuple}, 'Output to a TensorFlow backend function should be a list or tuple.'
+        assert type(updates) in {list, tuple}, 'Updates in a TensorFlow backend function should be a list or tuple.'
        self.inputs = list(inputs)
        self.outputs = list(outputs)
        with tf.control_dependencies(self.outputs):
@@ -374,12 +412,18 @@ class Function(object):
        assert type(inputs) in {list, tuple}
        names = [v.name for v in self.inputs]
        feed_dict = dict(zip(names, inputs))
-        session = _get_session()
+        session = get_session()
        updated = session.run(self.outputs + self.updates, feed_dict=feed_dict)
        return updated[:len(self.outputs)]


-def function(inputs, outputs, updates=[]):
+def function(inputs, outputs, updates=[], **kwargs):
+    if kwargs:
+        # extra kwargs are not forwarded to Function below; only warn about them
+        warnings.warn('\n'.join([
+            "Expected no kwargs, you passed %s" % len(kwargs),
+            "kwargs passed to function are ignored with Tensorflow backend"
+        ]))
+    return Function(inputs, outputs, updates=updates)


@@ -390,46 +434,47 @@ def gradients(loss, variables):
 # CONTROL FLOW

 def rnn(step_function, inputs, initial_states,
-        go_backwards=False, mask=None):
+        go_backwards=False, mask=None, constants=None):
    '''Iterates over the time dimension of a tensor.

-    Parameters
-    ----------
-    inputs: tensor of temporal data of shape (samples, time, ...)
-        (at least 3D).
-    step_function:
-        Parameters:
-            input: tensor with shape (samples, ...) (no time dimension),
-                representing input for the batch of samples at a certain
-                time step.
-            states: list of tensors.
-        Returns:
-            output: tensor with shape (samples, ...) (no time dimension),
-            new_states: list of tensors, same length and shapes
-                as 'states'.
-    initial_states: tensor with shape (samples, ...) (no time dimension),
-        containing the initial values for the states used in
-        the step function.
-    go_backwards: boolean. If True, do the iteration over
-        the time dimension in reverse order.
-    mask: binary tensor with shape (samples, time, 1),
-        with a zero for every element that is masked.
+    # Arguments
+        inputs: tensor of temporal data of shape (samples, time, ...)
+            (at least 3D).
+        step_function:
+            Parameters:
+                input: tensor with shape (samples, ...) (no time dimension),
+                    representing input for the batch of samples at a certain
+                    time step.
+                states: list of tensors.
+            Returns:
+                output: tensor with shape (samples, ...) (no time dimension),
+                new_states: list of tensors, same length and shapes
+                    as 'states'.
+        initial_states: tensor with shape (samples, ...) (no time dimension),
+            containing the initial values for the states used in
+            the step function.
+        go_backwards: boolean. If True, do the iteration over
+            the time dimension in reverse order.
+        mask: binary tensor with shape (samples, time, 1),
+            with a zero for every element that is masked.
+        constants: a list of constant values passed at each step.

-    Returns
-    -------
-    A tuple (last_output, outputs, new_states).
-        last_output: the latest output of the rnn, of shape (samples, ...)
-        outputs: tensor with shape (samples, time, ...) where each
-            entry outputs[s, t] is the output of the step function
-            at time t for sample s.
-        new_states: list of tensors, latest states returned by
-            the step function, of shape (samples, ...).
+    # Returns
+        A tuple (last_output, outputs, new_states).
+            last_output: the latest output of the rnn, of shape (samples, ...)
+            outputs: tensor with shape (samples, time, ...) where each
+                entry outputs[s, t] is the output of the step function
+                at time t for sample s.
+            new_states: list of tensors, latest states returned by
+                the step function, of shape (samples, ...).
    '''
    ndim = len(inputs.get_shape())
    assert ndim >= 3, "Input should be at least 3D."
    axes = [1, 0] + list(range(2, ndim))
    inputs = tf.transpose(inputs, (axes))
    input_list = tf.unpack(inputs)
+    if constants is None:
+        constants = []

    states = initial_states
    successive_states = []
@@ -445,8 +490,11 @@ def rnn(step_function, inputs, initial_states,
        mask = tf.cast(tf.transpose(mask, axes), tf.bool)
        mask_list = tf.unpack(mask)

+        if go_backwards:
+            mask_list.reverse()
+
        for input, mask_t in zip(input_list, mask_list):
-            output, new_states = step_function(input, states)
+            output, new_states = step_function(input, states + constants)

            # tf.select needs its condition tensor to be the same shape as its two
            # result tensors, but in our case the condition (mask) tensor is
@@ -474,7 +522,7 @@ def rnn(step_function, inputs, initial_states,
            successive_states.append(states)
    else:
        for input in input_list:
-            output, states = step_function(input, states)
+            output, states = step_function(input, states + constants)
            successive_outputs.append(output)
            successive_states.append(states)

@@ -488,7 +536,12 @@ def rnn(step_function, inputs, initial_states,


 def switch(condition, then_expression, else_expression):
-    '''condition: scalar tensor.
+    '''Switch between two operations depending on a scalar value.
+
+    # Arguments
+        condition: scalar tensor.
+        then_expression: TensorFlow operation.
+        else_expression: TensorFlow operation.
    '''
    return tf.python.control_flow_ops.cond(condition,
                                           lambda: then_expression,
@@ -500,14 +553,18 @@ def switch(condition, then_expression, else_expression):
 def relu(x, alpha=0., max_value=None):
    '''ReLU.

-    alpha: slope of negative section.
+    # Arguments
+        alpha: slope of negative section.
+        max_value: saturation threshold.
    '''
    negative_part = tf.nn.relu(-x)
    x = tf.nn.relu(x)
    if max_value is not None:
        x = tf.clip_by_value(x, tf.cast(0., dtype=_FLOATX),
                             tf.cast(max_value, dtype=_FLOATX))
-    x -= tf.constant(alpha, dtype=_FLOATX) * negative_part
+    if isinstance(alpha, (tuple, list, np.ndarray)) or np.isscalar(alpha):
+        alpha = tf.constant(alpha, dtype=_FLOATX)
+    x -= alpha * negative_part
    return x


@@ -526,13 +583,13 @@ def categorical_crossentropy(output, target, from_logits=False):
    if not from_logits:
        # scale preds so that the class probas of each sample sum to 1
        output /= tf.reduce_sum(output,
-                                reduction_indices=len(output.get_shape())-1,
+                                reduction_indices=len(output.get_shape()) - 1,
                                keep_dims=True)
        # manual computation of crossentropy
        output = tf.clip_by_value(output, tf.cast(_EPSILON, dtype=_FLOATX),
-                                  tf.cast(1.-_EPSILON, dtype=_FLOATX))
+                                  tf.cast(1. - _EPSILON, dtype=_FLOATX))
        return - tf.reduce_sum(target * tf.log(output),
-                               reduction_indices=len(output.get_shape())-1)
+                               reduction_indices=len(output.get_shape()) - 1)
    else:
        return tf.nn.softmax_cross_entropy_with_logits(output, target)

@@ -584,11 +641,12 @@ def l2_normalize(x, axis):

 def conv2d(x, kernel, strides=(1, 1), border_mode='valid', dim_ordering='th',
           image_shape=None, filter_shape=None):
-    '''
-    Run on cuDNN if available.
-    border_mode: string, "same" or "valid".
-    dim_ordering: whether to use Theano or TensorFlow dimension ordering
-    in inputs/kernels/ouputs.
+    '''Runs on cuDNN if available.
+
+    # Arguments
+        border_mode: string, "same" or "valid".
+        dim_ordering: whether to use Theano or TensorFlow dimension ordering
+            in inputs/kernels/outputs.
    '''
    if border_mode == 'same':
        padding = 'SAME'
@@ -628,10 +686,11 @@ def conv2d(x, kernel, strides=(1, 1), border_mode='valid', dim_ordering='th',
 def pool2d(x, pool_size, strides=(1, 1),
           border_mode='valid', dim_ordering='th', pool_mode='max'):
    '''
-    pool_size: tuple of 2 integers.
-    strides: tuple of 2 integers.
-    border_mode: one of "valid", "same".
-    dim_ordering: one of "th", "tf".
+    # Arguments
+        pool_size: tuple of 2 integers.
+        strides: tuple of 2 integers.
+        border_mode: one of "valid", "same".
+        dim_ordering: one of "th", "tf".
    '''
    if border_mode == 'same':
        padding = 'SAME'
@@ -686,3 +745,10 @@ def random_uniform(shape, low=0.0, high=1.0, dtype=_FLOATX, seed=None):
        seed = np.random.randint(10e6)
    return tf.random_uniform(shape, minval=low, maxval=high,
                             dtype=dtype, seed=seed)
+
+
+def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):  # 0/1 tensor, P(1) = p
+    if seed is None:
+        seed = np.random.randint(10e6)  # no seed given: draw one at random
+    return tf.select(tf.random_uniform(shape, dtype=dtype, seed=seed) <= p,
+                     tf.ones(shape, dtype=dtype), tf.zeros(shape, dtype=dtype))
@@ -1,7 +1,9 @@
 import theano
 from theano import tensor as T
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
-from theano.tensor.signal import downsample
+from theano.tensor.signal import pool
+from theano.tensor.nnet import conv3d2d
+import inspect
 import numpy as np
 from .common import _FLOATX, _EPSILON

@@ -10,21 +12,6 @@ from .common import _FLOATX, _EPSILON
 theano.config.floatX = _FLOATX


-def _on_gpu():
-    '''Return whether the session is set to
-    run on GPU or not (i.e. on CPU).
-    '''
-    return theano.config.device[:3] == 'gpu' or theano.sandbox.cuda.cuda_enabled
-
-
-if _on_gpu():
-    '''Import cuDNN only if running on GPU:
-    not having Cuda installed should not
-    prevent from running the present code.
-    '''
-    from theano.sandbox.cuda import dnn
-
-
 # VARIABLE MANIPULATION

 def variable(value, dtype=_FLOATX, name=None):
@@ -41,6 +28,7 @@ def placeholder(shape=None, ndim=None, dtype=_FLOATX, name=None):
        raise Exception('Specify either a shape or ndim value.')
    if shape is not None:
        ndim = len(shape)
+
    broadcast = (False,) * ndim
    return T.TensorType(dtype, broadcast)(name)

@@ -108,6 +96,13 @@ def dot(x, y):
    return T.dot(x, y)


+def batch_dot(x, y, axes=None):
+    if axes is None:
+        # default (like tf.batch_matmul): contract x's last axis with y's second-to-last
+        axes = [(x.ndim-1,), (y.ndim-2,)]
+    return T.batched_tensordot(x, y, axes=axes)
+
+
 def transpose(x):
    return T.transpose(x)

@@ -145,7 +140,10 @@ def prod(x, axis=None, keepdims=False):


 def mean(x, axis=None, keepdims=False):
-    return T.mean(x, axis=axis, keepdims=keepdims)
+    dtype = None
+    if 'int' in x.dtype:
+        dtype = _FLOATX
+    return T.mean(x, axis=axis, keepdims=keepdims, dtype=dtype)


 def std(x, axis=None, keepdims=False):
@@ -191,6 +189,10 @@ def round(x):
    return T.round(x)


+def sign(x):
+    return T.sgn(x)
+
+
 def pow(x, a):
    return T.pow(x, a)

@@ -265,6 +267,27 @@ def resize_images(X, height_factor, width_factor, dim_ordering):
        raise Exception('Invalid dim_ordering: ' + dim_ordering)


+def resize_volumes(X, depth_factor, height_factor, width_factor, dim_ordering):
+    '''Resize the volume contained in a 5D tensor of shape
+    - [batch, channels, depth, height, width] (for 'th' dim_ordering)
+    - [batch, depth, height, width, channels] (for 'tf' dim_ordering)
+    by a factor of (depth_factor, height_factor, width_factor).
+    All three factors should be positive integers.
+    '''
+    if dim_ordering == 'th':
+        output = repeat_elements(X, depth_factor, axis=2)
+        output = repeat_elements(output, height_factor, axis=3)
+        output = repeat_elements(output, width_factor, axis=4)
+        return output
+    elif dim_ordering == 'tf':
+        output = repeat_elements(X, depth_factor, axis=1)
+        output = repeat_elements(output, height_factor, axis=2)
+        output = repeat_elements(output, width_factor, axis=3)
+        return output
+    else:
+        raise Exception('Invalid dim_ordering: ' + dim_ordering)
+
+
 def repeat(x, n):
    '''Repeat a 2D tensor.

@@ -357,6 +380,45 @@ def spatial_2d_padding(x, padding=(1, 1), dim_ordering='th'):
        raise Exception('Invalid dim_ordering: ' + dim_ordering)
    return T.set_subtensor(output[indices], x)

+
+def spatial_3d_padding(x, padding=(1, 1, 1), dim_ordering='th'):
+    '''Pad the 3 spatial dimensions of a 5D tensor (axes 2-4 in 'th', 1-3 in 'tf')
+    with "padding[0]", "padding[1]" and "padding[2]" (resp.) zeros left and right.
+    '''
+    input_shape = x.shape
+    if dim_ordering == 'th':
+        output_shape = (input_shape[0],
+                        input_shape[1],
+                        input_shape[2] + 2 * padding[0],
+                        input_shape[3] + 2 * padding[1],
+                        input_shape[4] + 2 * padding[2])
+        output = T.zeros(output_shape)
+        indices = (slice(None),
+                   slice(None),
+                   slice(padding[0], input_shape[2] + padding[0]),
+                   slice(padding[1], input_shape[3] + padding[1]),
+                   slice(padding[2], input_shape[4] + padding[2]))
+
+    elif dim_ordering == 'tf':
+        output_shape = (input_shape[0],
+                        input_shape[1] + 2 * padding[0],
+                        input_shape[2] + 2 * padding[1],
+                        input_shape[3] + 2 * padding[2],
+                        input_shape[4])
+        output = T.zeros(output_shape)
+        indices = (slice(None),
+                   slice(padding[0], input_shape[1] + padding[0]),
+                   slice(padding[1], input_shape[2] + padding[1]),
+                   slice(padding[2], input_shape[3] + padding[2]),
+                   slice(None))
+    else:
+        raise Exception('Invalid dim_ordering: ' + dim_ordering)
+    return T.set_subtensor(output[indices], x)
+
+
+def pack(x):
+    return T.stack(*x)
+
 # VALUE MANIPULATION


@@ -384,8 +446,14 @@ class Function(object):
        return self.function(*inputs)


-def function(inputs, outputs, updates=[]):
-    return Function(inputs, outputs, updates=updates)
+def function(inputs, outputs, updates=[], **kwargs):
+    if kwargs:
+        accepted = inspect.getargspec(theano.function)[0]
+        for name in kwargs:
+            if name not in accepted:
+                raise ValueError("Invalid argument '%s' passed to K.function"
+                                 % name)
+    return Function(inputs, outputs, updates=updates, **kwargs)


 def gradients(loss, variables):
@@ -395,40 +463,40 @@ def gradients(loss, variables):
 # CONTROL FLOW

 def rnn(step_function, inputs, initial_states,
-        go_backwards=False, mask=None):
+        go_backwards=False, mask=None, constants=None):
    '''Iterates over the time dimension of a tensor.

-    Parameters
-    ----------
-    inputs: tensor of temporal data of shape (samples, time, ...)
-        (at least 3D).
-    step_function:
-        Parameters:
-            input: tensor with shape (samples, ...) (no time dimension),
-                representing input for the batch of samples at a certain
-                time step.
-            states: list of tensors.
-        Returns:
-            output: tensor with shape (samples, ...) (no time dimension),
-            new_states: list of tensors, same length and shapes
-                as 'states'.
-    initial_states: tensor with shape (samples, ...) (no time dimension),
-        containing the initial values for the states used in
-        the step function.
-    go_backwards: boolean. If True, do the iteration over
-        the time dimension in reverse order.
-    mask: binary tensor with shape (samples, time),
-        with a zero for every element that is masked.
+    # Arguments
+        inputs: tensor of temporal data of shape (samples, time, ...)
+            (at least 3D).
+        step_function:
+            Parameters:
+                input: tensor with shape (samples, ...) (no time dimension),
+                    representing input for the batch of samples at a certain
+                    time step.
+                states: list of tensors.
+            Returns:
+                output: tensor with shape (samples, ...) (no time dimension),
+                new_states: list of tensors, same length and shapes
+                    as 'states'.
+        initial_states: tensor with shape (samples, ...) (no time dimension),
+            containing the initial values for the states used in
+            the step function.
+        go_backwards: boolean. If True, do the iteration over
+            the time dimension in reverse order.
+        mask: binary tensor with shape (samples, time),
+            with a zero for every element that is masked.
+        constants: a list of constant values passed at each step.

-    Returns
-    -------
-    A tuple (last_output, outputs, new_states).
-        last_output: the latest output of the rnn, of shape (samples, ...)
-        outputs: tensor with shape (samples, time, ...) where each
-            entry outputs[s, t] is the output of the step function
-            at time t for sample s.
-        new_states: list of tensors, latest states returned by
-            the step function, of shape (samples, ...).
+
+    # Returns
+        A tuple (last_output, outputs, new_states).
+            last_output: the latest output of the rnn, of shape (samples, ...)
+            outputs: tensor with shape (samples, time, ...) where each
+                entry outputs[s, t] is the output of the step function
+                at time t for sample s.
+            new_states: list of tensors, latest states returned by
+                the step function, of shape (samples, ...).
    '''
    ndim = inputs.ndim
    assert ndim >= 3, 'Input should be at least 3D.'
@@ -442,8 +510,10 @@ def rnn(step_function, inputs, initial_states,
        assert mask.ndim == ndim
        mask = mask.dimshuffle(axes)

+        if constants is None:
+            constants = []
        # build an all-zero tensor of shape (samples, output_dim)
-        initial_output = step_function(inputs[0], initial_states)[0] * 0
+        initial_output = step_function(inputs[0], initial_states + constants)[0] * 0
        # Theano gets confused by broadcasting patterns in the scan op
        initial_output = T.unbroadcast(initial_output, 0, 1)

@@ -460,6 +530,7 @@ def rnn(step_function, inputs, initial_states,
            _step,
            sequences=[inputs, mask],
            outputs_info=[initial_output] + initial_states,
+            non_sequences=constants,
            go_backwards=go_backwards)
    else:
        def _step(input, *states):
@@ -470,6 +541,7 @@ def rnn(step_function, inputs, initial_states,
            _step,
            sequences=inputs,
            outputs_info=[None] + initial_states,
+            non_sequences=constants,
            go_backwards=go_backwards)

    # deal with Theano API inconsistency
@@ -569,7 +641,6 @@ def l2_normalize(x, axis):
 def conv2d(x, kernel, strides=(1, 1), border_mode='valid', dim_ordering='th',
           image_shape=None, filter_shape=None):
    '''
-    Run on cuDNN if available.
    border_mode: string, "same" or "valid".
    '''
    if dim_ordering not in {'th', 'tf'}:
@@ -591,51 +662,153 @@ def conv2d(x, kernel, strides=(1, 1), border_mode='valid', dim_ordering='th',
            filter_shape = (filter_shape[3], filter_shape[2],
                            filter_shape[0], filter_shape[1])

-    if _on_gpu() and dnn.dnn_available():
-        if border_mode == 'same':
-            assert(strides == (1, 1))
-            conv_out = dnn.dnn_conv(img=x,
-                                    kerns=kernel,
-                                    border_mode='full')
-            np_kernel = kernel.eval()
-            shift_x = (np_kernel.shape[2] - 1) // 2
-            shift_y = (np_kernel.shape[3] - 1) // 2
-            conv_out = conv_out[:, :,
-                                shift_x:x.shape[2] + shift_x,
-                                shift_y:x.shape[3] + shift_y]
-        else:
-            conv_out = dnn.dnn_conv(img=x,
-                                    kerns=kernel,
-                                    border_mode=border_mode,
-                                    subsample=strides)
+    if border_mode == 'same':
+        th_border_mode = 'half'
+        np_kernel = kernel.eval()
+        assert strides[0] <= np_kernel.shape[2], 'strides should be smaller than the convolution window.'
+        assert strides[1] <= np_kernel.shape[3], 'strides should be smaller than the convolution window.'
+    elif border_mode == 'valid':
+        th_border_mode = 'valid'
    else:
-        if border_mode == 'same':
-            th_border_mode = 'full'
-            assert(strides == (1, 1))
-        elif border_mode == 'valid':
-            th_border_mode = 'valid'
-        else:
-            raise Exception('Border mode not supported: ' + str(border_mode))
+        raise Exception('Border mode not supported: ' + str(border_mode))
+
+    # Theano might not accept long-typed values: coerce each dimension to int (or None)
+    def int_or_none(value):
+        try:
+            return int(value)
+        except TypeError:
+            return None
+
+    if image_shape is not None:
+        image_shape = tuple(int_or_none(v) for v in image_shape)
+
+    if filter_shape is not None:
+        filter_shape = tuple(int_or_none(v) for v in filter_shape)
+
+    conv_out = T.nnet.conv2d(x, kernel,
+                             border_mode=th_border_mode,
+                             subsample=strides,
+                             input_shape=image_shape,
+                             filter_shape=filter_shape)
+
+    if border_mode == 'same':
+        if np_kernel.shape[2] % 2 == 0:
+            conv_out = conv_out[:,:,:(x.shape[2]+strides[0]-1) // strides[0],:]
+        if np_kernel.shape[3] % 2 == 0:
+            conv_out = conv_out[:,:,:,:(x.shape[3]+strides[1]-1) // strides[1]]

-        conv_out = T.nnet.conv.conv2d(x, kernel,
-                                      border_mode=th_border_mode,
-                                      subsample=strides,
-                                      image_shape=image_shape,
-                                      filter_shape=filter_shape)
-        if border_mode == 'same':
-            np_kernel = kernel.eval()
-            shift_x = (np_kernel.shape[2] - 1) // 2
-            shift_y = (np_kernel.shape[3] - 1) // 2
-            conv_out = conv_out[:, :,
-                                shift_x:x.shape[2] + shift_x,
-                                shift_y:x.shape[3] + shift_y]
    if dim_ordering == 'tf':
        conv_out = conv_out.dimshuffle((0, 2, 3, 1))
    return conv_out


+def conv3d(x, kernel, strides=(1, 1, 1),
+           border_mode='valid', dim_ordering='th',
+           volume_shape=None, filter_shape=None):
+    '''
+    Run on cuDNN if available.
+    border_mode: string, "same" or "valid".
+    '''
+    if dim_ordering not in {'th', 'tf'}:
+        raise Exception('Unknown dim_ordering ' + str(dim_ordering))
+
+    if border_mode not in {'same', 'valid'}:
+        raise Exception('Invalid border mode: ' + str(border_mode))
+
+    if dim_ordering == 'tf':
+        # TF uses the last dimension as channel dimension,
+        # instead of the 2nd one.
+        # TH input shape: (samples, input_depth, conv_dim1, conv_dim2, conv_dim3)
+        # TF input shape: (samples, conv_dim1, conv_dim2, conv_dim3, input_depth)
+        # TH kernel shape: (out_depth, input_depth, kernel_dim1, kernel_dim2, kernel_dim3)
+        # TF kernel shape: (kernel_dim1, kernel_dim2, kernel_dim3, input_depth, out_depth)
+        x = x.dimshuffle((0, 4, 1, 2, 3))
+        kernel = kernel.dimshuffle((4, 3, 0, 1, 2))
+        if volume_shape:
+            volume_shape = (volume_shape[0], volume_shape[4],
+                            volume_shape[1], volume_shape[2], volume_shape[3])
+        if filter_shape:
+            filter_shape = (filter_shape[4], filter_shape[3],
+                            filter_shape[0], filter_shape[1], filter_shape[2])
+
+    if border_mode == 'same':
+        assert(strides == (1, 1, 1))
+        pad_dim1 = (kernel.shape[2] - 1)
+        pad_dim2 = (kernel.shape[3] - 1)
+        pad_dim3 = (kernel.shape[4] - 1)
+        output_shape = (x.shape[0], x.shape[1],
+                        x.shape[2] + pad_dim1,
+                        x.shape[3] + pad_dim2,
+                        x.shape[4] + pad_dim3)
+        output = T.zeros(output_shape)
+        indices = (slice(None), slice(None),
+                   slice(pad_dim1 // 2, x.shape[2] + pad_dim1 // 2),
+                   slice(pad_dim2 // 2, x.shape[3] + pad_dim2 // 2),
+                   slice(pad_dim3 // 2, x.shape[4] + pad_dim3 // 2))
+        x = T.set_subtensor(output[indices], x)
+        border_mode = 'valid'
+
+    border_mode_3d = (border_mode, border_mode, border_mode)
+    conv_out = conv3d2d.conv3d(signals=x.dimshuffle(0, 2, 1, 3, 4),
+                               filters=kernel.dimshuffle(0, 2, 1, 3, 4),
+                               border_mode=border_mode_3d)
+    conv_out = conv_out.dimshuffle(0, 2, 1, 3, 4)
+
+    # support strides by manually slicing the output
+    if strides != (1, 1, 1):
+        conv_out = conv_out[:, :, ::strides[0], ::strides[1], ::strides[2]]
+
+    if dim_ordering == 'tf':
+        conv_out = conv_out.dimshuffle((0, 2, 3, 4, 1))
+
+    return conv_out
+
+
 def pool2d(x, pool_size, strides=(1, 1), border_mode='valid',
           dim_ordering='th', pool_mode='max'):
+    if border_mode == 'same':
+        w_pad = pool_size[0] - 2 if pool_size[0] % 2 == 1 else pool_size[0] - 1
+        h_pad = pool_size[1] - 2 if pool_size[1] % 2 == 1 else pool_size[1] - 1
+        padding = (w_pad, h_pad)
+    elif border_mode == 'valid':
+        padding = (0, 0)
+    else:
+        raise Exception('Invalid border mode: ' + str(border_mode))
+
+    if dim_ordering not in {'th', 'tf'}:
+        raise Exception('Unknown dim_ordering ' + str(dim_ordering))
+
+    if dim_ordering == 'tf':
+        x = x.dimshuffle((0, 3, 1, 2))
+
+    if pool_mode == 'max':
+        pool_out = pool.pool_2d(x, ds=pool_size, st=strides,
+                                ignore_border=True,
+                                padding=padding,
+                                mode='max')
+    elif pool_mode == 'avg':
+        pool_out = pool.pool_2d(x, ds=pool_size, st=strides,
+                                ignore_border=True,
+                                padding=padding,
+                                mode='average_exc_pad')
+    else:
+        raise Exception('Invalid pooling mode: ' + str(pool_mode))
+
+    if border_mode == 'same':
+        expected_width = (x.shape[2] + strides[0] - 1) // strides[0]
+        expected_height = (x.shape[3] + strides[1] - 1) // strides[1]
+
+        pool_out = pool_out[:, :,
+                            : expected_width,
+                            : expected_height]
+
+    if dim_ordering == 'tf':
+        pool_out = pool_out.dimshuffle((0, 2, 3, 1))
+    return pool_out
+
+
+def pool3d(x, pool_size, strides=(1, 1, 1), border_mode='valid',
+           dim_ordering='th', pool_mode='max'):
    if border_mode == 'same':
        # TODO: add implementation for border_mode="same"
        raise Exception('border_mode="same" not supported with Theano.')
@@ -649,23 +822,46 @@ def pool2d(x, pool_size, strides=(1, 1), border_mode='valid',
        raise Exception('Unknown dim_ordering ' + str(dim_ordering))

    if dim_ordering == 'tf':
-        x = x.dimshuffle((0, 3, 1, 2))
+        x = x.dimshuffle((0, 4, 1, 2, 3))

    if pool_mode == 'max':
-        pool_out = downsample.max_pool_2d(x, ds=pool_size, st=strides,
-                                          ignore_border=ignore_border,
-                                          padding=padding,
-                                          mode='max')
+        # pooling over conv_dim2, conv_dim1 (last two dimensions)
+        output = pool.pool_2d(input=x.dimshuffle(0, 1, 4, 3, 2),
+                              ds=(pool_size[1], pool_size[0]),
+                              st=(strides[1], strides[0]),
+                              ignore_border=ignore_border,
+                              padding=padding,
+                              mode='max')
+
+        # pooling over conv_dim3
+        pool_out = pool.pool_2d(input=output.dimshuffle(0, 1, 4, 3, 2),
+                                ds=(1, pool_size[2]),
+                                st=(1, strides[2]),
+                                ignore_border=ignore_border,
+                                padding=padding,
+                                mode='max')
+
    elif pool_mode == 'avg':
-        pool_out = downsample.max_pool_2d(x, ds=pool_size, st=strides,
-                                          ignore_border=ignore_border,
-                                          padding=padding,
-                                          mode='average_exc_pad')
+        # pooling over conv_dim2, conv_dim1 (last two dimensions)
+        output = pool.pool_2d(input=x.dimshuffle(0, 1, 4, 3, 2),
+                              ds=(pool_size[1], pool_size[0]),
+                              st=(strides[1], strides[0]),
+                              ignore_border=ignore_border,
+                              padding=padding,
+                              mode='average_exc_pad')
+
+        # pooling over conv_dim3
+        pool_out = pool.pool_2d(input=output.dimshuffle(0, 1, 4, 3, 2),
+                                ds=(1, pool_size[2]),
+                                st=(1, strides[2]),
+                                ignore_border=ignore_border,
+                                padding=padding,
+                                mode='average_exc_pad')
    else:
        raise Exception('Invalid pooling mode: ' + str(pool_mode))

    if dim_ordering == 'tf':
-        pool_out = pool_out.dimshuffle((0, 2, 3, 1))
+        pool_out = pool_out.dimshuffle((0, 2, 3, 4, 1))
    return pool_out


@@ -685,6 +881,13 @@ def random_uniform(shape, low=0.0, high=1.0, dtype=_FLOATX, seed=None):
    rng = RandomStreams(seed=seed)
    return rng.uniform(shape, low=low, high=high, dtype=dtype)

+
+def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
+    if seed is None:
+        seed = np.random.randint(10e6)  # no seed supplied: draw one at random
+    rng = RandomStreams(seed=seed)
+    return rng.binomial(shape, p=p, dtype=dtype)  # samples with success probability p
+
 '''
 more TODO:

@@ -92,7 +92,8 @@ class Callback(object):
    will include the following quantities in the `logs` that
    it passes to its callbacks:

-        on_epoch_end: logs optionally include `val_loss`
+        on_epoch_end: logs include `acc` and `loss`, and
+            optionally include `val_loss`
            (if validation is enabled in `fit`), and `val_acc`
            (if validation and accuracy monitoring are enabled).
        on_batch_begin: logs include `size`,
@@ -129,11 +130,35 @@ class Callback(object):


 class BaseLogger(Callback):
-    '''Callback that prints events to the standard output.
+    '''Callback that accumulates epoch averages of
+    the metrics being monitored.

    This callback is automatically applied to
-    every Keras model (it is the basis of the verbosity modes
-    in models).
+    every Keras model.
+    '''
+    def on_epoch_begin(self, epoch, logs={}):
+        # start every epoch with an empty accumulator:
+        # `totals` holds the size-weighted sums per log key,
+        # `seen` the sum of the batch sizes they cover.
+        self.totals = {}
+        self.seen = 0
+
+    def on_batch_end(self, batch, logs={}):
+        '''Add this batch's log values, weighted by the batch size,
+        to the running totals of the current epoch.
+        '''
+        nb_in_batch = logs.get('size', 0)
+        self.seen += nb_in_batch
+
+        for key, value in logs.items():
+            weighted = value * nb_in_batch
+            if key not in self.totals:
+                self.totals[key] = weighted
+            else:
+                self.totals[key] += weighted
+
+    def on_epoch_end(self, epoch, logs={}):
+        '''Write the epoch average of every monitored metric into `logs`.'''
+        monitored = self.params['metrics']
+        for name in monitored:
+            if name not in self.totals:
+                continue
+            # `logs` is updated in place to make the averaged value
+            # available to the next callbacks
+            logs[name] = self.totals[name] / self.seen
+
+
+class ProgbarLogger(Callback):
+    '''Callback that prints metrics to stdout.
    '''
    def on_train_begin(self, logs={}):
        self.verbose = self.params['verbose']
@@ -145,7 +170,6 @@ class BaseLogger(Callback):
            self.progbar = Progbar(target=self.params['nb_sample'],
                                   verbose=self.verbose)
        self.seen = 0
-        self.totals = {}

    def on_batch_begin(self, batch, logs={}):
        if self.seen < self.params['nb_sample']:
@@ -155,11 +179,6 @@ class BaseLogger(Callback):
        batch_size = logs.get('size', 0)
        self.seen += batch_size

-        for k, v in logs.items():
-            if k in self.totals:
-                self.totals[k] += v * batch_size
-            else:
-                self.totals[k] = v * batch_size
        for k in self.params['metrics']:
            if k in logs:
                self.log_values.append((k, logs[k]))
@@ -171,8 +190,6 @@ class BaseLogger(Callback):

    def on_epoch_end(self, epoch, logs={}):
        for k in self.params['metrics']:
-            if k in self.totals:
-                self.log_values.append((k, self.totals[k] / self.seen))
            if k in logs:
                self.log_values.append((k, logs[k]))
        if self.verbose:
@@ -191,26 +208,8 @@ class History(Callback):
        self.epoch = []
        self.history = {}

-    def on_epoch_begin(self, epoch, logs={}):
-        self.seen = 0
-        self.totals = {}
-
-    def on_batch_end(self, batch, logs={}):
-        batch_size = logs.get('size', 0)
-        self.seen += batch_size
-        for k, v in logs.items():
-            if k in self.totals:
-                self.totals[k] += v * batch_size
-            else:
-                self.totals[k] = v * batch_size
-
    def on_epoch_end(self, epoch, logs={}):
        self.epoch.append(epoch)
-        for k, v in self.totals.items():
-            if k not in self.history:
-                self.history[k] = []
-            self.history[k].append(v / self.seen)
-
        for k, v in logs.items():
            if k not in self.history:
                self.history[k] = []
@@ -256,7 +255,7 @@ class ModelCheckpoint(Callback):

        if mode not in ['auto', 'min', 'max']:
            warnings.warn('ModelCheckpoint mode %s is unknown, '
-                          'fallback to auto mode.' % (self.mode),
+                          'fallback to auto mode.' % (mode),
                          RuntimeWarning)
            mode = 'auto'

@@ -373,26 +372,10 @@ class RemoteMonitor(Callback):
    def __init__(self, root='http://localhost:9000'):
        self.root = root

-    def on_epoch_begin(self, epoch, logs={}):
-        self.seen = 0
-        self.totals = {}
-
-    def on_batch_end(self, batch, logs={}):
-        batch_size = logs.get('size', 0)
-        self.seen += batch_size
-        for k, v in logs.items():
-            if k in self.totals:
-                self.totals[k] += v * batch_size
-            else:
-                self.totals[k] = v * batch_size
-
    def on_epoch_end(self, epoch, logs={}):
        import requests
        send = {}
        send['epoch'] = epoch
-
-        for k, v in self.totals.items():
-            send[k] = v / self.seen
        for k, v in logs.items():
            send[k] = v

@@ -463,7 +446,7 @@ class TensorBoard(Callback):
        import keras.backend.tensorflow_backend as KTF

        self.model = model
-        self.sess = KTF._get_session()
+        self.sess = KTF.get_session()
        if self.histogram_freq and not self.merged:
            mod_type = self.model.get_config()['name']
            if mod_type == 'Sequential':
@@ -486,19 +469,6 @@ class TensorBoard(Callback):
        self.writer = tf.train.SummaryWriter(self.log_dir,
                                             self.sess.graph_def)

-    def on_epoch_begin(self, epoch, logs={}):
-        self.seen = 0
-        self.totals = {}
-
-    def on_batch_end(self, batch, logs={}):
-        batch_size = logs.get('size', 0)
-        self.seen += batch_size
-        for k, v in logs.items():
-            if k in self.totals:
-                self.totals[k] += v * batch_size
-            else:
-                self.totals[k] = v * batch_size
-
    def on_epoch_end(self, epoch, logs={}):
        import tensorflow as tf

@@ -509,15 +479,14 @@ class TensorBoard(Callback):
                else:
                    test_function = self.model._test
                names = [v.name for v in test_function.inputs]
+                # TODO: implement batched calls to sess.run
+                # (current call will likely go OOM on GPU)
                feed_dict = dict(zip(names, self.model.validation_data))
                result = self.sess.run([self.merged], feed_dict=feed_dict)
                summary_str = result[0]
                self.writer.add_summary(summary_str, epoch)

-        all_values = self.totals.copy()
-        all_values.update(logs)
-
-        for name, value in all_values.items():
+        for name, value in logs.items():
            if name in ['batch', 'size']:
                continue
            summary = tf.Summary()
@@ -4,6 +4,7 @@ import sys
 from six.moves import cPickle
 from six.moves import range

+
 def load_batch(fpath, label_key='labels'):
    f = open(fpath, 'rb')
    if sys.version_info < (3,):
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 from .cifar import load_batch
-from .data_utils import get_file
+from ..utils.data_utils import get_file
 import numpy as np
 import os

@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 from .cifar import load_batch
-from .data_utils import get_file
+from ..utils.data_utils import get_file
 import numpy as np
 import os

@@ -1,53 +1,4 @@
-from __future__ import absolute_import
-from __future__ import print_function
+from ..utils.data_utils import *
+import warnings

-import tarfile
-import os
-from six.moves.urllib.request import FancyURLopener
-
-from ..utils.generic_utils import Progbar
-
-
-class ParanoidURLopener(FancyURLopener):
-    def http_error_default(self, url, fp, errcode, errmsg, headers):
-        raise Exception('URL fetch failure on {}: {} -- {}'.format(url, errcode, errmsg))
-
-
-def get_file(fname, origin, untar=False):
-    datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
-    if not os.access(datadir_base, os.W_OK):
-        datadir_base = os.path.join('/tmp', '.keras')
-    datadir = os.path.join(datadir_base, 'datasets')
-    if not os.path.exists(datadir):
-        os.makedirs(datadir)
-
-    if untar:
-        untar_fpath = os.path.join(datadir, fname)
-        fpath = untar_fpath + '.tar.gz'
-    else:
-        fpath = os.path.join(datadir, fname)
-
-    if not os.path.exists(fpath):
-        print('Downloading data from',  origin)
-        global progbar
-        progbar = None
-
-        def dl_progress(count, block_size, total_size):
-            global progbar
-            if progbar is None:
-                progbar = Progbar(total_size)
-            else:
-                progbar.update(count*block_size)
-
-        ParanoidURLopener().retrieve(origin, fpath, dl_progress)
-        progbar = None
-
-    if untar:
-        if not os.path.exists(untar_fpath):
-            print('Untaring file...')
-            tfile = tarfile.open(fpath, 'r:gz')
-            tfile.extractall(path=datadir)
-            tfile.close()
-        return untar_fpath
-
-    return fpath
+warnings.warn('data_utils has been moved to keras.utils.data_utils.')
@@ -1,7 +1,7 @@
 from __future__ import absolute_import
 from six.moves import cPickle
 import gzip
-from .data_utils import get_file
+from ..utils.data_utils import get_file
 from six.moves import zip
 import numpy as np

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import gzip
-from .data_utils import get_file
+from ..utils.data_utils import get_file
 from six.moves import cPickle
 import sys

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
-from .data_utils import get_file
+from ..utils.data_utils import get_file
 from six.moves import cPickle
 from six.moves import zip
 import numpy as np
@@ -3,9 +3,26 @@ import numpy as np
 from . import backend as K


-def get_fans(shape):
-    fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:])
-    fan_out = shape[1] if len(shape) == 2 else shape[0]
+def get_fans(shape, dim_ordering='th'):
+    '''Compute the `(fan_in, fan_out)` pair for a weight shape.
+
+    # Arguments
+        shape: sequence of integers, shape of the weight.
+        dim_ordering: 'th' or 'tf'; only consulted for 4D and 5D
+            shapes, which are assumed to be convolution kernels.
+
+    # Returns
+        A tuple `(fan_in, fan_out)`.
+    '''
+    ndim = len(shape)
+    if ndim == 2:
+        fan_in, fan_out = shape[0], shape[1]
+    elif ndim in (4, 5):
+        # assuming convolution kernels (2D or 3D).
+        # TH kernel shape: (depth, input_depth, ...)
+        # TF kernel shape: (..., input_depth, depth)
+        if dim_ordering == 'th':
+            fan_in, fan_out = np.prod(shape[1:]), shape[0]
+        elif dim_ordering == 'tf':
+            fan_in, fan_out = np.prod(shape[:-1]), shape[-1]
+        else:
+            raise Exception('Invalid dim_ordering: ' + dim_ordering)
+    else:
+        # no specific assumptions: both fans get the same value
+        fan_in = fan_out = np.sqrt(np.prod(shape))
    return fan_in, fan_out


@@ -19,39 +36,39 @@ def normal(shape, scale=0.05, name=None):
                      name=name)


-def lecun_uniform(shape, name=None):
+def lecun_uniform(shape, name=None, dim_ordering='th'):
    ''' Reference: LeCun 98, Efficient Backprop
        http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    '''
-    fan_in, fan_out = get_fans(shape)
+    fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering)
    scale = np.sqrt(3. / fan_in)
    return uniform(shape, scale, name=name)


-def glorot_normal(shape, name=None):
+def glorot_normal(shape, name=None, dim_ordering='th'):
    ''' Reference: Glorot & Bengio, AISTATS 2010
    '''
-    fan_in, fan_out = get_fans(shape)
+    fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering)
    s = np.sqrt(2. / (fan_in + fan_out))
    return normal(shape, s, name=name)


-def glorot_uniform(shape, name=None):
-    fan_in, fan_out = get_fans(shape)
+def glorot_uniform(shape, name=None, dim_ordering='th'):
+    fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering)
    s = np.sqrt(6. / (fan_in + fan_out))
    return uniform(shape, s, name=name)


-def he_normal(shape, name=None):
+def he_normal(shape, name=None, dim_ordering='th'):
    ''' Reference:  He et al., http://arxiv.org/abs/1502.01852
    '''
-    fan_in, fan_out = get_fans(shape)
+    fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering)
    s = np.sqrt(2. / fan_in)
    return normal(shape, s, name=name)


-def he_uniform(shape, name=None):
-    fan_in, fan_out = get_fans(shape)
+def he_uniform(shape, name=None, dim_ordering='th'):
+    fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering)
    s = np.sqrt(6. / fan_in)
    return uniform(shape, s, name=name)

@@ -85,5 +102,6 @@ def one(shape, name=None):


 from .utils.generic_utils import get_from_module
-def get(identifier):
-    return get_from_module(identifier, globals(), 'initialization')
+def get(identifier, **kwargs):
+    '''Resolve `identifier` among this module's globals through
+    `get_from_module`, forwarding any extra keyword arguments.
+    '''
+    module_params = globals()
+    return get_from_module(identifier, module_params, 'initialization',
+                           kwargs=kwargs)
@@ -29,8 +29,8 @@ class LeakyReLU(MaskedLayer):
        return K.relu(X, alpha=self.alpha)

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "alpha": self.alpha}
+        config = {'name': self.__class__.__name__,
+                  'alpha': self.alpha}
        base_config = super(LeakyReLU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -59,7 +59,8 @@ class PReLU(MaskedLayer):

    def build(self):
        input_shape = self.input_shape[1:]
-        self.alphas = self.init(input_shape)
+        self.alphas = self.init(input_shape,
+                                name='{}_alphas'.format(self.name))
        self.trainable_weights = [self.alphas]

        if self.initial_weights is not None:
@@ -73,8 +74,8 @@ class PReLU(MaskedLayer):
        return pos + neg

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "init": self.init.__name__}
+        config = {'name': self.__class__.__name__,
+                  'init': self.init.__name__}
        base_config = super(PReLU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -106,8 +107,8 @@ class ELU(MaskedLayer):
        return pos + self.alpha * (K.exp(neg) - 1.)

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "alpha": self.alpha}
+        config = {'name': self.__class__.__name__,
+                  'alpha': self.alpha}
        base_config = super(ELU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -140,8 +141,10 @@ class ParametricSoftplus(MaskedLayer):

    def build(self):
        input_shape = self.input_shape[1:]
-        self.alphas = K.variable(self.alpha_init * np.ones(input_shape))
-        self.betas = K.variable(self.beta_init * np.ones(input_shape))
+        self.alphas = K.variable(self.alpha_init * np.ones(input_shape),
+                                 name='{}_alphas'.format(self.name))
+        self.betas = K.variable(self.beta_init * np.ones(input_shape),
+                                name='{}_betas'.format(self.name))
        self.trainable_weights = [self.alphas, self.betas]

        if self.initial_weights is not None:
@@ -153,9 +156,9 @@ class ParametricSoftplus(MaskedLayer):
        return K.softplus(self.betas * X) * self.alphas

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "alpha_init": self.alpha_init,
-                  "beta_init": self.beta_init}
+        config = {'name': self.__class__.__name__,
+                  'alpha_init': self.alpha_init,
+                  'beta_init': self.beta_init}
        base_config = super(ParametricSoftplus, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -186,8 +189,8 @@ class ThresholdedLinear(MaskedLayer):
        return K.switch(K.abs(X) < self.theta, 0, X)

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "theta": self.theta}
+        config = {'name': self.__class__.__name__,
+                  'theta': self.theta}
        base_config = super(ThresholdedLinear, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -218,7 +221,66 @@ class ThresholdedReLU(MaskedLayer):
        return K.switch(X > self.theta, X, 0)

    def get_config(self):
-        config = {"name": self.__class__.__name__,
-                  "theta": self.theta}
+        config = {'name': self.__class__.__name__,
+                  'theta': self.theta}
        base_config = super(ThresholdedReLU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
+
+
+class SReLU(MaskedLayer):
+    '''S-shaped Rectified Linear Unit.
+
+    Piecewise-linear activation with two learned thresholds and two
+    learned slopes per input element. The right threshold actually
+    used is `t_left + abs(t_right)`, so that it can never fall to
+    the left of the left threshold.
+
+    # Input shape
+        Arbitrary. Use the keyword argument `input_shape`
+        (tuple of integers, does not include the samples axis)
+        when using this layer as the first layer in a model.
+
+    # Output shape
+        Same shape as the input.
+
+    # Arguments
+        t_left_init: initialization function for the left part intercept
+        a_left_init: initialization function for the left part slope
+        t_right_init: initialization function for the right part intercept
+        a_right_init: initialization function for the right part slope
+
+    # References
+        [Deep Learning with S-shaped Rectified Linear Activation Units](http://arxiv.org/abs/1512.07030)
+    '''
+    def __init__(self, t_left_init='zero', a_left_init='glorot_uniform',
+                 t_right_init='glorot_uniform', a_right_init='one', **kwargs):
+        self.t_left_init = initializations.get(t_left_init)
+        self.a_left_init = initializations.get(a_left_init)
+        self.t_right_init = initializations.get(t_right_init)
+        self.a_right_init = initializations.get(a_right_init)
+        super(SReLU, self).__init__(**kwargs)
+
+    def build(self):
+        # one parameter per input element (samples axis excluded)
+        input_shape = self.input_shape[1:]
+        self.t_left = self.t_left_init(input_shape,
+                                       name='{}_t_left'.format(self.name))
+        self.a_left = self.a_left_init(input_shape,
+                                       name='{}_a_left'.format(self.name))
+        self.t_right = self.t_right_init(input_shape,
+                                         name='{}_t_right'.format(self.name))
+        self.a_right = self.a_right_init(input_shape,
+                                         name='{}_a_right'.format(self.name))
+        # ensure the right part is always to the right of the left
+        self.t_right_actual = self.t_left + K.abs(self.t_right)
+        self.trainable_weights = [self.t_left, self.a_left,
+                                  self.t_right, self.a_right]
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        # left and center segments: K.relu receives a_left and
+        # (t_right_actual - t_left) as its 2nd and 3rd positional
+        # arguments (negative slope and saturation value in the
+        # backend signature -- see backend docs)
+        Y_left_and_center = self.t_left + K.relu(X - self.t_left,
+                                                 self.a_left,
+                                                 self.t_right_actual - self.t_left)
+        # right segment: slope a_right past the right threshold
+        Y_right = K.relu(X - self.t_right_actual) * self.a_right
+        return Y_left_and_center + Y_right
+
+    def get_config(self):
+        config = {'name': self.__class__.__name__,
+                  't_left_init': self.t_left_init.__name__,
+                  'a_left_init': self.a_left_init.__name__,
+                  't_right_init': self.t_right_init.__name__,
+                  'a_right_init': self.a_right_init.__name__}
+        # merge with the parent config like the other layers of this
+        # module do, so base-class settings are not lost
+        base_config = super(SReLU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -21,33 +21,11 @@ class Sequential(Layer):
    def __init__(self, layers=[]):
        self.layers = []
        self.layer_cache = {}
+        self.shape_cache = {}
        for layer in layers:
            self.add(layer)
        self._cache_enabled = True

-    def __call__(self, X, mask=None, train=False):
-        # turn off layer cache temporarily
-        tmp_cache_enabled = self.cache_enabled
-        self.cache_enabled = False
-        # recursively search for a layer which is not a Sequential model
-        layer = self
-        while issubclass(layer.__class__, Sequential):
-            layer = layer.layers[0]
-        # set temporary input to first layer
-        tmp_input = layer.get_input
-        tmp_mask = None
-        layer.get_input = lambda _: X
-        if hasattr(layer, 'get_input_mask'):
-            tmp_mask = layer.get_input_mask
-            layer.get_input_mask = lambda _: mask
-        Y = self.get_output(train=train)
-        # return input from first layer to what it was
-        layer.get_input = tmp_input
-        if hasattr(layer, 'get_input_mask'):
-            layer.get_input_mask = tmp_mask
-        self.cache_enabled = tmp_cache_enabled
-        return Y
-
    @property
    def cache_enabled(self):
        return self._cache_enabled
@@ -58,11 +36,35 @@ class Sequential(Layer):
        for l in self.layers:
            l.cache_enabled = value

-    def set_previous(self, layer):
-        self.layers[0].previous = layer
+    @property
+    def layer_cache(self):
+        # reading is delegated to the parent class property
+        return super(Sequential, self).layer_cache
+
+    @layer_cache.setter
+    def layer_cache(self, value):
+        # store the cache, then share it with every contained layer
+        self._layer_cache = value
+        for child in self.layers:
+            child.layer_cache = self._layer_cache
+
+    @property
+    def shape_cache(self):
+        # reading is delegated to the parent class property
+        return super(Sequential, self).shape_cache
+
+    @shape_cache.setter
+    def shape_cache(self, value):
+        # store the cache, then share it with every contained layer
+        self._shape_cache = value
+        for child in self.layers:
+            child.shape_cache = self._shape_cache
+
+    def set_previous(self, layer, reset_weights=True):
+        '''Connect `layer` as the input of the first contained layer.'''
+        first_layer = self.layers[0]
+        first_layer.set_previous(layer, reset_weights)
+
+    def clear_previous(self, reset_weights=True):
+        '''Forward `clear_previous` to the first contained layer.'''
+        first_layer = self.layers[0]
+        first_layer.clear_previous(reset_weights)

    def add(self, layer):
        layer.layer_cache = self.layer_cache
+        layer.shape_cache = self.shape_cache
        self.layers.append(layer)
        if len(self.layers) > 1:
            self.layers[-1].set_previous(self.layers[-2])
@@ -154,9 +156,9 @@ class Sequential(Layer):
        return weights

    def set_weights(self, weights):
-        for i in range(len(self.layers)):
-            nb_param = len(self.layers[i].trainable_weights) + len(self.layers[i].non_trainable_weights)
-            self.layers[i].set_weights(weights[:nb_param])
+        for layer in self.layers:
+            nb_param = len(layer.get_weights())
+            layer.set_weights(weights[:nb_param])
            weights = weights[nb_param:]

    def get_config(self):
@@ -188,6 +190,72 @@ class Graph(Layer):
        self.output_config = []  # dicts
        self.node_config = []  # dicts
        self.layer_cache = {}
+        self.shape_cache = {}
+        self._cache_enabled = True
+
+    def __call__(self, X, mask=None, train=False):
+        '''Compute the output of the graph for new input tensor(s).
+
+        # Arguments
+            X: a dictionary mapping input names to tensors; any
+                other value is handed to the parent class `__call__`.
+            mask: when `X` is a dictionary, a dictionary mapping
+                input names to masks. It is only indexed lazily,
+                when a temporary input layer is asked for its mask.
+            train: passed to `get_output`.
+
+        # Returns
+            The result of `get_output(train=train)` computed while
+            the graph inputs are temporarily rewired to `X`.
+        '''
+        if not isinstance(X, dict):
+            return super(Graph, self).__call__(X, mask, train)
+
+        # turn off layer cache temporarily
+        tmp_cache_enabled = self.cache_enabled
+        self.cache_enabled = False
+        # create a temporary layer for each input
+        tmp_previous = {}
+        for name, input in self.inputs.items():
+            layer = Layer(batch_input_shape=input.input_shape)
+            layer.input = X[name]
+            if hasattr(self, 'get_input_mask'):
+                # bind `name` at definition time: a plain closure
+                # would see the last value of the loop variable,
+                # giving every input the mask of the last one
+                layer.get_input_mask = lambda _, name=name: mask[name]
+            # set temporary previous
+            if hasattr(input, 'previous'):
+                tmp_previous[name] = input.previous
+            input.set_previous(layer, False)
+        try:
+            Y = self.get_output(train=train)
+        finally:
+            # return previous to what it was, also when
+            # get_output raised, so the graph is not left rewired
+            for name, input in self.inputs.items():
+                if name in tmp_previous:
+                    input.set_previous(tmp_previous[name], False)
+                else:
+                    input.clear_previous(False)
+            self.cache_enabled = tmp_cache_enabled
+        return Y
+
+    @property
+    def cache_enabled(self):
+        return self._cache_enabled
+
+    @cache_enabled.setter
+    def cache_enabled(self, value):
+        self._cache_enabled = value
+        # propagate the flag to every node, then to every input
+        members = list(self.nodes.values()) + list(self.inputs.values())
+        for member in members:
+            member.cache_enabled = value
+
+    @property
+    def layer_cache(self):
+        # reading is delegated to the parent class property
+        return super(Graph, self).layer_cache
+
+    @layer_cache.setter
+    def layer_cache(self, value):
+        self._layer_cache = value
+        # share the same cache with every node, then every input
+        members = list(self.nodes.values()) + list(self.inputs.values())
+        for member in members:
+            member.layer_cache = self._layer_cache
+
+    @property
+    def shape_cache(self):
+        # reading is delegated to the parent class property
+        return super(Graph, self).shape_cache
+
+    @shape_cache.setter
+    def shape_cache(self, value):
+        self._shape_cache = value
+        # share the same cache with every node, then every input
+        members = list(self.nodes.values()) + list(self.inputs.values())
+        for member in members:
+            member.shape_cache = self._shape_cache

    @property
    def nb_input(self):
@@ -248,22 +316,35 @@ class Graph(Layer):
            if hasattr(l, 'reset_states') and getattr(l, 'stateful', False):
                l.reset_states()

-    def set_previous(self, layer, connection_map={}):
+    def set_previous(self, layer, connection_map={}, reset_weights=True):
        if self.nb_input != layer.nb_output:
            raise Exception('Cannot connect layers: '
                            'input count does not match output count.')
        if self.nb_input == 1:
-            self.inputs[self.input_order[0]].set_previous(layer)
+            self.inputs[self.input_order[0]].set_previous(layer, reset_weights)
        else:
            if not connection_map:
                raise Exception('Cannot attach multi-input layer: '
                                'no connection_map provided.')
            for k, v in connection_map.items():
                if k in self.inputs and v in layer.outputs:
-                    self.inputs[k].set_previous(layer.outputs[v])
+                    self.inputs[k].set_previous(layer.outputs[v], reset_weights)
                else:
                    raise Exception('Invalid connection map.')

+    def clear_previous(self, reset_weights=True):
+        '''Forward `clear_previous` to every input of the graph.'''
+        for input_layer in self.inputs.values():
+            input_layer.clear_previous(reset_weights)
+
+    @property
+    def input_shape(self):
+        '''Shape tuple of the single input, or a dictionary mapping
+        input names to shape tuples when there are several inputs.
+        '''
+        if self.nb_input == 1:
+            only_name = self.input_order[0]
+            return self.inputs[only_name].input_shape
+        shapes = {}
+        for name, layer in self.inputs.items():
+            shapes[name] = layer.input_shape
+        return shapes
+
    def get_input(self, train=False):
        if len(self.inputs) == len(self.outputs) == 1:
            return self.inputs[self.input_order[0]].get_input(train)
@@ -374,6 +455,7 @@ class Graph(Layer):

        self.namespace.add(name)
        layer.layer_cache = self.layer_cache
+        layer.shape_cache = self.shape_cache
        self.nodes[name] = layer
        self.node_config.append({'name': name,
                                 'input': input,
@@ -450,6 +532,7 @@ class Graph(Layer):
                sh = SiameseHead(i)
                sh.previous = s
                sh_name = outputs[i]
+                sh.name = sh_name
                self.namespace.add(sh_name)
                self.nodes[sh_name] = sh
                self.node_config.append({'name': sh_name,
@@ -79,7 +79,7 @@ class Convolution1D(Layer):
            raise Exception('Invalid border mode for Convolution1D:', border_mode)
        self.nb_filter = nb_filter
        self.filter_length = filter_length
-        self.init = initializations.get(init)
+        self.init = initializations.get(init, dim_ordering='th')
        self.activation = activations.get(activation)
        assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}'
        self.border_mode = border_mode
@@ -101,14 +101,13 @@ class Convolution1D(Layer):
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
-        self.input = K.placeholder(ndim=3)
        super(Convolution1D, self).__init__(**kwargs)

    def build(self):
        input_dim = self.input_shape[2]
        self.W_shape = (self.nb_filter, input_dim, self.filter_length, 1)
-        self.W = self.init(self.W_shape)
-        self.b = K.zeros((self.nb_filter,))
+        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+        self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
        self.trainable_weights = [self.W, self.b]
        self.regularizers = []

@@ -234,7 +233,7 @@ class Convolution2D(Layer):
        self.nb_filter = nb_filter
        self.nb_row = nb_row
        self.nb_col = nb_col
-        self.init = initializations.get(init)
+        self.init = initializations.get(init, dim_ordering=dim_ordering)
        self.activation = activations.get(activation)
        assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}'
        self.border_mode = border_mode
@@ -251,7 +250,6 @@ class Convolution2D(Layer):
        self.constraints = [self.W_constraint, self.b_constraint]

        self.initial_weights = weights
-        self.input = K.placeholder(ndim=4)
        super(Convolution2D, self).__init__(**kwargs)

    def build(self):
@@ -263,8 +261,8 @@ class Convolution2D(Layer):
            self.W_shape = (self.nb_row, self.nb_col, stack_size, self.nb_filter)
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
-        self.W = self.init(self.W_shape)
-        self.b = K.zeros((self.nb_filter,))
+        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+        self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
        self.trainable_weights = [self.W, self.b]
        self.regularizers = []

@@ -343,6 +341,195 @@ class Convolution2D(Layer):
        return dict(list(base_config.items()) + list(config.items()))


+class Convolution3D(Layer):
+    '''Convolution operator for filtering windows of three-dimensional inputs.
+    When using this layer as the first layer in a model,
+    provide the keyword argument `input_shape`
+    (tuple of integers, does not include the sample axis),
+    e.g. `input_shape=(3, 10, 128, 128)` for 10 frames of 128x128 RGB pictures.
+
+    Note: this layer will only work with Theano for the time being.
+
+    # Input shape
+        5D tensor with shape:
+        `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if dim_ordering='tf'.
+
+    # Output shape
+        5D tensor with shape:
+        `(samples, nb_filter, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, nb_filter)` if dim_ordering='tf'.
+        `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have changed due to padding.
+
+    # Arguments
+        nb_filter: Number of convolution filters to use.
+        kernel_dim1: Length of the first dimension in the convolution kernel.
+        kernel_dim2: Length of the second dimension in the convolution kernel.
+        kernel_dim3: Length of the third dimension in the convolution kernel.
+        init: name of initialization function for the weights of the layer
+            (see [initializations](../initializations.md)), or alternatively,
+            Theano function to use for weights initialization.
+            This parameter is only relevant if you don't pass
+            a `weights` argument.
+        activation: name of activation function to use
+            (see [activations](../activations.md)),
+            or alternatively, elementwise Theano function.
+            If you don't specify anything, no activation is applied
+            (ie. "linear" activation: a(x) = x).
+        weights: list of numpy arrays to set as initial weights.
+        border_mode: 'valid' or 'same'.
+        subsample: tuple of length 3. Factor by which to subsample output.
+            Also called strides elsewhere.
+            Note: 'subsample' is implemented by slicing the output of conv3d with strides=(1,1,1).
+        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the main weights matrix.
+        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
+            applied to the bias.
+        activity_regularizer: instance of [ActivityRegularizer](../regularizers.md),
+            applied to the network output.
+        W_constraint: instance of the [constraints](../constraints.md) module
+            (eg. maxnorm, nonneg), applied to the main weights matrix.
+        b_constraint: instance of the [constraints](../constraints.md) module,
+            applied to the bias.
+        dim_ordering: 'th' or 'tf'. In 'th' mode, the channels dimension
+            (the depth) is at index 1, in 'tf' mode it is at index 4.
+    '''
+    input_ndim = 5
+
+    def __init__(self, nb_filter, kernel_dim1, kernel_dim2, kernel_dim3,
+                 init='glorot_uniform', activation='linear', weights=None,
+                 border_mode='valid', subsample=(1, 1, 1), dim_ordering='th',
+                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
+                 W_constraint=None, b_constraint=None, **kwargs):
+        if K._BACKEND != 'theano':
+            raise Exception(self.__class__.__name__ +
+                            ' is currently only working with Theano backend.')
+        if border_mode not in {'valid', 'same'}:
+            raise Exception('Invalid border mode for Convolution3D:', border_mode)
+        self.nb_filter = nb_filter
+        self.kernel_dim1 = kernel_dim1
+        self.kernel_dim2 = kernel_dim2
+        self.kernel_dim3 = kernel_dim3
+        self.init = initializations.get(init, dim_ordering=dim_ordering)
+        self.activation = activations.get(activation)
+        assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}'
+        self.border_mode = border_mode
+        self.subsample = tuple(subsample)
+        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
+        self.dim_ordering = dim_ordering
+
+        self.W_regularizer = regularizers.get(W_regularizer)
+        self.b_regularizer = regularizers.get(b_regularizer)
+        self.activity_regularizer = regularizers.get(activity_regularizer)
+
+        self.W_constraint = constraints.get(W_constraint)
+        self.b_constraint = constraints.get(b_constraint)
+        self.constraints = [self.W_constraint, self.b_constraint]
+
+        self.initial_weights = weights
+        super(Convolution3D, self).__init__(**kwargs)
+
+    def build(self):
+
+        if self.dim_ordering == 'th':
+            stack_size = self.input_shape[1]
+            self.W_shape = (self.nb_filter, stack_size,
+                            self.kernel_dim1, self.kernel_dim2, self.kernel_dim3)
+        elif self.dim_ordering == 'tf':
+            stack_size = self.input_shape[4]
+            self.W_shape = (self.kernel_dim1, self.kernel_dim2, self.kernel_dim3,
+                            stack_size, self.nb_filter)
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+        self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
+        self.trainable_weights = [self.W, self.b]
+        self.regularizers = []
+
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+
+        if self.b_regularizer:
+            self.b_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
+        if self.activity_regularizer:
+            self.activity_regularizer.set_layer(self)
+            self.regularizers.append(self.activity_regularizer)
+
+        if self.initial_weights is not None:
+            self.set_weights(self.initial_weights)
+            del self.initial_weights
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        if self.dim_ordering == 'th':
+            conv_dim1 = input_shape[2]
+            conv_dim2 = input_shape[3]
+            conv_dim3 = input_shape[4]
+        elif self.dim_ordering == 'tf':
+            conv_dim1 = input_shape[1]
+            conv_dim2 = input_shape[2]
+            conv_dim3 = input_shape[3]
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+        conv_dim1 = conv_output_length(conv_dim1, self.kernel_dim1,
+                                       self.border_mode, self.subsample[0])
+        conv_dim2 = conv_output_length(conv_dim2, self.kernel_dim2,
+                                       self.border_mode, self.subsample[1])
+        conv_dim3 = conv_output_length(conv_dim3, self.kernel_dim3,
+                                       self.border_mode, self.subsample[2])
+
+        if self.dim_ordering == 'th':
+            return (input_shape[0], self.nb_filter, conv_dim1, conv_dim2, conv_dim3)
+        elif self.dim_ordering == 'tf':
+            return (input_shape[0], conv_dim1, conv_dim2, conv_dim3, self.nb_filter)
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        conv_out = K.conv3d(X, self.W, strides=self.subsample,
+                            border_mode=self.border_mode,
+                            dim_ordering=self.dim_ordering,
+                            volume_shape=self.input_shape,
+                            filter_shape=self.W_shape)
+
+        if self.dim_ordering == 'th':
+            output = conv_out + K.reshape(self.b, (1, self.nb_filter, 1, 1, 1))
+        elif self.dim_ordering == 'tf':
+            output = conv_out + K.reshape(self.b, (1, 1, 1, 1, self.nb_filter))
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+        output = self.activation(output)
+        return output
+
+    def get_config(self):
+        config = {"name": self.__class__.__name__,
+                  "nb_filter": self.nb_filter,
+                  "kernel_dim1": self.kernel_dim1,
+                  "kernel_dim2": self.kernel_dim2,
+                  "kernel_dim3": self.kernel_dim3,
+                  "dim_ordering": self.dim_ordering,
+                  "init": self.init.__name__,
+                  "activation": self.activation.__name__,
+                  "border_mode": self.border_mode,
+                  "subsample": self.subsample,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "b_constraint": self.b_constraint.get_config() if self.b_constraint else None}
+        base_config = super(Convolution3D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
 class _Pooling1D(Layer):
    '''Abstract class for different pooling 1D layers.
    '''
@@ -356,7 +543,6 @@ class _Pooling1D(Layer):
        self.pool_length = pool_length
        self.stride = stride
        self.st = (self.stride, 1)
-        self.input = K.placeholder(ndim=3)
        self.pool_size = (pool_length, 1)
        assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}'
        self.border_mode = border_mode
@@ -407,6 +593,7 @@ class MaxPooling1D(_Pooling1D):
        border_mode: 'valid' or 'same'.
            Note: 'same' will only work with TensorFlow for the time being.
    '''
+
    def __init__(self, pool_length=2, stride=None,
                 border_mode='valid', **kwargs):
        super(MaxPooling1D, self).__init__(pool_length, stride,
@@ -434,6 +621,7 @@ class AveragePooling1D(_Pooling1D):
        border_mode: 'valid' or 'same'.
            Note: 'same' will only work with TensorFlow for the time being.
    '''
+
    def __init__(self, pool_length=2, stride=None,
                 border_mode='valid', **kwargs):
        super(AveragePooling1D, self).__init__(pool_length, stride,
@@ -454,7 +642,6 @@ class _Pooling2D(Layer):
    def __init__(self, pool_size=(2, 2), strides=None, border_mode='valid',
                 dim_ordering='th', **kwargs):
        super(_Pooling2D, self).__init__(**kwargs)
-        self.input = K.placeholder(ndim=4)
        self.pool_size = tuple(pool_size)
        if strides is None:
            strides = self.pool_size
@@ -535,6 +722,7 @@ class MaxPooling2D(_Pooling2D):
        dim_ordering: 'th' or 'tf'. In 'th' mode, the channels dimension
            (the depth) is at index 1, in 'tf' mode is it at index 3.
    '''
+
    def __init__(self, pool_size=(2, 2), strides=None, border_mode='valid',
                 dim_ordering='th', **kwargs):
        super(MaxPooling2D, self).__init__(pool_size, strides, border_mode,
@@ -572,6 +760,7 @@ class AveragePooling2D(_Pooling2D):
        dim_ordering: 'th' or 'tf'. In 'th' mode, the channels dimension
            (the depth) is at index 1, in 'tf' mode is it at index 3.
    '''
+
    def __init__(self, pool_size=(2, 2), strides=None, border_mode='valid',
                 dim_ordering='th', **kwargs):
        super(AveragePooling2D, self).__init__(pool_size, strides, border_mode,
@@ -584,6 +773,157 @@ class AveragePooling2D(_Pooling2D):
        return output


+class _Pooling3D(Layer):
+    '''Abstract class for different pooling 3D layers.
+    '''
+    input_ndim = 5
+
+    def __init__(self, pool_size=(2, 2, 2), strides=None, border_mode='valid',
+                 dim_ordering='th', **kwargs):
+        super(_Pooling3D, self).__init__(**kwargs)
+        self.pool_size = tuple(pool_size)
+        if strides is None:
+            strides = self.pool_size
+        self.strides = tuple(strides)
+        assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}'
+        self.border_mode = border_mode
+        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
+        self.dim_ordering = dim_ordering
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        if self.dim_ordering == 'th':
+            len_dim1 = input_shape[2]
+            len_dim2 = input_shape[3]
+            len_dim3 = input_shape[4]
+        elif self.dim_ordering == 'tf':
+            len_dim1 = input_shape[1]
+            len_dim2 = input_shape[2]
+            len_dim3 = input_shape[3]
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+        len_dim1 = conv_output_length(len_dim1, self.pool_size[0],
+                                      self.border_mode, self.strides[0])
+        len_dim2 = conv_output_length(len_dim2, self.pool_size[1],
+                                      self.border_mode, self.strides[1])
+        len_dim3 = conv_output_length(len_dim3, self.pool_size[2],
+                                      self.border_mode, self.strides[2])
+
+        if self.dim_ordering == 'th':
+            return (input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3)
+        elif self.dim_ordering == 'tf':
+            return (input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4])
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+    def _pooling_function(self, inputs, pool_size, strides,
+                          border_mode, dim_ordering):
+        raise NotImplementedError
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        output = self._pooling_function(inputs=X, pool_size=self.pool_size,
+                                        strides=self.strides,
+                                        border_mode=self.border_mode,
+                                        dim_ordering=self.dim_ordering)
+        return output
+
+    def get_config(self):
+        config = {'name': self.__class__.__name__,
+                  'pool_size': self.pool_size,
+                  'border_mode': self.border_mode,
+                  'strides': self.strides,
+                  'dim_ordering': self.dim_ordering}
+        base_config = super(_Pooling3D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+class MaxPooling3D(_Pooling3D):
+    '''Max pooling operation for 3D data (spatial or spatio-temporal).
+
+    Note: this layer will only work with Theano for the time being.
+
+    # Input shape
+        5D tensor with shape:
+        `(samples, channels, len_pool_dim1, len_pool_dim2, len_pool_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, len_pool_dim1, len_pool_dim2, len_pool_dim3, channels)` if dim_ordering='tf'.
+
+    # Output shape
+        5D tensor with shape:
+        `(samples, channels, pooled_dim1, pooled_dim2, pooled_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, pooled_dim1, pooled_dim2, pooled_dim3, channels)` if dim_ordering='tf'.
+
+    # Arguments
+        pool_size: tuple of 3 integers,
+            factors by which to downscale (dim1, dim2, dim3).
+            (2, 2, 2) will halve the size of the 3D input in each dimension.
+        strides: tuple of 3 integers, or None. Strides values.
+        border_mode: 'valid' or 'same'.
+        dim_ordering: 'th' or 'tf'. In 'th' mode, the channels dimension
+            (the depth) is at index 1, in 'tf' mode it is at index 4.
+    '''
+
+    def __init__(self, pool_size=(2, 2, 2), strides=None, border_mode='valid',
+                 dim_ordering='th', **kwargs):
+        if K._BACKEND != 'theano':
+            raise Exception(self.__class__.__name__ +
+                            ' is currently only working with Theano backend.')
+        super(MaxPooling3D, self).__init__(pool_size, strides, border_mode,
+                                           dim_ordering, **kwargs)
+
+    def _pooling_function(self, inputs, pool_size, strides,
+                          border_mode, dim_ordering):
+        output = K.pool3d(inputs, pool_size, strides,
+                          border_mode, dim_ordering, pool_mode='max')
+        return output
+
+
+class AveragePooling3D(_Pooling3D):
+    '''Average pooling operation for 3D data (spatial or spatio-temporal).
+
+    Note: this layer will only work with Theano for the time being.
+
+    # Input shape
+        5D tensor with shape:
+        `(samples, channels, len_pool_dim1, len_pool_dim2, len_pool_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, len_pool_dim1, len_pool_dim2, len_pool_dim3, channels)` if dim_ordering='tf'.
+
+    # Output shape
+        5D tensor with shape:
+        `(samples, channels, pooled_dim1, pooled_dim2, pooled_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, pooled_dim1, pooled_dim2, pooled_dim3, channels)` if dim_ordering='tf'.
+
+    # Arguments
+        pool_size: tuple of 3 integers,
+            factors by which to downscale (dim1, dim2, dim3).
+            (2, 2, 2) will halve the size of the 3D input in each dimension.
+        strides: tuple of 3 integers, or None. Strides values.
+        border_mode: 'valid' or 'same'.
+        dim_ordering: 'th' or 'tf'. In 'th' mode, the channels dimension
+            (the depth) is at index 1, in 'tf' mode it is at index 4.
+    '''
+
+    def __init__(self, pool_size=(2, 2, 2), strides=None, border_mode='valid',
+                 dim_ordering='th', **kwargs):
+        if K._BACKEND != 'theano':
+            raise Exception(self.__class__.__name__ +
+                            ' is currently only working with Theano backend.')
+        super(AveragePooling3D, self).__init__(pool_size, strides, border_mode,
+                                               dim_ordering, **kwargs)
+
+    def _pooling_function(self, inputs, pool_size, strides,
+                          border_mode, dim_ordering):
+        output = K.pool3d(inputs, pool_size, strides,
+                          border_mode, dim_ordering, pool_mode='avg')
+        return output
+
+
 class UpSampling1D(Layer):
    '''Repeat each temporal step `length` times along the time axis.

@@ -601,7 +941,6 @@ class UpSampling1D(Layer):
    def __init__(self, length=2, **kwargs):
        super(UpSampling1D, self).__init__(**kwargs)
        self.length = length
-        self.input = K.placeholder(ndim=3)

    @property
    def output_shape(self):
@@ -646,7 +985,6 @@ class UpSampling2D(Layer):

    def __init__(self, size=(2, 2), dim_ordering='th', **kwargs):
        super(UpSampling2D, self).__init__(**kwargs)
-        self.input = K.placeholder(ndim=4)
        self.size = tuple(size)
        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
        self.dim_ordering = dim_ordering
@@ -679,6 +1017,71 @@ class UpSampling2D(Layer):
        return dict(list(base_config.items()) + list(config.items()))


+class UpSampling3D(Layer):
+    '''Repeat the first, second and third dimension of the data
+    by size[0], size[1] and size[2] respectively.
+
+    Note: this layer will only work with Theano for the time being.
+
+    # Input shape
+        5D tensor with shape:
+        `(samples, channels, dim1, dim2, dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, dim1, dim2, dim3, channels)` if dim_ordering='tf'.
+
+    # Output shape
+        5D tensor with shape:
+        `(samples, channels, upsampled_dim1, upsampled_dim2, upsampled_dim3)` if dim_ordering='th'
+        or 5D tensor with shape:
+        `(samples, upsampled_dim1, upsampled_dim2, upsampled_dim3, channels)` if dim_ordering='tf'.
+
+    # Arguments
+        size: tuple of 3 integers. The upsampling factors for dim1, dim2 and dim3.
+        dim_ordering: 'th' or 'tf'.
+            In 'th' mode, the channels dimension (the depth)
+            is at index 1, in 'tf' mode it is at index 4.
+    '''
+    input_ndim = 5
+
+    def __init__(self, size=(2, 2, 2), dim_ordering='th', **kwargs):
+        if K._BACKEND != 'theano':
+            raise Exception(self.__class__.__name__ +
+                            ' is currently only working with Theano backend.')
+        super(UpSampling3D, self).__init__(**kwargs)
+        self.size = tuple(size)
+        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
+        self.dim_ordering = dim_ordering
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        if self.dim_ordering == 'th':
+            return (input_shape[0],
+                    input_shape[1],
+                    self.size[0] * input_shape[2],
+                    self.size[1] * input_shape[3],
+                    self.size[2] * input_shape[4])
+        elif self.dim_ordering == 'tf':
+            return (input_shape[0],
+                    self.size[0] * input_shape[1],
+                    self.size[1] * input_shape[2],
+                    self.size[2] * input_shape[3],
+                    input_shape[4])
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        return K.resize_volumes(X, self.size[0], self.size[1], self.size[2],
+                                self.dim_ordering)
+
+    def get_config(self):
+        config = {'name': self.__class__.__name__,
+                  'size': self.size}
+        base_config = super(UpSampling3D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
 class ZeroPadding1D(Layer):
    '''Zero-padding layer for 1D input (e.g. temporal sequence).

@@ -698,13 +1101,13 @@ class ZeroPadding1D(Layer):
    def __init__(self, padding=1, **kwargs):
        super(ZeroPadding1D, self).__init__(**kwargs)
        self.padding = padding
-        self.input = K.placeholder(ndim=3)

    @property
    def output_shape(self):
        input_shape = self.input_shape
+        length = input_shape[1] + self.padding * 2 if input_shape[1] is not None else None
        return (input_shape[0],
-                input_shape[1] + self.padding * 2,
+                length,
                input_shape[2])

    def get_output(self, train=False):
@@ -739,7 +1142,6 @@ class ZeroPadding2D(Layer):
    def __init__(self, padding=(1, 1), dim_ordering='th', **kwargs):
        super(ZeroPadding2D, self).__init__(**kwargs)
        self.padding = tuple(padding)
-        self.input = K.placeholder(ndim=4)
        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
        self.dim_ordering = dim_ordering

@@ -747,14 +1149,18 @@ class ZeroPadding2D(Layer):
    def output_shape(self):
        input_shape = self.input_shape
        if self.dim_ordering == 'th':
+            width = input_shape[2] + 2 * self.padding[0] if input_shape[2] is not None else None
+            height = input_shape[3] + 2 * self.padding[1] if input_shape[3] is not None else None
            return (input_shape[0],
                    input_shape[1],
-                    input_shape[2] + 2 * self.padding[0],
-                    input_shape[3] + 2 * self.padding[1])
+                    width,
+                    height)
        elif self.dim_ordering == 'tf':
+            width = input_shape[1] + 2 * self.padding[0] if input_shape[1] is not None else None
+            height = input_shape[2] + 2 * self.padding[1] if input_shape[2] is not None else None
            return (input_shape[0],
-                    input_shape[1] + 2 * self.padding[0],
-                    input_shape[2] + 2 * self.padding[1],
+                    width,
+                    height,
                    input_shape[3])
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
@@ -769,3 +1175,68 @@ class ZeroPadding2D(Layer):
                  'padding': self.padding}
        base_config = super(ZeroPadding2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
+
+
+class ZeroPadding3D(Layer):
+    '''Zero-padding layer for 3D data (spatial or spatio-temporal).
+
+    Note: this layer will only work with Theano for the time being.
+
+    # Input shape
+        5D tensor with shape:
+        (samples, depth, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad)
+
+    # Output shape
+        5D tensor with shape:
+        (samples, depth, first_padded_axis, second_padded_axis, third_padded_axis)
+
+    # Arguments
+        padding: tuple of int (length 3)
+            How many zeros to add at the beginning and end of
+            the 3 padding dimensions (axis 3, 4 and 5).
+    '''
+    input_ndim = 5
+
+    def __init__(self, padding=(1, 1, 1), dim_ordering='th', **kwargs):
+        if K._BACKEND != 'theano':
+            raise Exception(self.__class__.__name__ +
+                            ' is currently only working with Theano backend.')
+        super(ZeroPadding3D, self).__init__(**kwargs)
+        self.padding = tuple(padding)
+        assert dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
+        self.dim_ordering = dim_ordering
+
+    @property
+    def output_shape(self):
+        input_shape = self.input_shape
+        if self.dim_ordering == 'th':
+            dim1 = input_shape[2] + 2 * self.padding[0] if input_shape[2] is not None else None
+            dim2 = input_shape[3] + 2 * self.padding[1] if input_shape[3] is not None else None
+            dim3 = input_shape[4] + 2 * self.padding[2] if input_shape[4] is not None else None
+            return (input_shape[0],
+                    input_shape[1],
+                    dim1,
+                    dim2,
+                    dim3)
+        elif self.dim_ordering == 'tf':
+            dim1 = input_shape[1] + 2 * self.padding[0] if input_shape[1] is not None else None
+            dim2 = input_shape[2] + 2 * self.padding[1] if input_shape[2] is not None else None
+            dim3 = input_shape[3] + 2 * self.padding[2] if input_shape[3] is not None else None
+            return (input_shape[0],
+                    dim1,
+                    dim2,
+                    dim3,
+                    input_shape[4])
+        else:
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+
+    def get_output(self, train=False):
+        X = self.get_input(train)
+        return K.spatial_3d_padding(X, padding=self.padding,
+                                    dim_ordering=self.dim_ordering)
+
+    def get_config(self):
+        config = {'name': self.__class__.__name__,
+                  'padding': self.padding}
+        base_config = super(ZeroPadding3D, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -12,9 +12,7 @@ from .. import backend as K
 from .. import activations, initializations, regularizers, constraints
 from ..regularizers import ActivityRegularizer

-import marshal
-import types
-import sys
+import inspect


 class Layer(object):
@@ -45,19 +43,26 @@ class Layer(object):
                          'name'}
        for kwarg in kwargs:
            assert kwarg in allowed_kwargs, 'Keyword argument not understood: ' + kwarg
+
+        if 'name' in kwargs:
+            self.name = kwargs['name']
+        else:
+            self.name = self.__class__.__name__.lower()
+
+        if 'cache_enabled' in kwargs:
+            self.cache_enabled = kwargs['cache_enabled']
+        else:
+            self.cache_enabled = True
+
        if 'batch_input_shape' in kwargs:
            self.set_input_shape(tuple(kwargs['batch_input_shape']))
        elif 'input_shape' in kwargs:
            self.set_input_shape((None,) + tuple(kwargs['input_shape']))
-        self.trainable = True
+
        if 'trainable' in kwargs:
            self.trainable = kwargs['trainable']
-        self.name = self.__class__.__name__.lower()
-        if 'name' in kwargs:
-            self.name = kwargs['name']
-        self.cache_enabled = True
-        if 'cache_enabled' in kwargs:
-            self.cache_enabled = kwargs['cache_enabled']
+        else:
+            self.trainable = True

    @property
    def name(self):
@@ -75,22 +80,56 @@ class Layer(object):
    def cache_enabled(self, value):
        self._cache_enabled = value

+    @property
+    def layer_cache(self):
+        if hasattr(self, '_layer_cache'):
+            return self._layer_cache
+        else:
+            return None
+
+    @layer_cache.setter
+    def layer_cache(self, value):
+        self._layer_cache = value
+
+    @property
+    def shape_cache(self):
+        if hasattr(self, '_shape_cache'):
+            return self._shape_cache
+        else:
+            return None
+
+    @shape_cache.setter
+    def shape_cache(self, value):
+        self._shape_cache = value
+
    def __call__(self, X, mask=None, train=False):
-        # set temporary input
-        tmp_input = self.get_input
-        tmp_mask = None
+        # reset layer cache temporarily
+        tmp_layer_cache = self.layer_cache
+        tmp_shape_cache = self.shape_cache
+        self.layer_cache = {}
+        self.shape_cache = {}
+        # create a temporary layer
+        layer = Layer(batch_input_shape=self.input_shape)
+        layer.name = "dummy"
+        layer.input = X
        if hasattr(self, 'get_input_mask'):
-            tmp_mask = self.get_input_mask
-            self.get_input_mask = lambda _: mask
-        self.get_input = lambda _: X
+            layer.get_input_mask = lambda _: mask
+        # set temporary previous
+        tmp_previous = None
+        if hasattr(self, 'previous'):
+            tmp_previous = self.previous
+        self.set_previous(layer, False)
        Y = self.get_output(train=train)
-        # return input to what it was
-        if hasattr(self, 'get_input_mask'):
-            self.get_input_mask = tmp_mask
-        self.get_input = tmp_input
+        # return previous to what it was
+        if tmp_previous is not None:
+            self.set_previous(tmp_previous, False)
+        else:
+            self.clear_previous(False)
+        self.layer_cache = tmp_layer_cache
+        self.shape_cache = tmp_shape_cache
        return Y

-    def set_previous(self, layer):
+    def set_previous(self, layer, reset_weights=True):
        '''Connect a layer to its parent in the computational graph.
        '''
        assert self.nb_input == layer.nb_output == 1, 'Cannot connect layers: input count and output count should be 1.'
@@ -101,8 +140,27 @@ class Layer(object):
                                                                str(layer.output_shape))
        if layer.get_output_mask() is not None:
            assert self.supports_masked_input(), 'Cannot connect non-masking layer to layer with masked output.'
+        if not reset_weights:
+            assert layer.output_shape == self.input_shape, ('Cannot connect layers without resetting weights: ' +
+                                                            'expected input with shape ' +
+                                                            str(self.input_shape) +
+                                                            ' but previous layer has output_shape ' +
+                                                            str(layer.output_shape))
        self.previous = layer
-        self.build()
+        if reset_weights:
+            self.build()
+
+    def clear_previous(self, reset_weights=True):
+        '''Unlink a layer from its parent in the computational graph.
+
+        This is only allowed if the layer has an `input` attribute.
+        '''
+        if not hasattr(self, 'input'):
+            raise Exception('Cannot clear previous for non-input layers')
+        if hasattr(self, 'previous'):
+            del self.previous
+            if reset_weights:
+                self.build()

    def build(self):
        '''Instantiation of layer weights.
@@ -137,7 +195,15 @@ class Layer(object):
        # if layer is not connected (e.g. input layer),
        # input shape can be set manually via _input_shape attribute.
        if hasattr(self, 'previous'):
-            return self.previous.output_shape
+            if self.shape_cache is not None and self.cache_enabled:
+                previous_layer_id = id(self.previous)
+                if previous_layer_id in self.shape_cache:
+                    return self.shape_cache[previous_layer_id]
+            previous_size = self.previous.output_shape
+            if self.shape_cache is not None and self.cache_enabled:
+                previous_layer_id = id(self.previous)
+                self.shape_cache[previous_layer_id] = previous_size
+            return previous_size
        elif hasattr(self, '_input_shape'):
            return self._input_shape
        else:
@@ -168,20 +234,20 @@ class Layer(object):
        if hasattr(self, 'previous'):
            # to avoid redundant computations,
            # layer outputs are cached when possible.
-            if hasattr(self, 'layer_cache') and self.cache_enabled:
+            if self.layer_cache is not None and self.cache_enabled:
                previous_layer_id = '%s_%s' % (id(self.previous), train)
                if previous_layer_id in self.layer_cache:
                    return self.layer_cache[previous_layer_id]
            previous_output = self.previous.get_output(train=train)
-            if hasattr(self, 'layer_cache') and self.cache_enabled:
+            if self.layer_cache is not None and self.cache_enabled:
                previous_layer_id = '%s_%s' % (id(self.previous), train)
                self.layer_cache[previous_layer_id] = previous_output
            return previous_output
        elif hasattr(self, 'input'):
            return self.input
        else:
-            raise Exception('Layer is not connected' +
-                            ' and is not an input layer.')
+            self.input = K.placeholder(shape=self.input_shape)
+            return self.input

    def supports_masked_input(self):
        '''Whether or not this layer respects the output mask of its previous
@@ -230,7 +296,7 @@ class Layer(object):
                                             str(len(weights)) + ' provided weights)')
        for p, w in zip(params, weights):
            if K.get_value(p).shape != w.shape:
-                raise Exception('Layer shape %s not compatible with weight shape %s.' % (K.get_value(p).shape, w.shape))
+                raise Exception('Layer weight shape %s not compatible with provided weight shape %s.' % (K.get_value(p).shape, w.shape))
            K.set_value(p, w)

    def get_weights(self):
@@ -327,8 +393,6 @@ class Masking(MaskedLayer):
    def __init__(self, mask_value=0., **kwargs):
        super(Masking, self).__init__(**kwargs)
        self.mask_value = mask_value
-        if (not hasattr(self, 'input')):
-            self.input = K.placeholder(ndim=3)

    def get_output_mask(self, train=False):
        X = self.get_input(train)
@@ -349,18 +413,16 @@ class Merge(Layer):
    '''Merge the output of a list of layers or containers into a single tensor.

    # Arguments
-        mode: one of {sum, mul, concat, ave, dot}.
+        mode: one of {sum, mul, concat, ave, join, cos, dot}.
            sum: sum the outputs (shapes must match)
            mul: multiply the outputs element-wise (shapes must match)
            concat: concatenate the outputs along the axis specified by `concat_axis`
            ave: average the outputs (shapes must match)
+            join: places the outputs in an OrderedDict (inputs must be named)
        concat_axis: axis to use in `concat` mode.
        dot_axes: axis or axes to use in `dot` mode
            (see [the Numpy documentation](http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.tensordot.html) for more details).

-    # TensorFlow warning
-        `dot` mode only works with Theano for the time being.
-
    # Examples

    ```python
@@ -399,9 +461,6 @@ class Merge(Layer):
                                'be merged using ' + mode + ' mode. ' +
                                'Layer shapes: %s' % ([l.output_shape for l in layers]))
        if mode in {'cos', 'dot'}:
-            if K._BACKEND != 'theano':
-                raise Exception('"' + mode + '" merge mode will only work with Theano.')
-
            if len(layers) > 2:
                raise Exception(mode + ' merge takes exactly 2 layers')
            shape1 = layers[0].output_shape
@@ -455,6 +514,10 @@ class Merge(Layer):
                    self.constraints.append(c)
        super(Merge, self).__init__()

+    @property
+    def input_shape(self):
+        return [layer.input_shape for layer in self.layers]
+
    @property
    def output_shape(self):
        input_shapes = [layer.output_shape for layer in self.layers]
@@ -514,23 +577,18 @@ class Merge(Layer):
                s *= self.layers[i].get_output(train)
            return s
        elif self.mode == 'dot':
-            if K._BACKEND != 'theano':
-                raise Exception('"dot" merge mode will only work with Theano.')
-            from theano import tensor as T
            l1 = self.layers[0].get_output(train)
            l2 = self.layers[1].get_output(train)
-            output = T.batched_tensordot(l1, l2, self.dot_axes)
+            output = K.batch_dot(l1, l2, self.dot_axes)
            output_shape = list(self.output_shape)
-            output_shape[0] = l1.shape[0]
-            output = output.reshape(tuple(output_shape))
+            output_shape[0] = -1
+            output = K.reshape(output, (tuple(output_shape)))
            return output
        elif self.mode == 'cos':
-            if K._BACKEND != 'theano':
-                raise Exception('"dot" merge mode will only work with Theano.')
-            import theano
            l1 = self.layers[0].get_output(train)
            l2 = self.layers[1].get_output(train)
-            output = T.batched_tensordot(l1, l2, self.dot_axes) / T.sqrt(T.batched_tensordot(l1, l1, self.dot_axes) * T.batched_tensordot(l2, l2, self.dot_axes))
+            output = K.batch_dot(l1, l2, self.dot_axes) / K.sqrt(
+                K.batch_dot(l1, l1, self.dot_axes) * K.batch_dot(l2, l2, self.dot_axes))
            output = output.dimshuffle((0, 'x'))
            return output
        else:
@@ -897,7 +955,8 @@ class Dense(Layer):
            If you don't specify anything, no activation is applied
            (ie. "linear" activation: a(x) = x).
        weights: list of numpy arrays to set as initial weights.
-            The list should have 1 element, of shape `(input_dim, output_dim)`.
+            The list should have 2 elements, of shape `(input_dim, output_dim)`
+            and `(output_dim,)` for weights and biases respectively.
        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the main weights matrix.
        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
@@ -934,14 +993,15 @@ class Dense(Layer):
        self.input_dim = input_dim
        if self.input_dim:
            kwargs['input_shape'] = (self.input_dim,)
-        self.input = K.placeholder(ndim=2)
        super(Dense, self).__init__(**kwargs)

    def build(self):
        input_dim = self.input_shape[1]

-        self.W = self.init((input_dim, self.output_dim))
-        self.b = K.zeros((self.output_dim,))
+        self.W = self.init((input_dim, self.output_dim),
+                           name='{}_W'.format(self.name))
+        self.b = K.zeros((self.output_dim,),
+                         name='{}_b'.format(self.name))

        self.trainable_weights = [self.W, self.b]

@@ -1009,7 +1069,8 @@ class TimeDistributedDense(MaskedLayer):
            If you don't specify anything, no activation is applied
            (ie. "linear" activation: a(x) = x).
        weights: list of numpy arrays to set as initial weights.
-            The list should have 1 element, of shape `(input_dim, output_dim)`.
+            The list should have 2 elements, of shape `(input_dim, output_dim)`
+            and `(output_dim,)` for weights and biases respectively.
        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the main weights matrix.
        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
@@ -1049,14 +1110,15 @@ class TimeDistributedDense(MaskedLayer):
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
-        self.input = K.placeholder(ndim=3)
        super(TimeDistributedDense, self).__init__(**kwargs)

    def build(self):
        input_dim = self.input_shape[2]

-        self.W = self.init((input_dim, self.output_dim))
-        self.b = K.zeros((self.output_dim,))
+        self.W = self.init((input_dim, self.output_dim),
+                           name='{}_W'.format(self.name))
+        self.b = K.zeros((self.output_dim,),
+                         name='{}_b'.format(self.name))

        self.trainable_weights = [self.W, self.b]
        self.regularizers = []
@@ -1083,17 +1145,18 @@ class TimeDistributedDense(MaskedLayer):
        return (input_shape[0], input_shape[1], self.output_dim)

    def get_output(self, train=False):
-        X = self.get_input(train)
-
-        def step(x, states):
-            output = K.dot(x, self.W) + self.b
-            return output, []
-
-        last_output, outputs, states = K.rnn(step, X,
-                                             initial_states=[],
-                                             mask=None)
-        outputs = self.activation(outputs)
-        return outputs
+        '''Apply the same dense transform to every timestep of the input.
+
+        The input is flattened to 2D, multiplied by `W` and shifted by `b`,
+        reshaped back to 3D and finally passed through the activation.
+        '''
+        X = self.get_input(train)  # (samples, timesteps, input_dim)
+        # Squash samples and timesteps into a single axis
+        x = K.reshape(X, (-1, self.input_shape[-1]))  # (samples * timesteps, input_dim)
+        Y = K.dot(x, self.W) + self.b  # (samples * timesteps, output_dim)
+        # We have to reshape Y to (samples, timesteps, output_dim)
+        input_length = self.input_shape[1]
+        # Note: input_length will always be provided when using tensorflow backend.
+        if not input_length:
+            input_length = K.shape(X)[1]
+        Y = K.reshape(Y, (-1, input_length, self.output_shape[-1]))  # (samples, timesteps, output_dim)
+        Y = self.activation(Y)
+        return Y

    def get_config(self):
        config = {'name': self.__class__.__name__,
@@ -1203,7 +1266,9 @@ class AutoEncoder(Layer):

        self._output_reconstruction = output_reconstruction
        self.encoder = encoder
+        self.encoder.layer_cache = self.layer_cache
        self.decoder = decoder
+        self.decoder.layer_cache = self.layer_cache

        if output_reconstruction:
            self.decoder.set_previous(self.encoder)
@@ -1241,8 +1306,30 @@ class AutoEncoder(Layer):
                    self.trainable_weights.append(p)
                    self.constraints.append(c)

-    def set_previous(self, node):
-        self.encoder.set_previous(node)
+    @property
+    def layer_cache(self):
+        return super(AutoEncoder, self).layer_cache
+
+    @layer_cache.setter
+    def layer_cache(self, value):
+        self._layer_cache = value
+        self.encoder.layer_cache = self._layer_cache
+        self.decoder.layer_cache = self._layer_cache
+
+    @property
+    def shape_cache(self):
+        return super(AutoEncoder, self).shape_cache
+
+    @shape_cache.setter
+    def shape_cache(self, value):
+        self._shape_cache = value
+        self.encoder.shape_cache = self._shape_cache
+        self.decoder.shape_cache = self._shape_cache
+
+    def set_previous(self, node, reset_weights=True):
+        '''Connect the encoder to `node`.
+
+        `reset_weights` is forwarded to the encoder; when it is true the
+        autoencoder's own `build()` is run afterwards as well.
+        '''
+        self.encoder.set_previous(node, reset_weights)
+        if reset_weights:
+            self.build()

    def get_weights(self):
        weights = []
@@ -1330,14 +1417,15 @@ class MaxoutDense(Layer):
        self.input_dim = input_dim
        if self.input_dim:
            kwargs['input_shape'] = (self.input_dim,)
-        self.input = K.placeholder(ndim=2)
        super(MaxoutDense, self).__init__(**kwargs)

    def build(self):
        input_dim = self.input_shape[1]

-        self.W = self.init((self.nb_feature, input_dim, self.output_dim))
-        self.b = K.zeros((self.nb_feature, self.output_dim))
+        self.W = self.init((self.nb_feature, input_dim, self.output_dim),
+                           name='{}_W'.format(self.name))
+        self.b = K.zeros((self.nb_feature, self.output_dim),
+                         name='{}_b'.format(self.name))

        self.trainable_weights = [self.W, self.b]
        self.regularizers = []
@@ -1400,46 +1488,56 @@ class Lambda(Layer):
            Takes one argument: the output of previous layer
        output_shape: Expected output shape from function.
            Could be a tuple or a function of the shape of the input
+        arguments: optional dictionary of keyword arguments to be passed
+            to the function.
    '''
-    def __init__(self, function, output_shape=None, **kwargs):
+    def __init__(self, function, output_shape=None, arguments={}, **kwargs):
        super(Lambda, self).__init__(**kwargs)
-        py3 = sys.version_info[0] == 3
-        if py3:
-            self.function = marshal.dumps(function.__code__)
-        else:
-            assert hasattr(function, 'func_code'), ('The Lambda layer "function"'
-                                                    ' argument must be a Python function.')
-            self.function = marshal.dumps(function.func_code)
+        self.function = function
+        self.arguments = arguments
        if output_shape is None:
            self._output_shape = None
        elif type(output_shape) in {tuple, list}:
            self._output_shape = tuple(output_shape)
        else:
-            if py3:
-                self._output_shape = marshal.dumps(output_shape.__code__)
-            else:
-                self._output_shape = marshal.dumps(output_shape.func_code)
+            assert hasattr(output_shape, '__call__'), 'In Lambda, `output_shape` must be a list, a tuple, or a function.'
+            self._output_shape = output_shape
        super(Lambda, self).__init__()

    @property
    def output_shape(self):
+        '''Shape of the layer output, including the samples axis.
+
+        - No `output_shape` given: on TensorFlow the shape is read from the
+          layer output; otherwise the input shape is returned unchanged.
+        - Tuple or list given: the samples dimension of the input shape
+          (or None if there is no input shape) is prepended to it.
+        - Function given: it is called with the input shape and must
+          return a list or a tuple, else an Exception is raised.
+        '''
        if self._output_shape is None:
+            # if TensorFlow, we can infer the output shape directly:
+            if K._BACKEND == 'tensorflow':
+                # we assume output shape is not dependent on train/test mode
+                x = self.get_output()
+                return K.int_shape(x)
+            # otherwise, we default to the input shape
            return self.input_shape
-        elif type(self._output_shape) == tuple:
-            return (self.input_shape[0], ) + self._output_shape
+        elif type(self._output_shape) in {tuple, list}:
+            nb_samples = self.input_shape[0] if self.input_shape else None
+            return (nb_samples,) + tuple(self._output_shape)
        else:
-            output_shape_func = marshal.loads(self._output_shape)
-            output_shape_func = types.FunctionType(output_shape_func, globals())
-            shape = output_shape_func(self.input_shape)
+            shape = self._output_shape(self.input_shape)
            if type(shape) not in {list, tuple}:
                raise Exception('output_shape function must return a tuple')
            return tuple(shape)

    def get_output(self, train=False):
        X = self.get_input(train)
-        func = marshal.loads(self.function)
-        func = types.FunctionType(func, globals())
-        return func(X)
+        arguments = self.arguments
+        arg_spec = inspect.getargspec(self.function)
+        if 'train' in arg_spec.args:
+            arguments['train'] = train
+        return self.function(X, **arguments)
+
+    def get_config(self):
+        # note: not serializable at the moment.
+        config = {'function': self.function,
+                  'output_shape': self._output_shape,
+                  'arguments': self.arguments}
+        base_config = super(Lambda, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))


 class MaskedLambda(MaskedLayer, Lambda):
@@ -1459,8 +1557,10 @@ class LambdaMerge(Lambda):
            list of outputs from input layers
        output_shape - Expected output shape from function.
            Could be a tuple or a function of list of input shapes
+        arguments: optional dictionary of keyword arguments to be passed
+            to the function.
    '''
-    def __init__(self, layers, function, output_shape=None):
+    def __init__(self, layers, function, output_shape=None, arguments={}):
        if len(layers) < 2:
            raise Exception('Please specify two or more input layers '
                            '(or containers) to merge.')
@@ -1469,6 +1569,7 @@ class LambdaMerge(Lambda):
        self.regularizers = []
        self.constraints = []
        self.updates = []
+        self.arguments = arguments
        for l in self.layers:
            params, regs, consts, updates = l.get_params()
            self.regularizers += regs
@@ -1478,20 +1579,14 @@ class LambdaMerge(Lambda):
                if p not in self.trainable_weights:
                    self.trainable_weights.append(p)
                    self.constraints.append(c)
-        py3 = sys.version_info[0] == 3
-        if py3:
-            self.function = marshal.dumps(function.__code__)
-        else:
-            self.function = marshal.dumps(function.func_code)
+        self.function = function
        if output_shape is None:
            self._output_shape = None
        elif type(output_shape) in {tuple, list}:
            self._output_shape = tuple(output_shape)
        else:
-            if py3:
-                self._output_shape = marshal.dumps(output_shape.__code__)
-            else:
-                self._output_shape = marshal.dumps(output_shape.func_code)
+            assert hasattr(output_shape, '__call__'), 'In LambdaMerge, `output_shape` must be a list, a tuple, or a function.'
+            self._output_shape = output_shape
        super(Lambda, self).__init__()

    @property
@@ -1499,24 +1594,24 @@ class LambdaMerge(Lambda):
        input_shapes = [layer.output_shape for layer in self.layers]
        if self._output_shape is None:
            return input_shapes[0]
-        elif type(self._output_shape) == tuple:
-            return (input_shapes[0][0], ) + self._output_shape
+        elif type(self._output_shape) in {tuple, list}:
+            return (input_shapes[0][0],) + self._output_shape
        else:
-            output_shape_func = marshal.loads(self._output_shape)
-            output_shape_func = types.FunctionType(output_shape_func, globals())
-            shape = output_shape_func(input_shapes)
+            shape = self._output_shape(input_shapes)
            if type(shape) not in {list, tuple}:
-                raise Exception('output_shape function must return a tuple.')
+                raise Exception('In LambdaMerge, the `output_shape` function must return a tuple.')
            return tuple(shape)

    def get_params(self):
+        '''Return `(trainable_weights, regularizers, constraints, updates)`.'''
        return self.trainable_weights, self.regularizers, self.constraints, self.updates

    def get_output(self, train=False):
-        func = marshal.loads(self.function)
-        func = types.FunctionType(func, globals())
        inputs = [layer.get_output(train) for layer in self.layers]
-        return func(inputs)
+        arguments = self.arguments
+        arg_spec = inspect.getargspec(self.function)
+        if 'train' in arg_spec.args:
+            arguments['train'] = train
+        return self.function(inputs, **arguments)

    def get_input(self, train=False):
        res = []
@@ -1552,10 +1647,12 @@ class LambdaMerge(Lambda):
            weights = weights[nb_param:]

    def get_config(self):
+        '''Return the parent config extended with this layer's own entries:
+        class name, configs of the merged layers, function, output shape
+        and arguments (own entries override same-named parent entries).
+        '''
+        # note: not serializable at the moment.
        config = {'name': self.__class__.__name__,
                  'layers': [l.get_config() for l in self.layers],
                  'function': self.function,
-                  'output_shape': self._output_shape}
+                  'output_shape': self._output_shape,
+                  'arguments': self.arguments}
        base_config = super(LambdaMerge, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -1614,6 +1711,10 @@ class Siamese(Layer):
                    self.constraints.append(c)
        super(Siamese, self).__init__()

+    @property
+    def input_shape(self):
+        return [layer.output_shape for layer in self.inputs]
+
    @property
    def output_shape(self):
        if self.merge_mode is None:
@@ -1651,16 +1752,12 @@ class Siamese(Layer):
        return self.trainable_weights, self.regularizers, self.constraints, self.updates

    def set_layer_input(self, head):
-        layer = self.layer
-        from ..layers.containers import Sequential
-        while issubclass(layer.__class__, Sequential):
-            layer = layer.layers[0]
-        layer.previous = self.inputs[head]
+        '''Connect the shared layer to input number `head`
+        without resetting its weights.
+        '''
+        self.layer.set_previous(self.inputs[head], reset_weights=False)

    def get_output_at(self, head, train=False):
+        '''Output of the shared layer applied to input number `head`.
+
+        The output and the output mask of that input are fed to the
+        shared layer, together with the `train` flag.
+        '''
        X = self.inputs[head].get_output(train)
        mask = self.inputs[head].get_output_mask(train)
-        Y = self.layer(X, mask)
+        Y = self.layer(X, mask=mask, train=train)
        return Y

    def get_output_shape(self, head, train=False):
@@ -1703,24 +1800,17 @@ class Siamese(Layer):
        return s

    def get_output_dot(self, train=False):
-        if K._BACKEND != 'theano':
-            raise Exception('"dot" merge mode will only work with Theano.')
-        from theano import tensor as T
+        '''Batch dot product, along `dot_axes`, of the shared layer's
+        outputs for inputs 0 and 1, expanded with an extra last axis.
+        '''
        l1 = self.get_output_at(0, train)
        l2 = self.get_output_at(1, train)
-        output = T.batched_tensordot(l1, l2, self.dot_axes)
-        output = output.dimshuffle((0, 'x'))
+        output = K.batch_dot(l1, l2, self.dot_axes)
+        output = K.expand_dims(output, -1)
        return output

    def get_output_cos(self, train=False):
-        if K._BACKEND != 'theano':
-            raise Exception('"cos" merge mode will only work with Theano.')
-        import theano
-        from theano import tensor as T
+        '''Batch dot product of the shared layer's outputs for inputs 0
+        and 1, divided by the square root of the product of each output's
+        dot product with itself; expanded with an extra last axis.
+        '''
        l1 = self.get_output_at(0, train)
        l2 = self.get_output_at(1, train)
-        output = T.batched_tensordot(l1, l2, self.dot_axes) / T.sqrt(T.batched_tensordot(l1, l1, self.dot_axes) * T.batched_tensordot(l2, l2, self.dot_axes))
-        output = output.dimshuffle((0, 'x'))
+        # NOTE(review): no epsilon in the denominator - an all-zero
+        # output would divide by zero; confirm this cannot happen.
+        output = K.batch_dot(l1, l2, self.dot_axes) / K.sqrt(K.batch_dot(l1, l1, self.dot_axes) * K.batch_dot(l2, l2, self.dot_axes))
+        output = K.expand_dims(output, -1)
        return output

    def get_output(self, train=False):
@@ -1769,12 +1859,12 @@ class Siamese(Layer):
        return weights

    def set_weights(self, weights):
+        '''Distribute the flat list `weights` over the sub-layers.
+
+        The first entries go to the shared layer; when a merge mode is
+        set and the layer is not part of a graph, the remaining entries
+        are handed to each input in order. The number of entries each
+        one takes is the length of its own `get_weights()` list.
+        '''
-        nb_param = len(self.layer.trainable_weights)
+        nb_param = len(self.layer.get_weights())
        self.layer.set_weights(weights[:nb_param])
        weights = weights[nb_param:]
        if self.merge_mode and not self.is_graph:
            for i in range(len(self.inputs)):
-                nb_param = len(self.inputs[i].trainable_weights)
+                nb_param = len(self.inputs[i].get_weights())
                self.inputs[i].set_weights(weights[:nb_param])
                weights = weights[nb_param:]

@@ -1824,9 +1914,6 @@ class SiameseHead(Layer):
        base_config = super(SiameseHead, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

-    def set_previous(self, layer):
-        self.previous = layer
-

 def add_shared_layer(layer, inputs):
    '''Use this function to add a shared layer across
@@ -1863,7 +1950,8 @@ class Highway(Layer):
            If you don't specify anything, no activation is applied
            (ie. "linear" activation: a(x) = x).
        weights: list of numpy arrays to set as initial weights.
-            The list should have 1 element, of shape `(input_dim, output_dim)`.
+            The list should have 2 elements, of shape `(input_dim, output_dim)`
+            and `(output_dim,)` for weights and biases respectively.
        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the main weights matrix.
        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
@@ -1904,18 +1992,20 @@ class Highway(Layer):
        self.input_dim = input_dim
        if self.input_dim:
            kwargs['input_shape'] = (self.input_dim,)
-        self.input = K.placeholder(ndim=2)
        super(Highway, self).__init__(**kwargs)

    def build(self):
        input_dim = self.input_shape[1]

-        self.W = self.init((input_dim, input_dim))
-        self.W_carry = self.init((input_dim, input_dim))
+        self.W = self.init((input_dim, input_dim),
+                           name='{}_W'.format(self.name))
+        self.W_carry = self.init((input_dim, input_dim),
+                                 name='{}_W_carry'.format(self.name))

-        self.b = K.zeros((input_dim,))
+        self.b = K.zeros((input_dim,), name='{}_b'.format(self.name))
        # initialize with a vector of values `transform_bias`
-        self.b_carry = K.variable(np.ones((input_dim,)) * self.transform_bias)
+        self.b_carry = K.variable(np.ones((input_dim,)) * self.transform_bias,
+                                  name='{}_b_carry'.format(self.name))

        self.trainable_weights = [self.W, self.b, self.W_carry, self.b_carry]

@@ -1,10 +1,8 @@
 from __future__ import absolute_import
+
 from .. import backend as K
-
-from .. import activations, initializations, regularizers, constraints
-from ..layers.core import Layer, MaskedLayer
-
-from ..constraints import unitnorm
+from .. import initializations, regularizers, constraints
+from ..layers.core import Layer


 class Embedding(Layer):
@@ -42,6 +40,10 @@ class Embedding(Layer):
          This argument is required if you are going to connect
          `Flatten` then `Dense` layers upstream
          (without it, the shape of the dense outputs cannot be computed).
+      dropout: float between 0 and 1. Fraction of the embeddings to drop.
+
+    # References
+        - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
    '''
    input_ndim = 2

@@ -50,12 +52,13 @@ class Embedding(Layer):
                 W_regularizer=None, activity_regularizer=None,
                 W_constraint=None,
                 mask_zero=False,
-                 weights=None, **kwargs):
+                 weights=None, dropout=0., **kwargs):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.input_length = input_length
        self.mask_zero = mask_zero
+        self.dropout = dropout

        self.W_constraint = constraints.get(W_constraint)
        self.constraints = [self.W_constraint]
@@ -70,7 +73,8 @@ class Embedding(Layer):
    def build(self):
        self.input = K.placeholder(shape=(self.input_shape[0], self.input_length),
                                   dtype='int32')
-        self.W = self.init((self.input_dim, self.output_dim))
+        self.W = self.init((self.input_dim, self.output_dim),
+                           name='{}_W'.format(self.name))
        self.trainable_weights = [self.W]
        self.regularizers = []
        if self.W_regularizer:
@@ -97,7 +101,13 @@ class Embedding(Layer):

    def get_output(self, train=False):
        X = self.get_input(train)
-        out = K.gather(self.W, X)
+        retain_p = 1. - self.dropout
+        if train and self.dropout > 0:
+            B = K.random_binomial((self.input_dim,), p=retain_p)
+        else:
+            B = K.ones((self.input_dim)) * retain_p
+        # we zero-out rows of W at random
+        out = K.gather(self.W * K.expand_dims(B), X)
        return out

    def get_config(self):
@@ -109,6 +119,7 @@ class Embedding(Layer):
                  "mask_zero": self.mask_zero,
                  "activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None}
+                  "W_constraint": self.W_constraint.get_config() if self.W_constraint else None,
+                  "dropout": self.dropout}
        base_config = super(Embedding, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
@@ -37,13 +37,21 @@ class BatchNormalization(Layer):
        weights: Initialization weights.
            List of 2 numpy arrays, with shapes:
            `[(input_shape,), (input_shape,)]`
-
+        beta_init: name of initialization function for shift parameter
+            (see [initializations](../initializations.md)), or alternatively,
+            Theano/TensorFlow function to use for weights initialization.
+            This parameter is only relevant if you don't pass a `weights` argument.
+        gamma_init: name of initialization function for scale parameter (see
+            [initializations](../initializations.md)), or alternatively,
+            Theano/TensorFlow function to use for weights initialization.
+            This parameter is only relevant if you don't pass a `weights` argument.
    # References
        - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](http://arxiv.org/pdf/1502.03167v3.pdf)
    '''
    def __init__(self, epsilon=1e-6, mode=0, axis=-1, momentum=0.9,
-                 weights=None, **kwargs):
-        self.init = initializations.get("uniform")
+                 weights=None, beta_init='zero', gamma_init='one', **kwargs):
+        self.beta_init = initializations.get(beta_init)
+        self.gamma_init = initializations.get(gamma_init)
        self.epsilon = epsilon
        self.mode = mode
        self.axis = axis
@@ -55,12 +63,14 @@ class BatchNormalization(Layer):
        input_shape = self.input_shape  # starts with samples axis
        shape = (input_shape[self.axis],)

-        self.gamma = self.init(shape)
-        self.beta = K.zeros(shape)
+        self.gamma = self.gamma_init(shape, name='{}_gamma'.format(self.name))
+        self.beta = self.beta_init(shape, name='{}_beta'.format(self.name))
        self.trainable_weights = [self.gamma, self.beta]

-        self.running_mean = K.zeros(shape)
-        self.running_std = K.ones(shape)
+        self.running_mean = K.zeros(shape,
+                                    name='{}_running_mean'.format(self.name))
+        self.running_std = K.ones(shape,
+                                  name='{}_running_std'.format(self.name))
        self.non_trainable_weights = [self.running_mean, self.running_std]

        if self.initial_weights is not None:
@@ -3,10 +3,42 @@ from __future__ import absolute_import
 import numpy as np

 from .. import backend as K
-from .. import activations, initializations
+from .. import activations, initializations, regularizers
 from ..layers.core import MaskedLayer


+def time_distributed_dense(x, w, b=None, dropout=None,
+                           input_dim=None, output_dim=None, timesteps=None):
+    '''Apply y.w + b for every temporal slice y of x.
+    '''
+    if not input_dim:
+        # won't work with TensorFlow
+        input_dim = K.shape(x)[2]
+    if not timesteps:
+        # won't work with TensorFlow
+        timesteps = K.shape(x)[1]
+    if not output_dim:
+        # won't work with TensorFlow
+        output_dim = K.shape(w)[1]
+
+    if dropout:
+        # apply the same dropout pattern at every timestep
+        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
+        dropout_matrix = K.dropout(ones, dropout)
+        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
+        x *= expanded_dropout_matrix
+
+    # collapse time dimension and batch dimension together
+    x = K.reshape(x, (-1, input_dim))
+
+    x = K.dot(x, w)
+    if b:
+        x = x + b
+    # reshape to 3D tensor
+    x = K.reshape(x, (-1, timesteps, output_dim))
+    return x
+
+
 class Recurrent(MaskedLayer):
    '''Abstract base class for recurrent layers.
    Do not use in a model -- it's not a functional layer!
@@ -78,6 +110,10 @@ class Recurrent(MaskedLayer):

        To reset the states of your model, call `.reset_states()` on either
        a specific layer, or on your entire model.
+
+    # Note on using dropout with TensorFlow
+        When using the TensorFlow backend, specify a fixed batch size for your model
+        following the notes on using statefulness in RNNs.
    '''
    input_ndim = 3

@@ -112,15 +148,21 @@ class Recurrent(MaskedLayer):
    def step(self, x, states):
        raise NotImplementedError

-    def get_initial_states(self, X):
+    def get_constants(self, x, train=False):
+        '''Return the list of tensors handed to `K.rnn` as `constants`.
+
+        The base implementation returns an empty list.
+        '''
+        return []
+
+    def get_initial_states(self, x):
+        '''Return one all-zero state per entry of `self.states`.
+
+        The zeros are derived from `x` itself, so the samples axis of the
+        states follows the input. The same tensor object is repeated for
+        every state.
+        '''
        # build an all-zero tensor of shape (samples, output_dim)
-        initial_state = K.zeros_like(X)  # (samples, timesteps, input_dim)
+        initial_state = K.zeros_like(x)  # (samples, timesteps, input_dim)
        initial_state = K.sum(initial_state, axis=1)  # (samples, input_dim)
        reducer = K.zeros((self.input_dim, self.output_dim))
        initial_state = K.dot(initial_state, reducer)  # (samples, output_dim)
        initial_states = [initial_state for _ in range(len(self.states))]
        return initial_states

+    def preprocess_input(self, x, train=False):
+        '''Hook applied to the input before it is passed to `K.rnn`.
+
+        The base implementation returns `x` unchanged.
+        '''
+        return x
+
    def get_output(self, train=False):
        # input shape: (nb_samples, time (padded with zeros), input_dim)
        X = self.get_input(train)
@@ -142,11 +184,14 @@ class Recurrent(MaskedLayer):
            initial_states = self.states
        else:
            initial_states = self.get_initial_states(X)
+        constants = self.get_constants(X, train)
+        preprocessed_input = self.preprocess_input(X, train)

-        last_output, outputs, states = K.rnn(self.step, X,
+        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                             initial_states,
                                             go_backwards=self.go_backwards,
-                                             mask=mask)
+                                             mask=mask,
+                                             constants=constants)
        if self.stateful:
            self.updates = []
            for i in range(len(states)):
@@ -167,13 +212,13 @@ class Recurrent(MaskedLayer):
        else:
            config['input_dim'] = self.input_dim
            config['input_length'] = self.input_length
-            
+
        base_config = super(Recurrent, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


 class SimpleRNN(Recurrent):
-    '''Fully-connected RNN where the output is to fed back to input.
+    '''Fully-connected RNN where the output is to be fed back to input.

    # Arguments
        output_dim: dimension of the internal projections and the final output.
@@ -184,14 +229,31 @@ class SimpleRNN(Recurrent):
        activation: activation function.
            Can be the name of an existing function (str),
            or a Theano function (see: [activations](../activations.md)).
+        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the input weights matrices.
+        U_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the recurrent weights matrices.
+        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
+            applied to the bias.
+        dropout_W: float between 0 and 1. Fraction of the input units to drop for input gates.
+        dropout_U: float between 0 and 1. Fraction of the input units to drop for recurrent connections.
+
+    # References
+        - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
    '''
    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
-                 activation='sigmoid', **kwargs):
+                 activation='tanh',
+                 W_regularizer=None, U_regularizer=None, b_regularizer=None,
+                 dropout_W=0., dropout_U=0., **kwargs):
        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
+        # Regularizers are only bound to their weights later, in build().
+        self.W_regularizer = regularizers.get(W_regularizer)
+        self.U_regularizer = regularizers.get(U_regularizer)
+        self.b_regularizer = regularizers.get(b_regularizer)
+        # Drop fractions for the input (W) and recurrent (U) connections.
+        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        # Remaining keyword arguments go to the Recurrent base class.
        super(SimpleRNN, self).__init__(**kwargs)

    def build(self):
@@ -204,9 +266,23 @@ class SimpleRNN(Recurrent):
        input_dim = input_shape[2]
        self.input_dim = input_dim

-        self.W = self.init((input_dim, self.output_dim))
-        self.U = self.inner_init((self.output_dim, self.output_dim))
-        self.b = K.zeros((self.output_dim,))
+        self.W = self.init((input_dim, self.output_dim),
+                           name='{}_W'.format(self.name))
+        self.U = self.inner_init((self.output_dim, self.output_dim),
+                                 name='{}_U'.format(self.name))
+        self.b = K.zeros((self.output_dim,), name='{}_b'.format(self.name))
+
+        self.regularizers = []
+        if self.W_regularizer:
+            self.W_regularizer.set_param(self.W)
+            self.regularizers.append(self.W_regularizer)
+        if self.U_regularizer:
+            self.W_regularizer.set_param(self.U)
+            self.regularizers.append(self.U_regularizer)
+        if self.b_regularizer:
+            self.W_regularizer.set_param(self.b)
+            self.regularizers.append(self.b_regularizer)
+
        self.trainable_weights = [self.W, self.U, self.b]

        if self.initial_weights is not None:
@@ -218,27 +294,51 @@ class SimpleRNN(Recurrent):
        input_shape = self.input_shape
        if not input_shape[0]:
            raise Exception('If a RNN is stateful, a complete ' +
-                            'input_shape must be provided ' +
-                            '(including batch size).')
+                            'input_shape must be provided (including batch size).')
        if hasattr(self, 'states'):
            K.set_value(self.states[0],
                        np.zeros((input_shape[0], self.output_dim)))
        else:
            self.states = [K.zeros((input_shape[0], self.output_dim))]

-    def step(self, x, states):
-        # states only contains the previous output.
-        assert len(states) == 1
+    def preprocess_input(self, x, train=False):
+        if train and (0 < self.dropout_W < 1):
+            dropout = self.dropout_W
+        else:
+            dropout = 0
+        input_shape = self.input_shape
+        input_dim = input_shape[2]
+        timesteps = input_shape[1]
+        return time_distributed_dense(x, self.W, self.b, dropout,
+                                      input_dim, self.output_dim, timesteps)
+
+    def step(self, h, states):
+        '''Compute one step of the recurrence.
+
+        # Arguments
+            h: input for this step. The W/b projection is already done
+                in `preprocess_input`, so it is not repeated here.
+            states: list whose first entry is the previous output. A second
+                entry, when present, is used as the recurrent dropout mask
+                (presumably the constant built by `get_constants` and
+                forwarded by `K.rnn` -- confirm against the backend).
+
+        # Returns
+            Tuple `(output, [output])`.
+        '''
        prev_output = states[0]
-        h = K.dot(x, self.W) + self.b
-        output = self.activation(h + K.dot(prev_output, self.U))
+        if len(states) == 2:
+            B_U = states[1]
+        else:
+            # no mask supplied: multiplying by 1. leaves prev_output as is
+            B_U = 1.
+        output = self.activation(h + K.dot(prev_output * B_U, self.U))
        return output, [output]

+    def get_constants(self, x, train=False):
+        if train and (0 < self.dropout_U < 1):
+            ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
+            ones = K.concatenate([ones] * self.output_dim, 1)
+            B_U = K.dropout(ones, self.dropout_U)
+            return [B_U]
+        return []
+
    def get_config(self):
+        '''Return the layer configuration as a dict.
+
+        Entries defined here are merged after the base class config,
+        so they win on duplicate keys. Unset regularizers are stored
+        as `None`.
+        '''
        config = {"output_dim": self.output_dim,
                  "init": self.init.__name__,
                  "inner_init": self.inner_init.__name__,
-                  "activation": self.activation.__name__}
+                  "activation": self.activation.__name__,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "dropout_W": self.dropout_W,
+                  "dropout_U": self.dropout_U}
        base_config = super(SimpleRNN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -256,39 +356,75 @@ class GRU(Recurrent):
            Can be the name of an existing function (str),
            or a Theano function (see: [activations](../activations.md)).
        inner_activation: activation function for the inner cells.
+        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the input weights matrices.
+        U_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the recurrent weights matrices.
+        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
+            applied to the bias.
+        dropout_W: float between 0 and 1. Fraction of the input units to drop for input gates.
+        dropout_U: float between 0 and 1. Fraction of the input units to drop for recurrent connections.

    # References
        - [On the Properties of Neural Machine Translation: Encoder–Decoder Approaches](http://www.aclweb.org/anthology/W14-4012)
        - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/pdf/1412.3555v1.pdf)
+        - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
    '''
    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
-                 activation='sigmoid', inner_activation='hard_sigmoid',
-                 **kwargs):
+                 activation='tanh', inner_activation='hard_sigmoid',
+                 W_regularizer=None, U_regularizer=None, b_regularizer=None,
+                 dropout_W=0., dropout_U=0., **kwargs):
        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        # Each regularizer is bound in build() to the concatenation of the
+        # matching z/r/h weights.
+        self.W_regularizer = regularizers.get(W_regularizer)
+        self.U_regularizer = regularizers.get(U_regularizer)
+        self.b_regularizer = regularizers.get(b_regularizer)
+        # Drop fractions for the input (W) and recurrent (U) connections.
+        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        # Remaining keyword arguments go to the Recurrent base class.
        super(GRU, self).__init__(**kwargs)

    def build(self):
        input_shape = self.input_shape
        input_dim = input_shape[2]
        self.input_dim = input_dim
-        self.input = K.placeholder(input_shape)

-        self.W_z = self.init((input_dim, self.output_dim))
-        self.U_z = self.inner_init((self.output_dim, self.output_dim))
-        self.b_z = K.zeros((self.output_dim,))
+        self.W_z = self.init((input_dim, self.output_dim),
+                             name='{}_W_z'.format(self.name))
+        self.U_z = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_z'.format(self.name))
+        self.b_z = K.zeros((self.output_dim,), name='{}_b_z'.format(self.name))

-        self.W_r = self.init((input_dim, self.output_dim))
-        self.U_r = self.inner_init((self.output_dim, self.output_dim))
-        self.b_r = K.zeros((self.output_dim,))
+        self.W_r = self.init((input_dim, self.output_dim),
+                             name='{}_W_r'.format(self.name))
+        self.U_r = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_r'.format(self.name))
+        self.b_r = K.zeros((self.output_dim,), name='{}_b_r'.format(self.name))

-        self.W_h = self.init((input_dim, self.output_dim))
-        self.U_h = self.inner_init((self.output_dim, self.output_dim))
-        self.b_h = K.zeros((self.output_dim,))
+        self.W_h = self.init((input_dim, self.output_dim),
+                             name='{}_W_h'.format(self.name))
+        self.U_h = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_h'.format(self.name))
+        self.b_h = K.zeros((self.output_dim,), name='{}_b_h'.format(self.name))
+
+        self.regularizers = []
+        if self.W_regularizer:
+            self.W_regularizer.set_param(K.concatenate([self.W_z,
+                                                        self.W_r,
+                                                        self.W_h]))
+            self.regularizers.append(self.W_regularizer)
+        if self.U_regularizer:
+            self.U_regularizer.set_param(K.concatenate([self.U_z,
+                                                        self.U_r,
+                                                        self.U_h]))
+            self.regularizers.append(self.U_regularizer)
+        if self.b_regularizer:
+            self.b_regularizer.set_param(K.concatenate([self.b_z,
+                                                        self.b_r,
+                                                        self.b_h]))
+            self.regularizers.append(self.b_regularizer)

        self.trainable_weights = [self.W_z, self.U_z, self.b_z,
                                  self.W_r, self.U_r, self.b_r,
@@ -308,34 +444,67 @@ class GRU(Recurrent):
        input_shape = self.input_shape
        if not input_shape[0]:
            raise Exception('If a RNN is stateful, a complete ' +
-                            'input_shape must be provided ' +
-                            '(including batch size).')
+                            'input_shape must be provided (including batch size).')
        if hasattr(self, 'states'):
            K.set_value(self.states[0],
                        np.zeros((input_shape[0], self.output_dim)))
        else:
            self.states = [K.zeros((input_shape[0], self.output_dim))]

+    def preprocess_input(self, x, train=False):
+        if train and (0 < self.dropout_W < 1):
+            dropout = self.dropout_W
+        else:
+            dropout = 0
+        input_shape = self.input_shape
+        input_dim = input_shape[2]
+        timesteps = input_shape[1]
+
+        x_z = time_distributed_dense(x, self.W_z, self.b_z, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        x_r = time_distributed_dense(x, self.W_r, self.b_r, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        x_h = time_distributed_dense(x, self.W_h, self.b_h, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        return K.concatenate([x_z, x_r, x_h], axis=2)
+
    def step(self, x, states):
+        '''Compute one step of the GRU recurrence.
+
+        # Arguments
+            x: input for this step, holding the z, r and h projections
+                side by side along the last axis (each `output_dim` wide),
+                as laid out by `preprocess_input`.
+            states: list whose first entry is the previous memory. A second
+                entry, when present, is indexed as three recurrent dropout
+                masks (presumably the constant from `get_constants`,
+                forwarded by `K.rnn` -- confirm against the backend).
+
+        # Returns
+            Tuple `(h, [h])` with the new memory.
+        '''
-        assert len(states) == 1
-        x_z = K.dot(x, self.W_z) + self.b_z
-        x_r = K.dot(x, self.W_r) + self.b_r
-        x_h = K.dot(x, self.W_h) + self.b_h
+        h_tm1 = states[0]  # previous memory
+        if len(states) == 2:
+            B_U = states[1]  # dropout matrices for recurrent units
+        else:
+            B_U = [1., 1., 1.]

-        h_tm1 = states[0]
-        z = self.inner_activation(x_z + K.dot(h_tm1, self.U_z))
-        r = self.inner_activation(x_r + K.dot(h_tm1, self.U_r))
+        # split the precomputed projections back into their gates
+        x_z = x[:, :self.output_dim]
+        x_r = x[:, self.output_dim: 2 * self.output_dim]
+        x_h = x[:, 2 * self.output_dim:]

-        hh = self.activation(x_h + K.dot(r * h_tm1, self.U_h))
+        z = self.inner_activation(x_z + K.dot(h_tm1 * B_U[0], self.U_z))
+        r = self.inner_activation(x_r + K.dot(h_tm1 * B_U[1], self.U_r))
+
+        hh = self.activation(x_h + K.dot(r * h_tm1 * B_U[2], self.U_h))
        h = z * h_tm1 + (1 - z) * hh
        return h, [h]

+    def get_constants(self, x, train=False):
+        if train and (0 < self.dropout_U < 1):
+            ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
+            ones = K.concatenate([ones] * self.output_dim, 1)
+            B_U = [K.dropout(ones, self.dropout_U) for _ in range(3)]
+            return [B_U]
+        return []
+
    def get_config(self):
+        '''Return the layer configuration as a dict.
+
+        Entries defined here are merged after the base class config,
+        so they win on duplicate keys. Unset regularizers are stored
+        as `None`.
+        '''
        config = {"output_dim": self.output_dim,
                  "init": self.init.__name__,
                  "inner_init": self.inner_init.__name__,
                  "activation": self.activation.__name__,
-                  "inner_activation": self.inner_activation.__name__}
+                  "inner_activation": self.inner_activation.__name__,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "dropout_W": self.dropout_W,
+                  "dropout_U": self.dropout_U}
        base_config = super(GRU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

@@ -359,51 +528,94 @@ class LSTM(Recurrent):
            Can be the name of an existing function (str),
            or a Theano function (see: [activations](../activations.md)).
        inner_activation: activation function for the inner cells.
+        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the input weights matrices.
+        U_regularizer: instance of [WeightRegularizer](../regularizers.md)
+            (eg. L1 or L2 regularization), applied to the recurrent weights matrices.
+        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
+            applied to the bias.
+        dropout_W: float between 0 and 1. Fraction of the input units to drop for input gates.
+        dropout_U: float between 0 and 1. Fraction of the input units to drop for recurrent connections.

    # References
        - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) (original 1997 paper)
        - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
        - [Supervised sequence labelling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
+        - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
    '''
    def __init__(self, output_dim,
                 init='glorot_uniform', inner_init='orthogonal',
                 forget_bias_init='one', activation='tanh',
-                 inner_activation='hard_sigmoid', **kwargs):
+                 inner_activation='hard_sigmoid',
+                 W_regularizer=None, U_regularizer=None, b_regularizer=None,
+                 dropout_W=0., dropout_U=0., **kwargs):
        self.output_dim = output_dim
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.forget_bias_init = initializations.get(forget_bias_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
+        # Each regularizer is bound in build() to the concatenation of the
+        # matching i/f/c/o weights.
+        self.W_regularizer = regularizers.get(W_regularizer)
+        self.U_regularizer = regularizers.get(U_regularizer)
+        self.b_regularizer = regularizers.get(b_regularizer)
+        # Drop fractions for the input (W) and recurrent (U) connections.
+        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        # Remaining keyword arguments go to the Recurrent base class.
        super(LSTM, self).__init__(**kwargs)

    def build(self):
        input_shape = self.input_shape
        input_dim = input_shape[2]
        self.input_dim = input_dim
-        self.input = K.placeholder(input_shape)

        if self.stateful:
            self.reset_states()
        else:
-            # initial states: 2 all-zero tensor of shape (output_dim)
+            # initial states: 2 all-zero tensors of shape (output_dim)
            self.states = [None, None]

-        self.W_i = self.init((input_dim, self.output_dim))
-        self.U_i = self.inner_init((self.output_dim, self.output_dim))
-        self.b_i = K.zeros((self.output_dim,))
+        self.W_i = self.init((input_dim, self.output_dim),
+                             name='{}_W_i'.format(self.name))
+        self.U_i = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_i'.format(self.name))
+        self.b_i = K.zeros((self.output_dim,), name='{}_b_i'.format(self.name))

-        self.W_f = self.init((input_dim, self.output_dim))
-        self.U_f = self.inner_init((self.output_dim, self.output_dim))
-        self.b_f = self.forget_bias_init((self.output_dim,))
+        self.W_f = self.init((input_dim, self.output_dim),
+                             name='{}_W_f'.format(self.name))
+        self.U_f = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_f'.format(self.name))
+        self.b_f = self.forget_bias_init((self.output_dim,),
+                                         name='{}_b_f'.format(self.name))

-        self.W_c = self.init((input_dim, self.output_dim))
-        self.U_c = self.inner_init((self.output_dim, self.output_dim))
-        self.b_c = K.zeros((self.output_dim,))
+        self.W_c = self.init((input_dim, self.output_dim),
+                             name='{}_W_c'.format(self.name))
+        self.U_c = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_c'.format(self.name))
+        self.b_c = K.zeros((self.output_dim,), name='{}_b_c'.format(self.name))

-        self.W_o = self.init((input_dim, self.output_dim))
-        self.U_o = self.inner_init((self.output_dim, self.output_dim))
-        self.b_o = K.zeros((self.output_dim,))
+        self.W_o = self.init((input_dim, self.output_dim),
+                             name='{}_W_o'.format(self.name))
+        self.U_o = self.inner_init((self.output_dim, self.output_dim),
+                                   name='{}_U_o'.format(self.name))
+        self.b_o = K.zeros((self.output_dim,), name='{}_b_o'.format(self.name))
+
+        self.regularizers = []
+        if self.W_regularizer:
+            self.W_regularizer.set_param(K.concatenate([self.W_i,
+                                                        self.W_f,
+                                                        self.W_c,
+                                                        self.W_o]))
+            self.regularizers.append(self.W_regularizer)
+        if self.U_regularizer:
+            self.U_regularizer.set_param(K.concatenate([self.U_i,
+                                                        self.U_f,
+                                                        self.U_c,
+                                                        self.U_o]))
+            self.regularizers.append(self.U_regularizer)
+        if self.b_regularizer:
+            self.b_regularizer.set_param(K.concatenate([self.b_i,
+                                                        self.b_f,
+                                                        self.b_c,
+                                                        self.b_o]))
+            self.regularizers.append(self.b_regularizer)

        self.trainable_weights = [self.W_i, self.U_i, self.b_i,
                                  self.W_c, self.U_c, self.b_c,
@@ -419,8 +631,7 @@ class LSTM(Recurrent):
        input_shape = self.input_shape
        if not input_shape[0]:
            raise Exception('If a RNN is stateful, a complete ' +
-                            'input_shape must be provided ' +
-                            '(including batch size).')
+                            'input_shape must be provided (including batch size).')
        if hasattr(self, 'states'):
            K.set_value(self.states[0],
                        np.zeros((input_shape[0], self.output_dim)))
@@ -430,29 +641,65 @@ class LSTM(Recurrent):
            self.states = [K.zeros((input_shape[0], self.output_dim)),
                           K.zeros((input_shape[0], self.output_dim))]

+    def preprocess_input(self, x, train=False):
+        if train and (0 < self.dropout_W < 1):
+            dropout = self.dropout_W
+        else:
+            dropout = 0
+        input_shape = self.input_shape
+        input_dim = input_shape[2]
+        timesteps = input_shape[1]
+
+        x_i = time_distributed_dense(x, self.W_i, self.b_i, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        x_f = time_distributed_dense(x, self.W_f, self.b_f, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        x_c = time_distributed_dense(x, self.W_c, self.b_c, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        x_o = time_distributed_dense(x, self.W_o, self.b_o, dropout,
+                                     input_dim, self.output_dim, timesteps)
+        return K.concatenate([x_i, x_f, x_c, x_o], axis=2)
+
    def step(self, x, states):
+        '''Compute one step of the LSTM recurrence.
+
+        # Arguments
+            x: input for this step, holding the i, f, c and o projections
+                side by side along the last axis (each `output_dim` wide),
+                as laid out by `preprocess_input`.
+            states: list starting with the previous output and the previous
+                cell state. A third entry, when present, is indexed as four
+                recurrent dropout masks (presumably the constant from
+                `get_constants`, forwarded by `K.rnn` -- confirm against
+                the backend).
+
+        # Returns
+            Tuple `(h, [h, c])` with the new output and cell state.
+        '''
-        assert len(states) == 2
        h_tm1 = states[0]
        c_tm1 = states[1]
+        if len(states) == 3:
+            B_U = states[2]
+        else:
+            B_U = [1. for _ in range(4)]

-        x_i = K.dot(x, self.W_i) + self.b_i
-        x_f = K.dot(x, self.W_f) + self.b_f
-        x_c = K.dot(x, self.W_c) + self.b_c
-        x_o = K.dot(x, self.W_o) + self.b_o
+        # split the precomputed projections back into their gates
+        x_i = x[:, :self.output_dim]
+        x_f = x[:, self.output_dim: 2 * self.output_dim]
+        x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
+        x_o = x[:, 3 * self.output_dim:]
+
+        i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))
+        f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))
+        c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * B_U[2], self.U_c))
+        o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))

-        i = self.inner_activation(x_i + K.dot(h_tm1, self.U_i))
-        f = self.inner_activation(x_f + K.dot(h_tm1, self.U_f))
-        c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1, self.U_c))
-        o = self.inner_activation(x_o + K.dot(h_tm1, self.U_o))
        h = o * self.activation(c)
        return h, [h, c]

+    def get_constants(self, x, train=False):
+        if train and (0 < self.dropout_U < 1):
+            ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
+            ones = K.concatenate([ones] * self.output_dim, 1)
+            B_U = [K.dropout(ones, self.dropout_U) for _ in range(4)]
+            return [B_U]
+        return []
+
    def get_config(self):
+        '''Return the layer configuration as a dict.
+
+        Entries defined here are merged after the base class config,
+        so they win on duplicate keys. Unset regularizers are stored
+        as `None`.
+        '''
        config = {"output_dim": self.output_dim,
                  "init": self.init.__name__,
                  "inner_init": self.inner_init.__name__,
                  "forget_bias_init": self.forget_bias_init.__name__,
                  "activation": self.activation.__name__,
-                  "inner_activation": self.inner_activation.__name__}
+                  "inner_activation": self.inner_activation.__name__,
+                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
+                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
+                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
+                  "dropout_W": self.dropout_W,
+                  "dropout_U": self.dropout_U}
        base_config = super(LSTM, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
@@ -0,0 +1,130 @@
+from .core import MaskedLayer
+from .. import backend as K
+
+
+class TimeDistributed(MaskedLayer):
+    """This wrapper allows applying a layer to every
+    temporal slice of an input.
+
+    The input should be at least 3D,
+    and the dimension of index one will be considered to be
+    the temporal dimension.
+
+    Consider a batch of 32 samples, where each sample is a sequence of 10
+    vectors of 16 dimensions. The batch input shape of the layer is then `(32, 10, 16)`
+    (and the `input_shape`, not including the samples dimension, is `(10, 16)`).
+
+    You can then use `TimeDistributed` to apply a `Dense` layer to each of the 10 timesteps, independently:
+    ```python
+        model = Sequential()
+        model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
+    ```
+
+    The output will then have shape `(32, 10, 8)`.
+
+    Note this is strictly equivalent to using `layers.core.TimeDistributedDense`.
+    However what is different about `TimeDistributed`
+    is that it can be used with arbitrary layers, not just `Dense`,
+    for instance with a `Convolution2D` layer:
+
+    ```python
+        model = Sequential()
+        model.add(TimeDistributed(Convolution2D(64, 3, 3), input_shape=(10, 3, 299, 299)))
+    ```
+
+    # Arguments
+        layer: a layer instance.
+    """
+
+    def __init__(self, layer, **kwargs):
+        self.layer = layer
+        super(TimeDistributed, self).__init__(**kwargs)
+
+    def build(self):
+        """Builds the wrapped layer on the per-timestep input shape
+        and exposes its parameters as this wrapper's own.
+        """
+        input_shape = self.input_shape
+        assert len(input_shape) >= 3
+        # drop the temporal axis (index 1): the wrapped layer
+        # is built for a single timestep
+        child_input_shape = (input_shape[0],) + input_shape[2:]
+        self.layer.set_input_shape(child_input_shape)
+        self.layer.build()
+
+        trainable_weights, regularizers, constraints, updates = self.layer.get_params()
+        self.trainable_weights = trainable_weights
+        self.non_trainable_weights = self.layer.non_trainable_weights
+        self.regularizers = regularizers
+        self.constraints = constraints
+        self.updates = updates
+
+    @property
+    def output_shape(self):
+        # re-insert the temporal axis at index 1 of the wrapped layer's output shape
+        child_output_shape = self.layer.output_shape
+        timesteps = self.input_shape[1]
+        return (child_output_shape[0], timesteps) + child_output_shape[1:]
+
+    def get_output(self, train=False):
+        """Applies the wrapped layer to every temporal slice of the input.
+
+        # Arguments
+            train: boolean, forwarded to the wrapped layer.
+
+        # Returns
+            The outputs of the wrapped layer for all timesteps.
+        """
+        X = self.get_input(train)
+        mask = self.get_input_mask(train)
+
+        if K._BACKEND == 'tensorflow':
+            if not self.input_shape[1]:
+                raise Exception('When using TensorFlow, you should define ' +
+                                'explicitly the number of timesteps of ' +
+                                'your sequences.\n' +
+                                'If your first layer is an Embedding, ' +
+                                'make sure to pass it an "input_length" ' +
+                                'argument. Otherwise, make sure ' +
+                                'the first layer has ' +
+                                'an "input_shape" or "batch_input_shape" ' +
+                                'argument, including the time axis.')
+
+        if self.input_shape[0]:
+            # batch size matters, use rnn-based implementation
+            def step(x, states):
+                output = self.layer(x, train=train)
+                return output, []
+
+            last_output, outputs, states = K.rnn(step, X,
+                                                 initial_states=[],
+                                                 mask=mask)
+            y = outputs
+        else:
+            # no batch size specified, therefore the layer will be able
+            # to process batches of any size
+            # we can go with reshape-based implementation for performance
+            input_shape = self.input_shape
+            x = K.reshape(X, (-1, ) + input_shape[2:])  # (nb_samples * timesteps, ...)
+            # forward `train` so that this branch agrees with
+            # the rnn-based implementation above
+            y = self.layer(x, train=train)  # (nb_samples * timesteps, ...)
+            input_length = input_shape[1]
+            if not input_length:
+                input_length = K.shape(X)[1]
+            # (nb_samples, timesteps, ...)
+            y = K.reshape(y, (-1, input_length) + self.layer.output_shape[1:])
+        return y
+
+    def get_weights(self):
+        weights = self.layer.get_weights()
+        return weights
+
+    def set_weights(self, weights):
+        self.layer.set_weights(weights)
+
+    def get_config(self):
+        config = {'name': self.__class__.__name__,
+                  'layer': self.layer.get_config()}
+        base_config = super(TimeDistributed, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
@@ -1,3 +1,7 @@
+'''Fairly basic set of tools for realtime data augmentation on image data.
+Can easily be extended to include new transformations,
+new preprocessing methods, etc...
+'''
 from __future__ import absolute_import

 import numpy as np
@@ -7,18 +11,13 @@ from scipy import linalg

 from os import listdir
 from os.path import isfile, join
-import random
 import math
 from six.moves import range
 import threading

-'''Fairly basic set of tools for realtime data augmentation on image data.
-Can easily be extended to include new transformations, new preprocessing methods, etc...
-'''

-
-def random_rotation(x, rg, fill_mode="nearest", cval=0.):
-    angle = random.uniform(-rg, rg)
+def random_rotation(x, rg, fill_mode='nearest', cval=0.):
+    angle = np.random.uniform(-rg, rg)
    x = ndimage.interpolation.rotate(x, angle,
                                     axes=(1, 2),
                                     reshape=False,
@@ -27,19 +26,14 @@ def random_rotation(x, rg, fill_mode="nearest", cval=0.):
    return x


-def random_shift(x, wrg, hrg, fill_mode="nearest", cval=0.):
-    crop_left_pixels = 0
-    crop_top_pixels = 0
+def random_shift(x, wrg, hrg, fill_mode='nearest', cval=0.):
+    shift_x = shift_y = 0

    if wrg:
-        crop = random.uniform(0., wrg)
-        split = random.uniform(0, 1)
-        crop_left_pixels = int(split*crop*x.shape[1])
+        shift_x = np.random.uniform(-wrg, wrg) * x.shape[2]
    if hrg:
-        crop = random.uniform(0., hrg)
-        split = random.uniform(0, 1)
-        crop_top_pixels = int(split*crop*x.shape[2])
-    x = ndimage.interpolation.shift(x, (0, crop_left_pixels, crop_top_pixels),
+        shift_y = np.random.uniform(-hrg, hrg) * x.shape[1]
+    x = ndimage.interpolation.shift(x, (0, shift_y, shift_x),
                                    order=0,
                                    mode=fill_mode,
                                    cval=cval)
@@ -63,8 +57,8 @@ def random_barrel_transform(x, intensity):
    pass


-def random_shear(x, intensity, fill_mode="nearest", cval=0.):
-    shear = random.uniform(-intensity, intensity)
+def random_shear(x, intensity, fill_mode='nearest', cval=0.):
+    shear = np.random.uniform(-intensity, intensity)
    shear_matrix = np.array([[1.0, -math.sin(shear), 0.0],
                            [0.0, math.cos(shear), 0.0],
                            [0.0, 0.0, 1.0]])
@@ -80,9 +74,9 @@ def random_channel_shift(x, rg):
    pass


-def random_zoom(x, rg, fill_mode="nearest", cval=0.):
-    zoom_w = random.uniform(1.-rg, 1.)
-    zoom_h = random.uniform(1.-rg, 1.)
+def random_zoom(x, rg, fill_mode='nearest', cval=0.):
+    zoom_w = np.random.uniform(1.-rg, 1.)
+    zoom_h = np.random.uniform(1.-rg, 1.)
    x = ndimage.interpolation.zoom(x, zoom=(1., zoom_w, zoom_h),
                                   mode=fill_mode,
                                   cval=cval)
@@ -98,10 +92,10 @@ def array_to_img(x, scale=True):
        x *= 255
    if x.shape[2] == 3:
        # RGB
-        return Image.fromarray(x.astype("uint8"), "RGB")
+        return Image.fromarray(x.astype('uint8'), 'RGB')
    else:
        # grayscale
-        return Image.fromarray(x[:, :, 0].astype("uint8"), "L")
+        return Image.fromarray(x[:, :, 0].astype('uint8'), 'L')


 def img_to_array(img):
@@ -132,21 +126,33 @@ def list_pictures(directory, ext='jpg|jpeg|bmp|png'):

 class ImageDataGenerator(object):
    '''Generate minibatches with
-    realtime data augmentation.
+    real-time data augmentation.
+
+    # Arguments
+        featurewise_center: set input mean to 0 over the dataset.
+        samplewise_center: set each sample mean to 0.
+        featurewise_std_normalization: divide inputs by std of the dataset.
+        samplewise_std_normalization: divide each input by its std.
+        zca_whitening: apply ZCA whitening.
+        rotation_range: degrees (0 to 180).
+        width_shift_range: fraction of total width.
+        height_shift_range: fraction of total height.
+        shear_range: shear intensity (shear angle in radians).
+        horizontal_flip: whether to randomly flip images horizontally.
+        vertical_flip: whether to randomly flip images vertically.
    '''
    def __init__(self,
-                 featurewise_center=True,  # set input mean to 0 over the dataset
-                 samplewise_center=False,  # set each sample mean to 0
-                 featurewise_std_normalization=True,  # divide inputs by std of the dataset
-                 samplewise_std_normalization=False,  # divide each input by its std
-                 zca_whitening=False,  # apply ZCA whitening
-                 rotation_range=0.,  # degrees (0 to 180)
-                 width_shift_range=0.,  # fraction of total width
-                 height_shift_range=0.,  # fraction of total height
-                 shear_range=0.,  # shear intensity (shear angle in radians)
+                 featurewise_center=True,
+                 samplewise_center=False,
+                 featurewise_std_normalization=True,
+                 samplewise_std_normalization=False,
+                 zca_whitening=False,
+                 rotation_range=0.,
+                 width_shift_range=0.,
+                 height_shift_range=0.,
+                 shear_range=0.,
                 horizontal_flip=False,
                 vertical_flip=False):
-
        self.__dict__.update(locals())
        self.mean = None
        self.std = None
@@ -177,26 +183,30 @@ class ImageDataGenerator(object):
            else:
                b = 0
            total_b += 1
-            yield index_array[current_index: current_index + current_batch_size], current_index, current_batch_size
+            yield (index_array[current_index: current_index + current_batch_size],
+                   current_index, current_batch_size)

    def flow(self, X, y, batch_size=32, shuffle=False, seed=None,
-             save_to_dir=None, save_prefix="", save_format="jpeg"):
+             save_to_dir=None, save_prefix='', save_format='jpeg'):
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format
-        self.flow_generator = self._flow_index(X.shape[0], batch_size, shuffle, seed)
+        self.flow_generator = self._flow_index(X.shape[0], batch_size,
+                                               shuffle, seed)
        return self

    def __iter__(self):
-        # needed if we want to do something like for x,y in data_gen.flow(...):
+        # needed if we want to do something like:
+        # for x, y in data_gen.flow(...):
        return self

    def next(self):
-        # for python 2.x
-        # Keep under lock only the mechainsem which advance the indexing of each batch
+        # for python 2.x.
+        # Keeps under lock only the mechanism which advances
+        # the indexing of each batch
        # see # http://anandology.com/blog/using-iterators-and-generators/
        with self.lock:
            index_array, current_index, current_batch_size = next(self.flow_generator)
@@ -204,36 +214,36 @@ class ImageDataGenerator(object):
        bX = np.zeros(tuple([current_batch_size] + list(self.X.shape)[1:]))
        for i, j in enumerate(index_array):
            x = self.X[j]
-            x = self.random_transform(x.astype("float32"))
+            x = self.random_transform(x.astype('float32'))
            x = self.standardize(x)
            bX[i] = x
        if self.save_to_dir:
            for i in range(current_batch_size):
                img = array_to_img(bX[i], scale=True)
-                img.save(self.save_to_dir + "/" + self.save_prefix + "_" + str(current_index + i) + "." + self.save_format)
+                img.save(self.save_to_dir + '/' + self.save_prefix + '_' + str(current_index + i) + '.' + self.save_format)
        bY = self.y[index_array]
        return bX, bY

    def __next__(self):
-        # for python 3.x
+        # for python 3.x.
        return self.next()

    def standardize(self, x):
+        if self.samplewise_center:
+            x -= np.mean(x, axis=1, keepdims=True)
+        if self.samplewise_std_normalization:
+            x /= (np.std(x, axis=1, keepdims=True) + 1e-7)
+
        if self.featurewise_center:
            x -= self.mean
        if self.featurewise_std_normalization:
-            x /= self.std
+            x /= (self.std + 1e-7)

        if self.zca_whitening:
-            flatx = np.reshape(x, (x.shape[0]*x.shape[1]*x.shape[2]))
+            flatx = np.reshape(x, (x.shape[0] * x.shape[1] * x.shape[2]))
            whitex = np.dot(flatx, self.principal_components)
            x = np.reshape(whitex, (x.shape[0], x.shape[1], x.shape[2]))

-        if self.samplewise_center:
-            x -= np.mean(x)
-        if self.samplewise_std_normalization:
-            x /= np.std(x)
-
        return x

    def random_transform(self, x):
@@ -242,34 +252,41 @@ class ImageDataGenerator(object):
        if self.width_shift_range or self.height_shift_range:
            x = random_shift(x, self.width_shift_range, self.height_shift_range)
        if self.horizontal_flip:
-            if random.random() < 0.5:
+            if np.random.random() < 0.5:
                x = horizontal_flip(x)
        if self.vertical_flip:
-            if random.random() < 0.5:
+            if np.random.random() < 0.5:
                x = vertical_flip(x)
        if self.shear_range:
-            x = random_shear(x,self.shear_range)
+            x = random_shear(x, self.shear_range)
        # TODO:
        # zoom
        # barrel/fisheye
-        # shearing
        # channel shifting
        return x

    def fit(self, X,
-            augment=False,  # fit on randomly augmented samples
-            rounds=1,  # if augment, how many augmentation passes over the data do we use
+            augment=False,
+            rounds=1,
            seed=None):
-        '''Required for featurewise_center, featurewise_std_normalization and zca_whitening.
+        '''Required for featurewise_center, featurewise_std_normalization
+        and zca_whitening.
+
+        # Arguments
+            X: Numpy array, the data to fit on.
+            augment: whether to fit on randomly augmented samples
+            rounds: if `augment`,
+                how many augmentation passes to do over the data
+            seed: random seed.
        '''
        X = np.copy(X)
        if augment:
-            aX = np.zeros(tuple([rounds*X.shape[0]]+list(X.shape)[1:]))
+            aX = np.zeros(tuple([rounds * X.shape[0]] + list(X.shape)[1:]))
            for r in range(rounds):
                for i in range(X.shape[0]):
                    img = array_to_img(X[i])
                    img = self.random_transform(img)
-                    aX[i+r*X.shape[0]] = img_to_array(img)
+                    aX[i + r * X.shape[0]] = img_to_array(img)
            X = aX

        if self.featurewise_center:
@@ -277,14 +294,13 @@ class ImageDataGenerator(object):
            X -= self.mean
        if self.featurewise_std_normalization:
            self.std = np.std(X, axis=0)
-            X /= self.std
+            X /= (self.std + 1e-7)

        if self.zca_whitening:
-            flatX = np.reshape(X, (X.shape[0], X.shape[1]*X.shape[2]*X.shape[3]))
-            fudge = 10e-6
+            flatX = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2] * X.shape[3]))
            sigma = np.dot(flatX.T, flatX) / flatX.shape[1]
            U, S, V = linalg.svd(sigma)
-            self.principal_components = np.dot(np.dot(U, np.diag(1. / np.sqrt(S + fudge))), U.T)
+            self.principal_components = np.dot(np.dot(U, np.diag(1. / np.sqrt(S + 10e-7))), U.T)


 class GraphImageDataGenerator(ImageDataGenerator):
@@ -4,19 +4,20 @@ import numpy as np
 import random
 from six.moves import range

-def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
-    """
-        Pad each sequence to the same length:
-        the length of the longest sequence.

-        If maxlen is provided, any sequence longer
-        than maxlen is truncated to maxlen. Truncation happens off either the beginning (default) or
-        the end of the sequence.
+def pad_sequences(sequences, maxlen=None, dtype='int32',
+                  padding='pre', truncating='pre', value=0.):
+    '''Pads each sequence to the same length:
+    the length of the longest sequence.

-        Supports post-padding and pre-padding (default).
+    If maxlen is provided, any sequence longer
+    than maxlen is truncated to maxlen.
+    Truncation happens off either the beginning (default) or
+    the end of the sequence.

-        Parameters:
-        -----------
+    Supports post-padding and pre-padding (default).
+
+    # Arguments
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
@@ -25,53 +26,64 @@ def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncati
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.

-        Returns:
+    # Returns
        x: numpy array with dimensions (number_of_sequences, maxlen)
-
-    """
+    '''
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

-    x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
+    # take the sample shape from the first non empty sequence
+    # checking for consistency in the main loop below.
+    sample_shape = tuple()
+    for s in sequences:
+        if len(s) > 0:
+            sample_shape = np.asarray(s).shape[1:]
+            break
+
+    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
-            continue # empty list was found
+            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
-            raise ValueError("Truncating type '%s' not understood" % padding)
+            raise ValueError('Truncating type "%s" not understood' % truncating)
+
+        # check `trunc` has expected shape
+        trunc = np.asarray(trunc, dtype=dtype)
+        if trunc.shape[1:] != sample_shape:
+            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
+                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
-            raise ValueError("Padding type '%s' not understood" % padding)
+            raise ValueError('Padding type "%s" not understood' % padding)
    return x


 def make_sampling_table(size, sampling_factor=1e-5):
-    '''
-        This generates an array where the ith element
-        is the probability that a word of rank i would be sampled,
-        according to the sampling distribution used in word2vec.
+    '''This generates an array where the ith element
+    is the probability that a word of rank i would be sampled,
+    according to the sampling distribution used in word2vec.

-        The word2vec formula is:
-            p(word) = min(1, sqrt(word.frequency/sampling_factor) / (word.frequency/sampling_factor))
+    The word2vec formula is:
+        p(word) = min(1, sqrt(word.frequency/sampling_factor) / (word.frequency/sampling_factor))

-        We assume that the word frequencies follow Zipf's law (s=1) to derive
-        a numerical approximation of frequency(rank):
-           frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
+    We assume that the word frequencies follow Zipf's law (s=1) to derive
+    a numerical approximation of frequency(rank):
+       frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
        where gamma is the Euler-Mascheroni constant.

-        Parameters:
-        -----------
-        size: int, number of possible words to sample. 
+    # Arguments
+        size: int, number of possible words to sample.
    '''
    gamma = 0.577
    rank = np.array(list(range(size)))
@@ -85,28 +97,28 @@ def make_sampling_table(size, sampling_factor=1e-5):
 def skipgrams(sequence, vocabulary_size,
              window_size=4, negative_samples=1., shuffle=True,
              categorical=False, sampling_table=None):
-    '''
-        Take a sequence (list of indexes of words),
-        returns couples of [word_index, other_word index] and labels (1s or 0s),
-        where label = 1 if 'other_word' belongs to the context of 'word',
-        and label=0 if 'other_word' is ramdomly sampled
+    '''Take a sequence (list of indexes of words),
+    returns couples of [word_index, other_word index] and labels (1s or 0s),
+    where label = 1 if 'other_word' belongs to the context of 'word',
+    and label=0 if 'other_word' is randomly sampled

-        Paramaters:
-        -----------
+    # Arguments
        vocabulary_size: int. maximum possible word index + 1
-        window_size: int. actually half-window. The window of a word wi will be [i-window_size, i+window_size+1]
-        negative_samples: float >= 0. 0 for no negative (=random) samples. 1 for same number as positive samples. etc.
-        categorical: bool. if False, labels will be integers (eg. [0, 1, 1 .. ]),
+        window_size: int. actually half-window.
+            The window of a word wi will be [i-window_size, i+window_size+1]
+        negative_samples: float >= 0. 0 for no negative (=random) samples.
+            1 for same number as positive samples. etc.
+        categorical: bool. if False, labels will be
+            integers (eg. [0, 1, 1 .. ]),
            if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]

-        Returns:
-        --------
+    # Returns
        couples, lables: where `couples` are int pairs and
            `labels` are either 0 or 1.

-        Notes:
-        ------
-        By convention, index 0 in the vocabulary is a non-word and will be skipped.
+    # Notes
+        By convention, index 0 in the vocabulary is
+        a non-word and will be skipped.
    '''
    couples = []
    labels = []
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
-'''
-    These preprocessing utils would greatly benefit
-    from a fast Cython rewrite.
+'''These preprocessing utilities would greatly benefit
+from a fast Cython rewrite.
 '''
 from __future__ import absolute_import

@@ -75,8 +74,7 @@ class Tokenizer(object):
        self.char_level = char_level

    def fit_on_texts(self, texts):
-        '''
-            required before using texts_to_sequences or texts_to_matrix
+        '''Required before using texts_to_sequences or texts_to_matrix

        # Arguments
            texts: can be a list of strings,
@@ -107,9 +105,8 @@ class Tokenizer(object):
            self.index_docs[self.word_index[w]] = c

    def fit_on_sequences(self, sequences):
-        '''
-            required before using sequences_to_matrix
-            (if fit_on_texts was never called)
+        '''Required before using sequences_to_matrix
+        (if fit_on_texts was never called)
        '''
        self.document_count = len(sequences)
        self.index_docs = {}
@@ -122,12 +119,11 @@ class Tokenizer(object):
                    self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
-        '''
-            Transform each text in texts in a sequence of integers.
-            Only top "nb_words" most frequent words will be taken into account.
-            Only words known by the tokenizer will be taken into account.
+        '''Transforms each text in texts into a sequence of integers.
+        Only top "nb_words" most frequent words will be taken into account.
+        Only words known by the tokenizer will be taken into account.

-            Returns a list of sequences.
+        Returns a list of sequences.
        '''
        res = []
        for vect in self.texts_to_sequences_generator(texts):
@@ -135,12 +131,14 @@ class Tokenizer(object):
        return res

    def texts_to_sequences_generator(self, texts):
-        '''
-            Transform each text in texts in a sequence of integers.
-            Only top "nb_words" most frequent words will be taken into account.
-            Only words known by the tokenizer will be taken into account.
+        '''Transforms each text in texts into a sequence of integers.
+        Only top "nb_words" most frequent words will be taken into account.
+        Only words known by the tokenizer will be taken into account.

-            Yields individual sequences.
+        Yields individual sequences.
+
+        # Arguments
+            texts: list of strings.
        '''
        nb_words = self.nb_words
        for text in texts:
@@ -150,56 +148,67 @@ class Tokenizer(object):
                i = self.word_index.get(w)
                if i is not None:
                    if nb_words and i >= nb_words:
-                        pass
+                        continue
                    else:
                        vect.append(i)
            yield vect

-    def texts_to_matrix(self, texts, mode="binary"):
-        '''
-            modes: binary, count, tfidf, freq
+    def texts_to_matrix(self, texts, mode='binary'):
+        '''Convert a list of texts to a Numpy matrix,
+        according to some vectorization mode.
+
+        # Arguments
+            texts: list of strings.
+            mode: one of "binary", "count", "tfidf", "freq"
        '''
        sequences = self.texts_to_sequences(texts)
        return self.sequences_to_matrix(sequences, mode=mode)

-    def sequences_to_matrix(self, sequences, mode="binary"):
-        '''
-            modes: binary, count, tfidf, freq
+    def sequences_to_matrix(self, sequences, mode='binary'):
+        '''Converts a list of sequences into a Numpy matrix,
+        according to some vectorization mode.
+
+        # Arguments
+            sequences: list of sequences
+                (a sequence is a list of integer word indices).
+            mode: one of "binary", "count", "tfidf", "freq"
        '''
        if not self.nb_words:
            if self.word_index:
                nb_words = len(self.word_index) + 1
            else:
-                raise Exception("Specify a dimension (nb_words argument), or fit on some text data first.")
+                raise Exception('Specify a dimension (nb_words argument), '
+                                'or fit on some text data first.')
        else:
            nb_words = self.nb_words

-        if mode == "tfidf" and not self.document_count:
-            raise Exception("Fit the Tokenizer on some data before using tfidf mode.")
+        if mode == 'tfidf' and not self.document_count:
+            raise Exception('Fit the Tokenizer on some data '
+                            'before using tfidf mode.')

        X = np.zeros((len(sequences), nb_words))
        for i, seq in enumerate(sequences):
            if not seq:
-                pass
+                continue
            counts = {}
            for j in seq:
                if j >= nb_words:
-                    pass
+                    continue
                if j not in counts:
                    counts[j] = 1.
                else:
                    counts[j] += 1
            for j, c in list(counts.items()):
-                if mode == "count":
+                if mode == 'count':
                    X[i][j] = c
-                elif mode == "freq":
+                elif mode == 'freq':
                    X[i][j] = c / len(seq)
-                elif mode == "binary":
+                elif mode == 'binary':
                    X[i][j] = 1
-                elif mode == "tfidf":
+                elif mode == 'tfidf':
                    tf = np.log(c / len(seq))
                    df = (1 + np.log(1 + self.index_docs.get(j, 0) / (1 + self.document_count)))
                    X[i][j] = tf / df
                else:
-                    raise Exception("Unknown vectorization mode: " + str(mode))
+                    raise Exception('Unknown vectorization mode: ' + str(mode))
        return X
@@ -0,0 +1,128 @@
+from __future__ import absolute_import
+from __future__ import print_function
+
+import tarfile
+import os
+import sys
+import shutil
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import URLError, HTTPError
+
+from ..utils.generic_utils import Progbar
+
+
+# Under Python 2, 'urlretrieve' relies on FancyURLopener from legacy
+# urllib module, known to have issues with proxy management
+if sys.version_info[0] == 2:
+    def urlretrieve(url, filename, reporthook=None, data=None):
+        '''Replacement for `urlretrieve` built on `urlopen`:
+        streams `url` into `filename` in chunks, calling
+        `reporthook(count, chunk_size, total_size)` after each chunk read.
+        '''
+        def chunk_read(response, chunk_size=8192, reporthook=None):
+            content_length = response.info().get('Content-Length')
+            if content_length is None:
+                # header absent: report an unknown size as -1,
+                # as the standard 'urlretrieve' does
+                total_size = -1
+            else:
+                total_size = int(content_length.strip())
+            count = 0
+            while 1:
+                chunk = response.read(chunk_size)
+                if not chunk:
+                    break
+                count += 1
+                if reporthook:
+                    reporthook(count, chunk_size, total_size)
+                yield chunk
+
+        response = urlopen(url, data)
+        with open(filename, 'wb') as fd:
+            for chunk in chunk_read(response, reporthook=reporthook):
+                fd.write(chunk)
+else:
+    from six.moves.urllib.request import urlretrieve
+
+
+def get_file(fname, origin, untar=False):
+    '''Downloads a file from a URL if it is not already
+    in the local datasets directory.
+
+    # Arguments
+        fname: name of the file inside the datasets directory.
+            If `untar` is set, the archive is saved
+            as `fname + '.tar.gz'`.
+        origin: URL of the file.
+        untar: boolean, whether to extract the downloaded
+            archive (expected to be a gzipped tar file).
+
+    # Returns
+        Path to the downloaded file, or to `fname` inside the
+        datasets directory if `untar` is set.
+    '''
+    datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
+    if not os.access(datadir_base, os.W_OK):
+        datadir_base = os.path.join('/tmp', '.keras')
+    datadir = os.path.join(datadir_base, 'datasets')
+    if not os.path.exists(datadir):
+        os.makedirs(datadir)
+
+    if untar:
+        untar_fpath = os.path.join(datadir, fname)
+        fpath = untar_fpath + '.tar.gz'
+    else:
+        fpath = os.path.join(datadir, fname)
+
+    if not os.path.exists(fpath):
+        print('Downloading data from', origin)
+        global progbar
+        progbar = None
+
+        def dl_progress(count, block_size, total_size):
+            global progbar
+            if total_size <= 0:
+                # unknown size: there is no target to report against
+                return
+            if progbar is None:
+                progbar = Progbar(total_size)
+            else:
+                progbar.update(count*block_size)
+
+        error_msg = 'URL fetch failure on {}: {} -- {}'
+        try:
+            try:
+                urlretrieve(origin, fpath, dl_progress)
+            # HTTPError is a subclass of URLError,
+            # so it has to be caught first
+            except HTTPError as e:
+                raise Exception(error_msg.format(origin, e.code, e.msg))
+            except URLError as e:
+                raise Exception(error_msg.format(origin, e.errno, e.reason))
+        except (Exception, KeyboardInterrupt) as e:
+            # do not leave a partial download behind
+            if os.path.exists(fpath):
+                os.remove(fpath)
+            raise e
+        progbar = None
+
+    if untar:
+        if not os.path.exists(untar_fpath):
+            print('Untaring file...')
+            tfile = tarfile.open(fpath, 'r:gz')
+            try:
+                tfile.extractall(path=datadir)
+            except (Exception, KeyboardInterrupt) as e:
+                # do not leave a partial extraction behind
+                if os.path.exists(untar_fpath):
+                    if os.path.isfile(untar_fpath):
+                        os.remove(untar_fpath)
+                    else:
+                        shutil.rmtree(untar_fpath)
+                raise e
+            finally:
+                # always release the archive handle
+                tfile.close()
+        return untar_fpath
+
+    return fpath
@@ -63,15 +63,15 @@ class Progbar(object):
            numdigits = int(np.floor(np.log10(self.target))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
-            prog = float(current)/self.target
-            prog_width = int(self.width*prog)
+            prog = float(current) / self.target
+            prog_width = int(self.width * prog)
            if prog_width > 0:
-                bar += ('='*(prog_width-1))
+                bar += ('=' * (prog_width-1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
-            bar += ('.'*(self.width-prog_width))
+            bar += ('.' * (self.width - prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)
@@ -80,7 +80,7 @@ class Progbar(object):
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
-            eta = time_per_unit*(self.target - current)
+            eta = time_per_unit * (self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
@@ -99,7 +99,7 @@ class Progbar(object):

            self.total_width += len(info)
            if prev_total_width > self.total_width:
-                info += ((prev_total_width-self.total_width) * " ")
+                info += ((prev_total_width - self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()
@@ -120,4 +120,4 @@ class Progbar(object):
                sys.stdout.write(info + "\n")

    def add(self, n, values=[]):
-        self.update(self.seen_so_far+n, values)
+        self.update(self.seen_so_far + n, values)
@@ -10,6 +10,7 @@ from ..layers.embeddings import *
 from ..layers.noise import *
 from ..layers.normalization import *
 from ..layers.recurrent import *
+from ..layers.wrappers import *
 from ..layers import containers
 from .. import regularizers
 from .. import constraints
@@ -56,6 +57,7 @@ def container_from_config(original_layer_dict, custom_objects={}):
        for node in nodes:
            layer = container_from_config(layer_dict['nodes'].get(node['name']))
            node['layer'] = layer
+            node['create_output'] = False  # outputs will be added below
            graph_layer.add_node(**node)

        outputs = layer_dict.get('output_config')
@@ -71,6 +73,13 @@ def container_from_config(original_layer_dict, custom_objects={}):
                kwargs[kwarg] = layer_dict[kwarg]
        return AutoEncoder(**kwargs)

+    elif name == 'TimeDistributed':
+        child_layer = container_from_config(layer_dict.pop('layer'))
+        # the "name" keyword argument of layers is saved as "custom_name"
+        if 'custom_name' in layer_dict:
+            layer_dict['name'] = layer_dict.pop('custom_name')
+        return TimeDistributed(child_layer, **layer_dict)
+
    else:  # this is a non-topological layer (e.g. Dense, etc.)
        layer_dict.pop('name')

@@ -89,6 +98,7 @@ def container_from_config(original_layer_dict, custom_objects={}):
        # the "name" keyword argument of layers is saved as "custom_name"
        if 'custom_name' in layer_dict:
            layer_dict['name'] = layer_dict.pop('custom_name')
+
        base_layer = get_layer(name, layer_dict)
        return base_layer

@@ -149,6 +149,6 @@ def to_graph(model, **kwargs):
    return ModelToDot()(model, **kwargs)


-def plot(model, to_file='model.png'):
-    graph = to_graph(model)
+def plot(model, to_file='model.png', **kwargs):
+    graph = to_graph(model, **kwargs)
    graph.write_png(to_file)
@@ -1,266 +1,268 @@
 from __future__ import absolute_import
-import abc
 import copy
+import inspect
+import types
 import numpy as np

 from ..utils.np_utils import to_categorical
+from ..models import Sequential


 class BaseWrapper(object):
-    """
-    Base class for the Keras scikit-learn wrapper.
+    '''Base class for the Keras scikit-learn wrapper.

-    Warning: This class should not be used directly. Use derived classes instead.
+    Warning: This class should not be used directly.
+    Use descendant classes instead.

-    Parameters
-    ----------
-    train_batch_size : int, optional
-        Number of training samples evaluated at a time.
-    test_batch_size : int, optional
-        Number of test samples evaluated at a time.
-    nb_epochs : int, optional
-        Number of training epochs.
-    shuffle : boolean, optional
-        Whether to shuffle the samples at each epoch.
-    show_accuracy : boolean, optional
-        Whether to display class accuracy in the logs at each epoch.
-    validation_split : float [0, 1], optional
-        Fraction of the data to use as held-out validation data.
-    validation_data : tuple (X, y), optional
-        Data to be used as held-out validation data. Will override validation_split.
-    callbacks : list, optional
-        List of callbacks to apply during training.
-    verbose : int, optional
-        Verbosity level.
-    """
-    __metaclass__ = abc.ABCMeta
+    # Arguments
+        build_fn: callable function or class instance
+        sk_params: model parameters & fitting parameters

-    @abc.abstractmethod
-    def __init__(self, model, optimizer, loss,
-                 train_batch_size=128, test_batch_size=128,
-                 nb_epoch=100, shuffle=True, show_accuracy=False,
-                 validation_split=0, validation_data=None, callbacks=None,
-                 verbose=0,):
-        self.model = model
-        self.optimizer = optimizer
-        self.loss = loss
-        self.compiled_model_ = None
-        self.classes_ = []
-        self.config_ = []
-        self.weights_ = []
+    The build_fn should construct, compile and return a Keras model, which
+    will then be used to fit/predict. One of the following
+    three values could be passed to build_fn:
+    1. A function
+    2. An instance of a class that implements the __call__ method
+    3. None. This means you implement a class that inherits from either
+    `KerasClassifier` or `KerasRegressor`. The __call__ method of the
+    present class will then be treated as the default build_fn.

-        self.train_batch_size = train_batch_size
-        self.test_batch_size = test_batch_size
-        self.nb_epoch = nb_epoch
-        self.shuffle = shuffle
-        self.show_accuracy = show_accuracy
-        self.validation_split = validation_split
-        self.validation_data = validation_data
-        self.callbacks = [] if callbacks is None else callbacks
+    `sk_params` takes both model parameters and fitting parameters. Legal model
+    parameters are the arguments of `build_fn`. Note that like all other
+    estimators in scikit-learn, 'build_fn' should provide default values for
+    its arguments, so that you could create the estimator without passing any
+    values to `sk_params`.

-        self.verbose = verbose
+    `sk_params` could also accept parameters for calling `fit`, `predict`,
+    `predict_proba`, and `score` methods (e.g., `nb_epoch`, `batch_size`).
+    fitting (predicting) parameters are selected in the following order:
+
+    1. Values passed to the dictionary arguments of
+    `fit`, `predict`, `predict_proba`, and `score` methods
+    2. Values passed to `sk_params`
+    3. The default values of the `keras.models.Sequential`
+    `fit`, `predict`, `predict_proba` and `score` methods
+
+    When using scikit-learn's `grid_search` API, legal tunable parameters are
+    those you could pass to `sk_params`, including fitting parameters.
+    In other words, you could use `grid_search` to search for the best
+    `batch_size` or `nb_epoch` as well as the model parameters.
+    '''
+
+    def __init__(self, build_fn=None, **sk_params):
+        '''Store the model-building callable and the keyword parameters,
+        then validate the parameter names with `check_params`.
+
+        # Arguments
+            build_fn: callable function or class instance, or None
+            sk_params: model parameters & fitting parameters
+        '''
+        self.build_fn = build_fn
+        self.sk_params = sk_params
+        # check_params reads self.build_fn, so it runs after the assignments
+        self.check_params(sk_params)
+
+    def check_params(self, params):
+        '''Check for user typos in "params" keys to avoid
+        unwanted usage of default values.
+
+        A key is legal when it names an argument of `Sequential.fit`,
+        `Sequential.predict`, `Sequential.predict_classes`,
+        `Sequential.evaluate` or of the model-building callable
+        (`build_fn`, its `__call__`, or this object's `__call__` when
+        `build_fn` is None).
+
+        # Arguments
+            params: dictionary
+                The parameters to be checked
+
+        # Raises
+            AssertionError: if a key of `params` is not a legal parameter.
+        '''
+        if self.build_fn is None:
+            build_fn = self.__call__
+        elif not isinstance(self.build_fn, types.FunctionType):
+            build_fn = self.build_fn.__call__
+        else:
+            build_fn = self.build_fn
+
+        legal_params = set()
+        for fn in (Sequential.fit, Sequential.predict,
+                   Sequential.predict_classes, Sequential.evaluate, build_fn):
+            legal_params.update(inspect.getargspec(fn)[0])
+
+        for params_name in params:
+            if params_name not in legal_params:
+                # raised explicitly: an `assert` statement is stripped under
+                # `python -O`, which would silently disable this check
+                raise AssertionError(
+                    '{} is not a legal parameter'.format(params_name))

    def get_params(self, deep=True):
-        """
-        Get parameters for this estimator.
+        '''Get parameters for this estimator.

-        Parameters
-        ----------
-        deep: boolean, optional
-            If True, will return the parameters for this estimator and
-            contained subobjects that are estimators.
+        # Arguments
+            deep: boolean, optional
+                If True, will return the parameters for this estimator and
+                contained sub-objects that are estimators.

-        Returns
-        -------
-        params : dict
-            Dictionary of parameter names mapped to their values.
-        """
-        return {'model': self.model, 'optimizer': self.optimizer, 'loss': self.loss}
+        # Returns
+            params : dict
+                Dictionary of parameter names mapped to their values.
+        '''
+        res = copy.deepcopy(self.sk_params)
+        res.update({'build_fn': self.build_fn})
+        return res

    def set_params(self, **params):
-        """
-        Set the parameters of this estimator.
+        '''Set the parameters of this estimator.

-        Parameters
-        ----------
+        # Arguments
        params: dict
            Dictionary of parameter names mapped to their values.

-        Returns
-        -------
-        self
-        """
-        for parameter, value in params.items():
-            setattr(self, parameter, value)
+        # Returns
+            self
+        '''
+        self.check_params(params)
+        self.sk_params.update(params)
        return self

-    def fit(self, X, y):
-        """
-        Fit the model according to the given training data.
+    def fit(self, X, y, **kwargs):
+        '''Construct a new model with build_fn and fit the model according
+        to the given training data.

-        Makes a copy of the un-compiled model definition to use for
-        compilation and fitting, leaving the original definition
-        intact.
+        # Arguments
+            X : array-like, shape `(n_samples, n_features)`
+                Training samples where n_samples is the number of samples
+                and n_features is the number of features.
+            y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
+                True labels for X.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.fit`

-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True labels for X.
+        # Returns
+            history : object
+                details about the training history at each epoch.
+        '''

-        Returns
-        -------
-        history : object
-            Returns details about the training history at each epoch.
-        """
-        if len(y.shape) == 1:
-            self.classes_ = list(np.unique(y))
-            if self.loss == 'categorical_crossentropy':
-                y = to_categorical(y)
+        if self.build_fn is None:
+            self.model = self.__call__(**self.filter_sk_params(self.__call__))
+        elif not isinstance(self.build_fn, types.FunctionType):
+            self.model = self.build_fn(
+                **self.filter_sk_params(self.build_fn.__call__))
        else:
-            self.classes_ = np.arange(0, y.shape[1])
+            self.model = self.build_fn(**self.filter_sk_params(self.build_fn))

-        self.compiled_model_ = copy.deepcopy(self.model)
-        self.compiled_model_.compile(optimizer=self.optimizer, loss=self.loss)
-        history = self.compiled_model_.fit(
-            X, y, batch_size=self.train_batch_size, nb_epoch=self.nb_epoch, verbose=self.verbose,
-            shuffle=self.shuffle, show_accuracy=self.show_accuracy,
-            validation_split=self.validation_split, validation_data=self.validation_data,
-            callbacks=self.callbacks)
+        if self.model.loss.__name__ == 'categorical_crossentropy' and len(y.shape) != 2:
+            y = to_categorical(y)

-        self.config_ = self.model.get_config()
-        self.weights_ = self.model.get_weights()
+        fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
+        fit_args.update(kwargs)
+
+        history = self.model.fit(X, y, **fit_args)

        return history

+    def filter_sk_params(self, fn, override=None):
+        '''Filter sk_params and return those in fn's arguments.
+
+        # Arguments
+            fn : arbitrary function
+            override: dictionary, values to override sk_params
+                (optional)
+
+        # Returns
+            res : dictionary containing the variables found
+                in both sk_params and fn's arguments, updated
+                with the entries of `override`.
+        '''
+        fn_args = inspect.getargspec(fn)[0]
+        res = {name: value for name, value in self.sk_params.items()
+               if name in fn_args}
+        # entries of `override` are merged as-is; they are not filtered
+        # against fn's arguments
+        if override:
+            res.update(override)
+        return res
+

 class KerasClassifier(BaseWrapper):
-    """
-    Implementation of the scikit-learn classifier API for Keras.
+    '''Implementation of the scikit-learn classifier API for Keras.
+    '''

-    Parameters
-    ----------
-    model : object
-        An un-compiled Keras model object is required to use the scikit-learn wrapper.
-    optimizer : string
-        Optimization method used by the model during compilation/training.
-    loss : string
-        Loss function used by the model during compilation/training.
-    """
-    def __init__(self, model, optimizer='adam', loss='categorical_crossentropy', **kwargs):
-        super(KerasClassifier, self).__init__(model, optimizer, loss, **kwargs)
+    def predict(self, X, **kwargs):
+        '''Returns the class predictions for the given test data.

-    def predict(self, X):
-        """
-        Returns the class predictions for the given test data.
+        # Arguments
+            X: array-like, shape `(n_samples, n_features)`
+                Test samples where n_samples is the number of samples
+                and n_features is the number of features.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.predict_classes`.

-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+        # Returns
+            preds: array-like, shape `(n_samples,)`
+                Class predictions.
+        '''
+        kwargs = self.filter_sk_params(Sequential.predict_classes, kwargs)
+        return self.model.predict_classes(X, **kwargs)

-        Returns
-        -------
-        preds : array-like, shape = (n_samples)
-            Class predictions.
-        """
-        return self.compiled_model_.predict_classes(
-            X, batch_size=self.test_batch_size, verbose=self.verbose)
+    def predict_proba(self, X, **kwargs):
+        '''Returns class probability estimates for the given test data.

-    def predict_proba(self, X):
-        """
-        Returns class probability estimates for the given test data.
+        # Arguments
+            X: array-like, shape `(n_samples, n_features)`
+                Test samples where n_samples is the number of samples
+                and n_features is the number of features.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.predict_proba`.

-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+        # Returns
+            proba: array-like, shape `(n_samples, n_outputs)`
+                Class probability estimates.
+        '''
+        kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
+        return self.model.predict_proba(X, **kwargs)

-        Returns
-        -------
-        proba : array-like, shape = (n_samples, n_outputs)
-            Class probability estimates.
-        """
-        return self.compiled_model_.predict_proba(
-            X, batch_size=self.test_batch_size, verbose=self.verbose)
+    def score(self, X, y, **kwargs):
+        '''Returns the mean accuracy on the given test data and labels.

-    def score(self, X, y):
-        """
-        Returns the mean accuracy on the given test data and labels.
+        # Arguments
+            X: array-like, shape `(n_samples, n_features)`
+                Test samples where n_samples is the number of samples
+                and n_features is the number of features.
+            y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
+                True labels for X.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.evaluate`.

-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True labels for X.
-
-        Returns
-        -------
-        score : float
-            Mean accuracy of predictions on X wrt. y.
-        """
-        loss, accuracy = self.compiled_model_.evaluate(
-            X, y, batch_size=self.test_batch_size, show_accuracy=True, verbose=self.verbose)
+        # Returns
+            score: float
+                Mean accuracy of predictions on X wrt. y.
+        '''
+        kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
+        kwargs.update({'show_accuracy': True})
+        loss, accuracy = self.model.evaluate(X, y, **kwargs)
        return accuracy


 class KerasRegressor(BaseWrapper):
-    """
-    Implementation of the scikit-learn regressor API for Keras.
+    '''Implementation of the scikit-learn regressor API for Keras.
+    '''

-    Parameters
-    ----------
-    model : object
-        An un-compiled Keras model object is required to use the scikit-learn wrapper.
-    optimizer : string
-        Optimization method used by the model during compilation/training.
-    loss : string
-        Loss function used by the model during compilation/training.
-    """
-    def __init__(self, model, optimizer='adam', loss='mean_squared_error', **kwargs):
-        super(KerasRegressor, self).__init__(model, optimizer, loss, **kwargs)
+    def predict(self, X, **kwargs):
+        '''Returns predictions for the given test data.

-    def predict(self, X):
-        """
-        Returns predictions for the given test data.
+        # Arguments
+            X: array-like, shape `(n_samples, n_features)`
+                Test samples where n_samples is the number of samples
+                and n_features is the number of features.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.predict`.
+        # Returns
+            preds: array-like, shape `(n_samples,)`
+                Predictions.
+        '''
+        kwargs = self.filter_sk_params(Sequential.predict, kwargs)
+        return self.model.predict(X, **kwargs)

-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+    def score(self, X, y, **kwargs):
+        '''Returns the loss on the given test data and labels.

-        Returns
-        -------
-        preds : array-like, shape = (n_samples)
-            Predictions.
-        """
-        return self.compiled_model_.predict(
-            X, batch_size=self.test_batch_size, verbose=self.verbose).ravel()
+        # Arguments
+            X: array-like, shape `(n_samples, n_features)`
+                Test samples where n_samples is the number of samples
+                and n_features is the number of features.
+            y: array-like, shape `(n_samples,)`
+                True labels for X.
+            kwargs: dictionary arguments
+                Legal arguments are the arguments of `Sequential.evaluate`.

-    def score(self, X, y):
-        """
-        Returns the mean accuracy on the given test data and labels.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
-        y : array-like, shape = (n_samples)
-            True labels for X.
-
-        Returns
-        -------
-        score : float
-            Loss from predictions on X wrt. y.
-        """
-        loss = self.compiled_model_.evaluate(
-            X, y, batch_size=self.test_batch_size, show_accuracy=False, verbose=self.verbose)
+        # Returns
+            score: float
+                Loss from predictions on X wrt. y.
+        '''
+        kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
+        kwargs.update({'show_accuracy': False})
+        loss = self.model.evaluate(X, y, **kwargs)
        return loss
@@ -3,12 +3,12 @@ from setuptools import find_packages


 setup(name='Keras',
-      version='0.3.1',
-      description='Theano-based Deep Learning library',
+      version='0.3.3',
+      description='Deep Learning for Python',
      author='Francois Chollet',
      author_email='francois.chollet@gmail.com',
      url='https://github.com/fchollet/keras',
-      download_url='https://github.com/fchollet/keras/tarball/0.3.1',
+      download_url='https://github.com/fchollet/keras/tarball/0.3.3',
      license='MIT',
      install_requires=['theano', 'pyyaml', 'six'],
      extras_require={
@@ -35,7 +35,7 @@ def test_image_classification():
        Activation('relu'),
        Dense(y_test.shape[-1], activation='softmax')
    ])
-    model.compile(loss='categorical_crossentropy', optimizer='sgd')
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    history = model.fit(X_train, y_train, nb_epoch=10, batch_size=16,
                        validation_data=(X_test, y_test),
                        show_accuracy=True, verbose=0)
@@ -40,6 +40,8 @@ class TestBackend(object):

    def test_linear_operations(self):
        check_two_tensor_operation('dot', (4, 2), (2, 4))
+        check_two_tensor_operation('batch_dot', (4, 2, 3), (4, 5, 3),
+                                   axes=((2,), (2,)))
        check_single_tensor_operation('transpose', (4, 2))

    def test_shape_operations(self):
@@ -145,6 +147,7 @@ class TestBackend(object):
        check_single_tensor_operation('exp', (4, 2))
        check_single_tensor_operation('log', (4, 2))
        check_single_tensor_operation('round', (4, 2))
+        check_single_tensor_operation('sign', (4, 2))
        check_single_tensor_operation('pow', (4, 2), a=3)
        check_single_tensor_operation('clip', (4, 2), min_value=0.4,
                                      max_value=0.6)
@@ -273,7 +276,7 @@ class TestBackend(object):
        check_single_tensor_operation('tanh', (4, 2))

        # dropout
-        val = np.random.random((20, 20))
+        val = np.random.random((100, 100))
        xth = KTH.variable(val)
        xtf = KTF.variable(val)
        zth = KTH.eval(KTH.dropout(xth, level=0.2))
@@ -355,6 +358,20 @@ class TestBackend(object):
        assert(np.max(rand) <= max)
        assert(np.min(rand) >= min)

+    def test_random_binomial(self):
+        '''Draw a large random_binomial sample from each backend and check
+        its shape, its empirical mean and that it only spans 0 and 1.'''
+        p = 0.5
+        shape = (1000, 1000)
+        # same checks for both backends, KTF first, then KTH
+        for backend in (KTF, KTH):
+            sample = backend.eval(backend.random_binomial(shape, p))
+            assert(sample.shape == shape)
+            assert(np.abs(np.mean(sample) - p) < 0.01)
+            assert(np.max(sample) == 1)
+            assert(np.min(sample) == 0)
+

 if __name__ == '__main__':
    pytest.main([__file__])
@@ -29,7 +29,7 @@ def test_leaky_relu():
        layer.input = K.variable(-inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, -inp*alpha)
+            assert_allclose(outp, -inp * alpha)

        config = layer.get_config()
        assert config['alpha'] == alpha
@@ -53,7 +53,7 @@ def test_prelu():

        layer.input = K.variable(-inp)
        outp = K.eval(layer.get_output(train))
-        assert_allclose(-alphas*inp, outp)
+        assert_allclose(-alphas * inp, outp)

        # test with default weights
        layer = PReLU(input_shape=inp.flatten().shape)
@@ -65,7 +65,7 @@ def test_prelu():
        layer.input = K.variable(-inp)
        outp = K.eval(layer.get_output(train))

-        assert_allclose(0., alphas*outp)
+        assert_allclose(0., alphas * outp)

        layer.get_config()

@@ -84,7 +84,7 @@ def test_elu():
        layer.input = K.variable(-inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, alpha*(np.exp(-inp)-1.), rtol=1e-3)
+            assert_allclose(outp, alpha * (np.exp(-inp) - 1.), rtol=1e-3)

        config = layer.get_config()
        assert config['alpha'] == alpha
@@ -107,7 +107,7 @@ def test_parametric_softplus():
            layer.build()
            for train in [True, False]:
                outp = K.eval(layer.get_output(train))
-                assert_allclose(outp, alpha*np.log(1.+np.exp(beta*inp)),
+                assert_allclose(outp, alpha * np.log(1. + np.exp(beta * inp)),
                                atol=1e-3)

            config = layer.get_config()
@@ -126,12 +126,12 @@ def test_thresholded_linear():
        layer.input = K.variable(inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, inp*(np.abs(inp) >= theta))
+            assert_allclose(outp, inp * (np.abs(inp) >= theta))

        layer.input = K.variable(-inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, -inp*(np.abs(inp) >= theta))
+            assert_allclose(outp, -inp * (np.abs(inp) >= theta))

        config = layer.get_config()
        assert config['theta'] == theta
@@ -148,16 +148,34 @@ def test_thresholded_relu():
        layer.input = K.variable(inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, inp*(inp > theta))
+            assert_allclose(outp, inp * (inp > theta))

        layer.input = K.variable(-inp)
        for train in [True, False]:
            outp = K.eval(layer.get_output(train))
-            assert_allclose(outp, -inp*(-inp > theta))
+            assert_allclose(outp, -inp * (-inp > theta))

        config = layer.get_config()
        assert config['theta'] == theta


+def test_srelu():
+    '''Check the SReLU layer output against precomputed values for a
+    fixed set of weights, with train set to True and to False.'''
+    from keras.layers.advanced_activations import SReLU
+    np.random.seed(1337)
+    x = np.array([-2, -1., -0.5, 0., 0.5, 1., 2.])
+    expected = np.array([-1.5, -1., -0.5, 0., 0.5, 1., 3.])
+    ones = np.ones(len(x))
+    for train in [True, False]:
+        layer = SReLU(input_shape=x.flatten().shape)
+        # four constant weight vectors, in the order expected by set_weights
+        layer.set_weights([ones * scale for scale in (-1., 0.5, 2., 2.)])
+        layer.input = K.variable(x)
+        actual = K.eval(layer.get_output(train))
+        assert_allclose(expected, actual)
+
+        # the config must be obtainable without raising
+        layer.get_config()
+
+
 if __name__ == '__main__':
    pytest.main([__file__])
@@ -6,7 +6,7 @@ from numpy.testing import assert_allclose

 from keras import backend as K
 from keras.layers.core import Dense
-from keras.models import Sequential
+from keras.models import Sequential, Graph


 def test_layer_call():
@@ -56,5 +56,157 @@ def test_sequential_call():
    assert_allclose(y1, y2)


+def test_graph_call():
+    """Check that Graph.__call__ agrees with Graph.predict."""
+    nb_samples, input_dim, output_dim = 3, 10, 5
+    inner = Graph()
+    inner.add_input('input', input_shape=(input_dim, ))
+    inner.add_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                   input='input', name='output', create_output=True)
+
+    inner.compile('sgd', {'output': 'mse'})
+
+    # flat model: evaluate the symbolic call through a backend function
+    X = K.placeholder(ndim=2)
+    symbolic_out = inner(X)
+    call_fn = K.function([X], [symbolic_out])
+
+    x = np.ones((nb_samples, input_dim)).astype(K.floatx())
+    from_call = call_fn([x])[0].astype(K.floatx())
+    from_predict = inner.predict({'input': x})['output']
+    # both evaluation paths must produce the same numbers
+    assert_allclose(from_call, from_predict)
+
+    # nested model: an outer Graph whose only node is the Graph built above
+    outer = Graph()
+    outer.add_input('input', input_shape=(input_dim, ))
+    outer.add_node(inner, input='input', name='output', create_output=True)
+    # the inner Graph is being reused, so the cache has to be switched off
+    outer.cache_enabled = False
+    outer.compile('sgd', {'output': 'mse'})
+
+    symbolic_out = outer(X)
+    call_fn = K.function([X], [symbolic_out])
+
+    from_call = call_fn([x])[0].astype(K.floatx())
+    from_predict = outer.predict({'input': x})['output']
+    # both evaluation paths must produce the same numbers
+    assert_allclose(from_call, from_predict)
+
+
+def test_graph_multiple_in_out_call():
+    """Test keras.models.Graph.__call__ with multiple inputs and/or outputs"""
+    nb_samples, input_dim, output_dim = 3, 10, 5
+    model = Graph()
+    model.add_input('input1', input_shape=(input_dim, ))
+    model.add_input('input2', input_shape=(input_dim, ))
+    model.add_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                   inputs=['input1', 'input2'], merge_mode='sum', name='output', create_output=True)
+
+    model.compile('sgd', {'output': 'mse'})
+
+    # two inputs, one output: the model is called with a dict of placeholders
+    X1 = K.placeholder(ndim=2)
+    X2 = K.placeholder(ndim=2)
+    Y = model({'input1': X1, 'input2': X2})['output']
+    f = K.function([X1, X2], [Y])
+
+    x1 = np.ones((nb_samples, input_dim)).astype(K.floatx())
+    x2 = np.ones((nb_samples, input_dim)).astype(K.floatx()) * -2
+    y1 = f([x1, x2])[0].astype(K.floatx())
+    y2 = model.predict({'input1': x1, 'input2': x2})['output']
+    # results of __call__ should match model.predict
+    assert_allclose(y1, y2)
+
+    # test with single input, multiple outputs
+    model2 = Graph()
+    model2.add_input('input', input_shape=(input_dim, ))
+    model2.add_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                    input='input', name='output1', create_output=True)
+    model2.add_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                    input='input', name='output2', create_output=True)
+
+    model2.compile('sgd', {'output1': 'mse', 'output2': 'mse'})
+
+    # called with a bare placeholder; the result is indexed by output name
+    X = K.placeholder(ndim=2)
+    Y = model2(X)
+    f = K.function([X], [Y['output1'], Y['output2']])
+
+    x = np.ones((nb_samples, input_dim)).astype(K.floatx())
+    out = f([x])
+    y1a = out[0].astype(K.floatx())
+    y1b = out[1].astype(K.floatx())
+    y2 = model2.predict({'input': x})
+    # results of __call__ should match model.predict
+    assert_allclose(y1a, y2['output1'])
+    assert_allclose(y1b, y2['output2'])
+
+    # test with multiple inputs, multiple outputs
+    model3 = Graph()
+    model3.add_input('input1', input_shape=(input_dim, ))
+    model3.add_input('input2', input_shape=(input_dim, ))
+    model3.add_shared_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                           inputs=['input1', 'input2'], name='output',
+                           outputs=['output1', 'output2'], create_output=True)
+    model3.compile('sgd', {'output1': 'mse', 'output2': 'mse'})
+
+    # reuses the X1 and X2 placeholders created for the first model
+    Y = model3({'input1': X1, 'input2': X2})
+    f = K.function([X1, X2], [Y['output1'], Y['output2']])
+
+    x1 = np.ones((nb_samples, input_dim)).astype(K.floatx())
+    x2 = np.ones((nb_samples, input_dim)).astype(K.floatx()) * -2
+    out = f([x1, x2])
+    y1a = out[0].astype(K.floatx())
+    y1b = out[1].astype(K.floatx())
+    y2 = model3.predict({'input1': x1, 'input2': x2})
+    # results of __call__ should match model.predict
+    assert_allclose(y1a, y2['output1'])
+    assert_allclose(y1b, y2['output2'])
+
+
+def test_nested_call():
+    """Check __call__ against predict for nested Sequential/Graph models."""
+    nb_samples, input_dim, output_dim = 3, 10, 5
+    X = K.placeholder(ndim=2)
+    x = np.ones((nb_samples, input_dim)).astype(K.floatx())
+
+    # case 1: a Graph added as the only layer of a Sequential
+    inner_graph = Graph()
+    inner_graph.add_input('input', input_shape=(input_dim, ))
+    inner_graph.add_node(Dense(output_dim=output_dim, input_dim=input_dim),
+                         input='input', name='output', create_output=True)
+
+    outer_seq = Sequential()
+    outer_seq.add(inner_graph)
+    outer_seq.compile('sgd', 'mse')
+
+    symbolic_out = outer_seq(X)
+    call_fn = K.function([X], [symbolic_out])
+
+    from_call = call_fn([x])[0].astype(K.floatx())
+    from_predict = outer_seq.predict(x)
+    # both evaluation paths must produce the same numbers
+    assert_allclose(from_call, from_predict)
+
+    # case 2: a Sequential added as the only node of a Graph
+    inner_seq = Sequential()
+    inner_seq.add(Dense(output_dim=output_dim, input_dim=input_dim))
+    outer_graph = Graph()
+    outer_graph.add_input('input', input_shape=(input_dim, ))
+    outer_graph.add_node(inner_seq, input='input', name='output',
+                         create_output=True)
+    outer_graph.compile('sgd', {'output': 'mse'})
+
+    symbolic_out = outer_graph(X)
+    call_fn = K.function([X], [symbolic_out])
+
+    from_call = call_fn([x])[0].astype(K.floatx())
+    from_predict = outer_graph.predict({'input': x})['output']
+    # both evaluation paths must produce the same numbers
+    assert_allclose(from_call, from_predict)
+
+
 if __name__ == '__main__':
    pytest.main([__file__])
@@ -113,6 +113,38 @@ def test_convolution_2d():
                            layer.get_config()


+def test_convolution_2d_dim_ordering():
+    """Convolution2D must agree across 'th'/'tf' layouts, up to a transpose."""
+    nb_filter, nb_row, nb_col, stack_size = 4, 3, 2, 3
+
+    np.random.seed(1337)
+    kernel_th = np.random.random((nb_filter, stack_size, nb_row, nb_col))
+    bias = np.random.random(nb_filter)
+    x_th = np.random.random((1, stack_size, 10, 10))
+
+    # reference run with data and kernel in their original ('th') axis order
+    layer_th = convolutional.Convolution2D(nb_filter, nb_row, nb_col,
+                                           weights=[kernel_th, bias],
+                                           input_shape=x_th.shape[1:],
+                                           dim_ordering='th')
+    layer_th.input = K.variable(x_th)
+    out_th = K.eval(layer_th.get_output(False))
+
+    # same data and kernel with their axes permuted for the 'tf' layer
+    x_tf = np.transpose(x_th, (0, 2, 3, 1))
+    kernel_tf = np.transpose(kernel_th, (2, 3, 1, 0))
+    layer_tf = convolutional.Convolution2D(nb_filter, nb_row, nb_col,
+                                           weights=[kernel_tf, bias],
+                                           input_shape=x_tf.shape[1:],
+                                           dim_ordering='tf')
+    layer_tf.input = K.variable(x_tf)
+    out_tf = K.eval(layer_tf.get_output(False))
+
+    # bring the 'th' result into the 'tf' axis order before comparing
+    expected_tf = np.transpose(out_th, (0, 2, 3, 1))
+    assert_allclose(out_tf, expected_tf, atol=1e-05)
+
+
 def test_maxpooling_2d():
    nb_samples = 9
    stack_size = 7
@@ -131,16 +163,128 @@ def test_maxpooling_2d():
        layer.get_config()


+def test_maxpooling_2d_dim_ordering():
+    """MaxPooling2D must agree across 'th'/'tf' layouts, up to a transpose."""
+    stack_size = 3
+
+    # fixed seed: the random input is identical on every run
+    np.random.seed(1337)
+    x_th = np.random.random((1, stack_size, 10, 10))
+
+    layer = convolutional.MaxPooling2D((2, 2), input_shape=x_th.shape[1:],
+                                       dim_ordering='th')
+    layer.input = K.variable(x_th)
+    out_th = K.eval(layer.get_output(False))
+
+    # same data with axis 1 moved last for the 'tf' layer
+    x_tf = np.transpose(x_th, (0, 2, 3, 1))
+    layer = convolutional.MaxPooling2D((2, 2), input_shape=x_tf.shape[1:],
+                                       dim_ordering='tf')
+    layer.input = K.variable(x_tf)
+    out_tf = K.eval(layer.get_output(False))
+
+    assert_allclose(out_tf, np.transpose(out_th, (0, 2, 3, 1)), atol=1e-05)
+
+
 def test_averagepooling_2d():
    nb_samples = 9
    stack_size = 7
    input_nb_row = 11
    input_nb_col = 12
-    pool_size = (3, 3)

    input = np.ones((nb_samples, stack_size, input_nb_row, input_nb_col))
-    for strides in [(1, 1), (2, 2)]:
-        layer = convolutional.AveragePooling2D(strides=strides,
+    for border_mode in ['valid', 'same']:
+        for pool_size in [(2, 2), (3, 3), (4, 4), (5, 5)]:
+            for strides in [(1, 1), (2, 2)]:
+                layer = convolutional.AveragePooling2D(strides=strides,
+                                                       border_mode=border_mode,
+                                                       pool_size=pool_size)
+                layer.input = K.variable(input)
+                for train in [True, False]:
+                    out = K.eval(layer.get_output(train))
+                    if border_mode == 'same' and strides == (1, 1):
+                        assert input.shape == out.shape
+                layer.get_config()
+
+
+@pytest.mark.skipif(K._BACKEND != 'theano', reason="Requires Theano backend")
+def test_convolution_3d():  # sweeps weights/border/stride/regularizer combinations
+    nb_samples = 8
+    nb_filter = 9
+    stack_size = 7
+    len_conv_dim1 = 2
+    len_conv_dim2 = 10
+    len_conv_dim3 = 6
+
+    input_len_dim1 = 10
+    input_len_dim2 = 11
+    input_len_dim3 = 12
+
+    weights_in = [np.ones((nb_filter, stack_size, len_conv_dim1, len_conv_dim2, len_conv_dim3)),
+                  np.ones(nb_filter)]
+
+    input = np.ones((nb_samples, stack_size, input_len_dim1,
+                     input_len_dim2, input_len_dim3))
+    for weight in [None, weights_in]:  # no explicit weights, then all-ones weights
+        for border_mode in ['same', 'valid']:
+            for subsample in [(1, 1, 1), (2, 2, 2)]:
+                if border_mode == 'same' and subsample != (1, 1, 1):
+                    continue  # 'same' is only exercised with subsample (1, 1, 1)
+                for W_regularizer in [None, 'l2']:
+                    for b_regularizer in [None, 'l2']:
+                        for act_regularizer in [None, 'l2']:
+                            layer = convolutional.Convolution3D(
+                                nb_filter, len_conv_dim1, len_conv_dim2, len_conv_dim3,
+                                weights=weight,
+                                border_mode=border_mode,
+                                W_regularizer=W_regularizer,
+                                b_regularizer=b_regularizer,
+                                activity_regularizer=act_regularizer,
+                                subsample=subsample,
+                                input_shape=(stack_size, None, None, None))  # trailing dims left as None
+
+                            layer.input = K.variable(input)
+                            for train in [True, False]:
+                                out = K.eval(layer.get_output(train))
+                                if border_mode == 'same' and subsample == (1, 1, 1):
+                                    assert out.shape[2:] == input.shape[2:]  # dims after the first two unchanged
+                            layer.get_config()
+
+
+@pytest.mark.skipif(K._BACKEND != 'theano', reason="Requires Theano backend")
+def test_maxpooling_3d():
+    """Smoke test: MaxPooling3D evaluates for both stride settings and phases."""
+    # (nb_samples, stack_size, len_dim1, len_dim2, len_dim3)
+    volume_shape = (9, 7, 10, 11, 12)
+    volume = np.ones(volume_shape)
+
+    for strides in [(1, 1, 1), (2, 2, 2)]:
+        pool_kwargs = {'strides': strides,
+                       'border_mode': 'valid',
+                       'pool_size': (3, 3, 3)}
+        layer = convolutional.MaxPooling3D(**pool_kwargs)
+        layer.input = K.variable(volume)
+
+        # only checks that evaluation succeeds; outputs are not inspected
+        for train in [True, False]:
+            K.eval(layer.get_output(train))
+
+        layer.get_config()
+
+
+@pytest.mark.skipif(K._BACKEND != 'theano', reason="Requires Theano backend")
+def test_averagepooling_3d():
+    nb_samples = 9
+    stack_size = 7
+    input_len_dim1 = 10
+    input_len_dim2 = 11
+    input_len_dim3 = 12
+    pool_size = (3, 3, 3)
+
+    input = np.ones((nb_samples, stack_size, input_len_dim1,
+                     input_len_dim2, input_len_dim3))
+    for strides in [(1, 1, 1), (2, 2, 2)]:
+        layer = convolutional.AveragePooling3D(strides=strides,
                                               border_mode='valid',
                                               pool_size=pool_size)
        layer.input = K.variable(input)
@@ -167,6 +311,28 @@ def test_zero_padding_2d():
    layer.get_config()


+@pytest.mark.skipif(K._BACKEND != 'theano', reason="Requires Theano backend")
+def test_zero_padding_3d():
+    """ZeroPadding3D(padding=(2, 2, 2)) frames an all-ones input with zeros."""
+    # (nb_samples, stack_size, len_dim1, len_dim2, len_dim3)
+    ones_volume = np.ones((9, 7, 10, 11, 12))
+
+    layer = convolutional.ZeroPadding3D(padding=(2, 2, 2))
+    layer.input = K.variable(ones_volume)
+
+    padded_axes = (2, 3, 4)
+    border_offsets = [0, 1, -1, -2]  # the two outermost slices on each side
+    for train in [True, False]:
+        out = K.eval(layer.get_output(train))
+        for offset in border_offsets:
+            for axis in padded_axes:
+                assert_allclose(np.take(out, offset, axis=axis), 0.)
+        # the input values must survive untouched inside the zero frame
+        core_slice = (slice(None), slice(None)) + (slice(2, -2),) * 3
+        assert_allclose(out[core_slice], 1.)
+    layer.get_config()
+
+
 def test_upsampling_1d():
    nb_samples = 9
    nb_steps = 7
@@ -198,29 +364,76 @@ def test_upsampling_2d():

        for length_row in [2, 3, 9]:
            for length_col in [2, 3, 9]:
-                    layer = convolutional.UpSampling2D(
-                        size=(length_row, length_col),
+                layer = convolutional.UpSampling2D(
+                    size=(length_row, length_col),
+                    input_shape=input.shape[1:],
+                    dim_ordering=dim_ordering)
+                layer.input = K.variable(input)
+                for train in [True, False]:
+                    out = K.eval(layer.get_output(train))
+                    if dim_ordering == 'th':
+                        assert out.shape[2] == length_row * input_nb_row
+                        assert out.shape[3] == length_col * input_nb_col
+                    else:  # tf
+                        assert out.shape[1] == length_row * input_nb_row
+                        assert out.shape[2] == length_col * input_nb_col
+
+                    # compare with numpy
+                    if dim_ordering == 'th':
+                        expected_out = np.repeat(input, length_row, axis=2)
+                        expected_out = np.repeat(expected_out, length_col, axis=3)
+                    else:  # tf
+                        expected_out = np.repeat(input, length_row, axis=1)
+                        expected_out = np.repeat(expected_out, length_col, axis=2)
+
+                    assert_allclose(out, expected_out)
+
+                layer.get_config()
+
+
+@pytest.mark.skipif(K._BACKEND != 'theano', reason="Requires Theano backend")
+def test_upsampling_3d():
+    nb_samples = 9
+    stack_size = 7
+    input_len_dim1 = 10
+    input_len_dim2 = 11
+    input_len_dim3 = 12
+
+    for dim_ordering in ['th', 'tf']:
+        if dim_ordering == 'th':
+            input = np.random.rand(nb_samples, stack_size, input_len_dim1, input_len_dim2,
+                                   input_len_dim3)
+        else:  # tf
+            input = np.random.rand(nb_samples, input_len_dim1, input_len_dim2, input_len_dim3,
+                                   stack_size)
+        for length_dim1 in [2, 3, 9]:
+            for length_dim2 in [2, 3, 9]:
+                for length_dim3 in [2, 3, 9]:
+                    layer = convolutional.UpSampling3D(
+                        size=(length_dim1, length_dim2, length_dim3),
                        input_shape=input.shape[1:],
                        dim_ordering=dim_ordering)
                    layer.input = K.variable(input)
                    for train in [True, False]:
                        out = K.eval(layer.get_output(train))
                        if dim_ordering == 'th':
-                            assert out.shape[2] == length_row * input_nb_row
-                            assert out.shape[3] == length_col * input_nb_col
+                            assert out.shape[2] == length_dim1 * input_len_dim1
+                            assert out.shape[3] == length_dim2 * input_len_dim2
+                            assert out.shape[4] == length_dim3 * input_len_dim3
                        else:  # tf
-                            assert out.shape[1] == length_row * input_nb_row
-                            assert out.shape[2] == length_col * input_nb_col
+                            assert out.shape[1] == length_dim1 * input_len_dim1
+                            assert out.shape[2] == length_dim2 * input_len_dim2
+                            assert out.shape[3] == length_dim3 * input_len_dim3

                        # compare with numpy
                        if dim_ordering == 'th':
-                            expected_out = np.repeat(input, length_row, axis=2)
-                            expected_out = np.repeat(expected_out, length_col,
-                                                     axis=3)
+                            expected_out = np.repeat(input, length_dim1, axis=2)
+                            expected_out = np.repeat(expected_out, length_dim2, axis=3)
+                            expected_out = np.repeat(expected_out, length_dim3, axis=4)
                        else:  # tf
-                            expected_out = np.repeat(input, length_row, axis=1)
-                            expected_out = np.repeat(expected_out, length_col,
-                                                     axis=2)
+                            expected_out = np.repeat(input, length_dim1, axis=1)
+                            expected_out = np.repeat(expected_out, length_dim2, axis=2)
+                            expected_out = np.repeat(expected_out, length_dim3, axis=3)

                        assert_allclose(out, expected_out)

@@ -175,7 +175,7 @@ def test_naming():
 def test_sequences():
    '''Test masking sequences with zeroes as padding'''
    # integer inputs, one per timestep, like embeddings
-    layer = core.Masking()
+    layer = core.Masking(input_shape=(4, 1))
    func = K.function([layer.get_input(True)], [layer.get_output_mask()])
    input_data = np.array([[[1], [2], [3], [0]],
                           [[0], [4], [5], [0]]], dtype=np.int32)
@@ -190,7 +190,7 @@ def test_sequences():

 def test_non_zero():
    '''Test masking with non-zero mask value'''
-    layer = core.Masking(5)
+    layer = core.Masking(5, input_shape=(4, 2))
    func = K.function([layer.input], [layer.get_output_mask()])
    input_data = np.array([[[1, 1], [2, 1], [3, 1], [5, 5]],
                           [[1, 5], [5, 0], [0, 0], [0, 0]]],
@@ -202,7 +202,7 @@ def test_non_zero():

 def test_non_zero_output():
    '''Test output of masking layer with non-zero mask value'''
-    layer = core.Masking(5)
+    layer = core.Masking(5, input_shape=(4, 2))
    func = K.function([layer.input], [layer.get_output()])

    input_data = np.array([[[1, 1], [2, 1], [3, 1], [5, 5]],
@@ -228,6 +228,7 @@ def _runner(layer):
    layer.trainable = True
    layer.trainable = False

+
 def test_siamese_all():
    right_input_layer = core.Dense(7, input_dim=3)
    left_input_layer = core.Dense(7, input_dim=3)
@@ -238,6 +239,7 @@ def test_siamese_all():
        siamese_layer.output_shape
        siamese_layer.get_output()

+
 @pytest.mark.skipif(K._BACKEND == 'tensorflow',
                    reason='currently not working with TensorFlow')
 def test_siamese_theano_only():
@@ -5,6 +5,7 @@ from numpy.testing import assert_allclose
 from keras.layers import recurrent, embeddings
 from keras.models import Sequential
 from keras.layers.core import Masking
+from keras import regularizers

 from keras import backend as K
 from keras.models import Sequential, model_from_json
@@ -34,6 +35,24 @@ def _runner(layer_class):

            mask = layer.get_output_mask(train)

+    # check dropout
+    for ret_seq in [True, False]:
+        layer = layer_class(output_dim, return_sequences=ret_seq, weights=None, 
+                            batch_input_shape=(nb_samples, timesteps, embedding_dim),
+                            dropout_W=0.5, dropout_U=0.5)
+        layer.input = K.variable(np.ones((nb_samples, timesteps, embedding_dim)))
+        layer.get_config()
+
+        for train in [True, False]:
+            out = K.eval(layer.get_output(train))
+            # Make sure the output has the desired shape
+            if ret_seq:
+                assert(out.shape == (nb_samples, timesteps, output_dim))
+            else:
+                assert(out.shape == (nb_samples, output_dim))
+
+            mask = layer.get_output_mask(train)
+
    # check statefulness
    model = Sequential()
    model.add(embeddings.Embedding(embedding_num, embedding_dim,
@@ -90,6 +109,15 @@ def _runner(layer_class):

    assert_allclose(out7, out6, atol=1e-5)

+    # check regularizers
+    layer = layer_class(output_dim, return_sequences=ret_seq, weights=None,
+                        batch_input_shape=(nb_samples, timesteps, embedding_dim),
+                        W_regularizer=regularizers.WeightRegularizer(l1=0.01),
+                        U_regularizer=regularizers.WeightRegularizer(l1=0.01),
+                        b_regularizer='l2')
+    layer.input = K.variable(np.ones((nb_samples, timesteps, embedding_dim)))
+    out = K.eval(layer.get_output(train=True))
+

 def test_SimpleRNN():
    _runner(recurrent.SimpleRNN)
@@ -0,0 +1,64 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_allclose
+
+from keras.layers import wrappers
+from keras.layers import core, convolutional
+from keras.models import Sequential, model_from_json
+
+
+def test_TimeDistributed():
+    # first, wrap a Dense layer and fit it for one epoch
+    model = Sequential()
+    model.add(wrappers.TimeDistributed(core.Dense(2), input_shape=(3, 4)))
+    model.add(core.Activation('relu'))
+    model.compile(optimizer='rmsprop', loss='mse')
+    model.fit(np.random.random((10, 3, 4)), np.random.random((10, 3, 2)), nb_epoch=1, batch_size=10)
+
+    # test config (the returned value is not inspected)
+    model.get_config()
+
+    # compare to TimeDistributedDense loaded with the wrapper's weights
+    test_input = np.random.random((1, 3, 4))
+    test_output = model.predict(test_input)
+    weights = model.layers[0].get_weights()
+
+    reference = Sequential()
+    reference.add(core.TimeDistributedDense(2, input_shape=(3, 4), weights=weights))
+    reference.add(core.Activation('relu'))
+    reference.compile(optimizer='rmsprop', loss='mse')
+
+    reference_output = reference.predict(test_input)
+    assert_allclose(test_output, reference_output, atol=1e-05)
+
+    # same comparison, with the reference built from a batch_input_shape
+    reference = Sequential()
+    reference.add(core.TimeDistributedDense(2, batch_input_shape=(1, 3, 4), weights=weights))
+    reference.add(core.Activation('relu'))
+    reference.compile(optimizer='rmsprop', loss='mse')
+
+    reference_output = reference.predict(test_input)
+    assert_allclose(test_output, reference_output, atol=1e-05)
+
+    # test with Convolution2D
+    model = Sequential()
+    model.add(wrappers.TimeDistributed(convolutional.Convolution2D(5, 2, 2, border_mode='same'), input_shape=(2, 3, 4, 4)))
+    model.add(core.Activation('relu'))
+    model.compile(optimizer='rmsprop', loss='mse')
+    model.train_on_batch(np.random.random((1, 2, 3, 4, 4)), np.random.random((1, 2, 5, 4, 4)))
+
+    model = model_from_json(model.to_json())  # rebuild the model from its JSON form
+    model.summary()
+
+    # test stacked layers
+    model = Sequential()
+    model.add(wrappers.TimeDistributed(core.Dense(2), input_shape=(3, 4)))
+    model.add(wrappers.TimeDistributed(core.Dense(3)))
+    model.add(core.Activation('relu'))
+    model.compile(optimizer='rmsprop', loss='mse')
+
+    model.fit(np.random.random((10, 3, 4)), np.random.random((10, 3, 3)), nb_epoch=1, batch_size=10)
+
+
+if __name__ == '__main__':
+    pytest.main([__file__])
@@ -1,59 +1,59 @@
-# import pytest
-# from keras.preprocessing.image import *
-# from PIL import Image
-# import numpy as np
-# import os
-# import shutil
+import pytest
+from keras.preprocessing.image import *
+from PIL import Image
+import numpy as np
+import os
+import shutil


-# def setup_function(func):
-#     os.mkdir('test_images')
-#     os.mkdir('test_images/rgb')
-#     os.mkdir('test_images/gsc')
+def setup_function(func):
+    os.mkdir('test_images')
+    os.mkdir('test_images/rgb')
+    os.mkdir('test_images/gsc')

-#     img_w = img_h = 20
-#     for n in range(8):
-#         bias = np.random.rand(img_w, img_h, 1) * 64
-#         variance = np.random.rand(img_w, img_h, 1) * (255-64)
-#         imarray = np.random.rand(img_w, img_h, 3) * variance + bias
-#         im = Image.fromarray(imarray.astype('uint8')).convert('RGBA')
-#         im.save('test_images/rgb/rgb_test_image_'+str(n)+'.png')
+    img_w = img_h = 20
+    for n in range(8):
+        bias = np.random.rand(img_w, img_h, 1) * 64
+        variance = np.random.rand(img_w, img_h, 1) * (255-64)
+        imarray = np.random.rand(img_w, img_h, 3) * variance + bias
+        im = Image.fromarray(imarray.astype('uint8')).convert('RGBA')
+        im.save('test_images/rgb/rgb_test_image_'+str(n)+'.png')

-#         imarray = np.random.rand(img_w, img_h, 1) * variance + bias
-#         im = Image.fromarray(imarray.astype('uint8').squeeze()).convert('L')
-#         im.save('test_images/gsc/gsc_test_image_'+str(n)+'.png')
+        imarray = np.random.rand(img_w, img_h, 1) * variance + bias
+        im = Image.fromarray(imarray.astype('uint8').squeeze()).convert('L')
+        im.save('test_images/gsc/gsc_test_image_'+str(n)+'.png')


-# def teardown_function(func):
-#     shutil.rmtree('test_images')
+def teardown_function(func):
+    shutil.rmtree('test_images')


-# def test_image_data_generator():
-#     for color_mode in ['gsc', 'rgb']:
-#         file_list = list_pictures('test_images/' + color_mode)
-#         img_list = []
-#         for f in file_list:
-#             img_list.append(img_to_array(load_img(f))[None, ...])
+def test_image_data_generator():
+    for color_mode in ['gsc', 'rgb']:
+        file_list = list_pictures('test_images/' + color_mode)
+        img_list = []
+        for f in file_list:
+            img_list.append(img_to_array(load_img(f))[None, ...])

-#         images = np.vstack(img_list)
-#         generator = ImageDataGenerator(
-#             featurewise_center=True,
-#             samplewise_center=True,
-#             featurewise_std_normalization=True,
-#             samplewise_std_normalization=True,
-#             zca_whitening=True,
-#             rotation_range=90.,
-#             width_shift_range=10.,
-#             height_shift_range=10.,
-#             shear_range=0.5,
-#             horizontal_flip=True,
-#             vertical_flip=True)
-#         generator.fit(images, augment=True)
+        images = np.vstack(img_list)
+        generator = ImageDataGenerator(
+            featurewise_center=True,
+            samplewise_center=True,
+            featurewise_std_normalization=True,
+            samplewise_std_normalization=True,
+            zca_whitening=True,
+            rotation_range=90.,
+            width_shift_range=10.,
+            height_shift_range=10.,
+            shear_range=0.5,
+            horizontal_flip=True,
+            vertical_flip=True)
+        generator.fit(images, augment=True)

-#         for x, y in generator.flow(images, np.arange(images.shape[0]),
-#                                    shuffle=True, save_to_dir='test_images'):
-#             assert x.shape[1:] == images.shape[1:]
-#             break
+        for x, y in generator.flow(images, np.arange(images.shape[0]),
+                                   shuffle=True, save_to_dir='test_images'):
+            assert x.shape[1:] == images.shape[1:]
+            break

-# if __name__ == '__main__':
-#     pytest.main([__file__])
+if __name__ == '__main__':
+    pytest.main([__file__])
@@ -28,6 +28,39 @@ def test_pad_sequences():
    assert_allclose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])


+def test_pad_sequences_vector():
+    """pad_sequences on sequences whose timesteps are 2-element vectors."""
+    seqs = [[[1, 1]],
+            [[2, 1], [2, 2]],
+            [[3, 1], [3, 2], [3, 3]]]
+
+    # padding side
+    padded = pad_sequences(seqs, maxlen=3, padding='pre')
+    assert_allclose(padded, [[[0, 0], [0, 0], [1, 1]],
+                             [[0, 0], [2, 1], [2, 2]],
+                             [[3, 1], [3, 2], [3, 3]]])
+    padded = pad_sequences(seqs, maxlen=3, padding='post')
+    assert_allclose(padded, [[[1, 1], [0, 0], [0, 0]],
+                             [[2, 1], [2, 2], [0, 0]],
+                             [[3, 1], [3, 2], [3, 3]]])
+
+    # truncation side (the shortest sequence still gets padded to maxlen)
+    padded = pad_sequences(seqs, maxlen=2, truncating='pre')
+    assert_allclose(padded, [[[0, 0], [1, 1]],
+                             [[2, 1], [2, 2]],
+                             [[3, 2], [3, 3]]])
+    padded = pad_sequences(seqs, maxlen=2, truncating='post')
+    assert_allclose(padded, [[[0, 0], [1, 1]],
+                             [[2, 1], [2, 2]],
+                             [[3, 1], [3, 2]]])
+
+    # custom fill value
+    padded = pad_sequences(seqs, maxlen=3, value=1)
+    assert_allclose(padded, [[[1, 1], [1, 1], [1, 1]],
+                             [[1, 1], [2, 1], [2, 2]],
+                             [[3, 1], [3, 2], [3, 3]]])
+
+
 def test_make_sampling_table():
    a = make_sampling_table(3)
    assert_allclose(a, np.asarray([0.00315225,  0.00315225,  0.00547597]),
@@ -127,7 +127,7 @@ def test_TensorBoard():
    import shutil
    import tensorflow as tf
    import keras.backend.tensorflow_backend as KTF
-    old_session = KTF._get_session()
+    old_session = KTF.get_session()
    filepath = './logs'
    (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=train_samples,
                                                         nb_test=test_samples,
@@ -162,7 +162,7 @@ def test_TensorBoard():

    with tf.Graph().as_default():
        session = tf.Session('')
-        KTF._set_session(session)
+        KTF.set_session(session)
        model = Sequential()
        model.add(Dense(nb_hidden, input_dim=input_dim, activation='relu'))
        model.add(Dense(nb_class, activation='softmax'))
@@ -208,7 +208,7 @@ def test_TensorBoard():

    with tf.Graph().as_default():
        session = tf.Session('')
-        KTF._set_session(session)
+        KTF.set_session(session)
        model = Graph()
        model.add_input(name='X_vars', input_shape=(input_dim, ))

@@ -246,7 +246,7 @@ def test_TensorBoard():
        assert os.path.exists(filepath)
        shutil.rmtree(filepath)

-    KTF._set_session(old_session)
+    KTF.set_session(old_session)

 if __name__ == '__main__':
    pytest.main([__file__])
@@ -0,0 +1,425 @@
+from __future__ import absolute_import
+from __future__ import print_function
+import pytest
+import os
+import numpy as np
+np.random.seed(1337)
+
+from keras import backend as K
+from keras.models import Graph, Sequential, model_from_json, model_from_yaml
+from keras.layers.core import Dense, Activation, Merge, Lambda, LambdaMerge, Siamese, add_shared_layer
+from keras.layers import containers
+from keras.utils.test_utils import get_test_data
+
+
+batch_size = 32  # only used below to size nb_val_samples for generator-based validation
+
+(X_train_graph, y_train_graph), (X_test_graph, y_test_graph) = get_test_data(nb_train=1000,
+                                                                             nb_test=200,
+                                                                             input_shape=(32,),
+                                                                             classification=False,
+                                                                             output_shape=(4,))  # fed as 'input1' / 'output1' in the tests below
+(X2_train_graph, y2_train_graph), (X2_test_graph, y2_test_graph) = get_test_data(nb_train=1000,
+                                                                                 nb_test=200,
+                                                                                 input_shape=(32,),
+                                                                                 classification=False,
+                                                                                 output_shape=(1,))  # fed as 'input2' / 'output2' in the tests below
+
+
+def test_graph_fit_generator():  # smoke test for Graph.fit_generator / evaluate_generator
+    def data_generator_graph(train):  # train=True yields the training arrays, else the test arrays
+        while 1:  # never exhausted; every item is the whole dataset in a single dict
+            if train:
+                yield {'input1': X_train_graph, 'output1': y_train_graph}
+            else:
+                yield {'input1': X_test_graph, 'output1': y_test_graph}
+
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(4), name='dense3', input='dense1')
+
+    graph.add_output(name='output1',
+                     inputs=['dense2', 'dense3'],
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4)  # no validation
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data={'input1': X_test_graph, 'output1': y_test_graph})  # validation data as a dict
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data=data_generator_graph(False), nb_val_samples=batch_size * 3)  # validation data from a generator
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data=data_generator_graph(False), nb_val_samples=batch_size * 3)  # NOTE(review): identical to the previous call; possibly an unintended duplicate
+    gen_loss = graph.evaluate_generator(data_generator_graph(True), 128, verbose=0)
+    assert(gen_loss < 3.)
+
+    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph}, verbose=0)
+    assert(loss < 3.)
+
+    # repeat the same sequence of calls with show_accuracy=True
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4, show_accuracy=True)
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data={'input1': X_test_graph, 'output1': y_test_graph}, show_accuracy=True)
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data=data_generator_graph(False), nb_val_samples=batch_size * 3, show_accuracy=True)
+    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4,
+                        validation_data=data_generator_graph(False), nb_val_samples=batch_size * 3, show_accuracy=True)  # NOTE(review): identical to the previous call
+    gen_loss = graph.evaluate_generator(data_generator_graph(True), 128, verbose=0, show_accuracy=True)  # only checks that the call runs; the result is not asserted
+
+
+def test_1o_1i():
+    """Train, predict and evaluate a non-sequential Graph with 1 input and 1 output."""
+    np.random.seed(1337)
+
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(4), name='dense3', input='dense1')
+
+    graph.add_output(name='output1',
+                     inputs=['dense2', 'dense3'],
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph}, verbose=0)
+    assert(loss < 2.5)
+
+    # with accuracy reporting enabled, the batch/evaluate calls return a (loss, acc) pair
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              nb_epoch=1, show_accuracy=True)
+    loss, acc = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph}, accuracy=True)
+    loss, acc = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph}, accuracy=True)
+    loss, acc = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph}, verbose=0, show_accuracy=True)
+
+    # test validation split
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              validation_split=0.2, nb_epoch=1)
+    # test validation data
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              validation_data={'input1': X_train_graph, 'output1': y_train_graph},
+              nb_epoch=1)
+
+
+def test_1o_1i_2():
+    """Train and evaluate a more complex non-sequential Graph with 1 input and 1 output."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2-0', input='input1')
+    graph.add_node(Activation('relu'), name='dense2', input='dense2-0')
+
+    graph.add_node(Dense(16), name='dense3', input='dense2')
+    graph.add_node(Dense(4), name='dense4', inputs=['dense1', 'dense3'],
+                   merge_mode='sum')
+
+    graph.add_output(name='output1', inputs=['dense2', 'dense4'],
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              nb_epoch=10)
+    out = graph.predict({'input1': X_train_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph})
+    assert(loss < 2.5)
+
+    graph.get_config(verbose=1)
+    graph.summary()
+
+
+def test_1o_2i():
+    """Train and evaluate a non-sequential Graph with 2 inputs and 1 output."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_input(name='input2', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input2')
+    graph.add_node(Dense(4), name='dense3', input='dense1')
+
+    graph.add_output(name='output1', inputs=['dense2', 'dense3'],
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph},
+              nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    assert(loss < 3.0)
+
+    graph.get_config(verbose=1)
+
+
+def test_siamese_3():
+    """Train and evaluate a Graph with one shared Dense node over two inputs (merge_mode='sum')."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_input(name='input2', input_shape=(32,))
+
+    graph.add_shared_node(Dense(16), name='shared', inputs=['input1', 'input2'], merge_mode='sum')
+    graph.add_node(Dense(4), name='dense1', input='shared')
+    graph.add_node(Dense(4), name='dense2', input='dense1')
+
+    graph.add_output(name='output1', input='dense2')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph}, nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    assert(loss < 3.0)
+
+    graph.get_config(verbose=1)
+
+
+def test_siamese_4():
+    """Train and evaluate a Graph chaining three shared nodes, the last one with merge_mode='sum'."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_input(name='input2', input_shape=(32,))
+
+    graph.add_shared_node(Dense(16), name='shared1', inputs=['input1', 'input2'])
+    graph.add_shared_node(Dense(4), name='shared2', inputs=['shared1'])
+    graph.add_shared_node(Dense(4), name='shared3', inputs=['shared2'], merge_mode='sum')
+    graph.add_node(Dense(4), name='dense', input='shared3')
+
+    graph.add_output(name='output1', input='dense',
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph}, nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    assert(loss < 3.0)
+
+    graph.get_config(verbose=1)
+
+
+def test_siamese_5():
+    """Train and evaluate a Graph whose last shared node exposes two named outputs."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_input(name='input2', input_shape=(32,))
+
+    graph.add_shared_node(Dense(16), name='shared1', inputs=['input1', 'input2'])
+    graph.add_shared_node(Dense(4), name='shared2', inputs=['shared1'])
+    graph.add_shared_node(Dense(4), name='shared3', inputs=['shared2'], outputs=['shared_output1', 'shared_output2'])
+    graph.add_node(Dense(4), name='dense1', input='shared_output1')
+    graph.add_node(Dense(4), name='dense2', input='shared_output2')
+
+    graph.add_output(name='output1', inputs=['dense1', 'dense2'],
+                     merge_mode='sum')
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph}, nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
+    assert(loss < 3.0)
+
+    graph.get_config(verbose=1)
+
+
+def test_2o_1i_weights():
+    """Test a 1-input, 2-output Graph, a weight save/load round trip, and loss_weights."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(1), name='dense3', input='dense1')
+
+    graph.add_output(name='output1', input='dense2')
+    graph.add_output(name='output2', input='dense3')
+    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
+              nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 2)
+    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
+    assert(loss < 4.)
+
+    # test weight saving: rebuild the same architecture and reload the saved weights
+    fname = 'test_2o_1i_weights_temp.h5'
+    graph.save_weights(fname, overwrite=True)
+
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(1), name='dense3', input='dense1')
+    graph.add_output(name='output1', input='dense2')
+    graph.add_output(name='output2', input='dense3')
+    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
+    graph.load_weights(fname)  # same path that was saved and is removed below
+    os.remove(fname)
+
+    nloss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
+    assert(loss == nloss)  # the reloaded model must reproduce the loss exactly
+
+    # test loss weights
+    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'},
+                  loss_weights={'output1': 1., 'output2': 2.})
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
+              nb_epoch=1)
+
+
+def test_2o_1i_sample_weights():
+    """Test a 1-input, 2-output Graph trained and evaluated with per-output sample weights."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(1), name='dense3', input='dense1')
+
+    graph.add_output(name='output1', input='dense2')
+    graph.add_output(name='output2', input='dense3')
+
+    weights1 = np.random.uniform(size=y_train_graph.shape[0])  # one weight per sample
+    weights2 = np.random.uniform(size=y2_train_graph.shape[0])
+    weights1_test = np.random.uniform(size=y_test_graph.shape[0])
+    weights2_test = np.random.uniform(size=y2_test_graph.shape[0])
+
+    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
+              nb_epoch=10,
+              sample_weight={'output1': weights1, 'output2': weights2})
+    out = graph.predict({'input1': X_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 2)
+    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph},
+                               sample_weight={'output1': weights1_test, 'output2': weights2_test})
+    loss = graph.train_on_batch({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
+                                sample_weight={'output1': weights1, 'output2': weights2})
+    loss = graph.evaluate({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
+                          sample_weight={'output1': weights1, 'output2': weights2})
+
+
+def test_recursive():
+    """Test the layer-like API: a containers.Graph added as a layer of a Sequential model."""
+
+    graph = containers.Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(4), name='dense3', input='dense1')
+    graph.add_output(name='output1', inputs=['dense2', 'dense3'],
+                     merge_mode='sum')
+
+    seq = Sequential()
+    seq.add(Dense(32, input_shape=(32,)))
+    seq.add(graph)  # the Graph container is added like any other layer
+    seq.add(Dense(4))
+
+    seq.compile('rmsprop', 'mse')
+
+    seq.fit(X_train_graph, y_train_graph, batch_size=10, nb_epoch=10)
+    loss = seq.evaluate(X_test_graph, y_test_graph)
+    assert(loss < 2.5)
+
+    loss = seq.evaluate(X_test_graph, y_test_graph, show_accuracy=True)  # only checks that the call runs; the result is not asserted
+    seq.predict(X_test_graph)
+    seq.get_config(verbose=1)
+
+
+def test_create_output():
+    """Test add_node(..., create_output=True), then a to_json / model_from_json round trip."""
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+
+    graph.add_node(Dense(16), name='dense1', input='input1')
+    graph.add_node(Dense(4), name='dense2', input='input1')
+    graph.add_node(Dense(4), name='dense3', input='dense1')
+    graph.add_node(Dense(4), name='output1', inputs=['dense2', 'dense3'],
+                   merge_mode='sum', create_output=True)
+    graph.compile('rmsprop', {'output1': 'mse'})
+
+    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
+              nb_epoch=10)
+    out = graph.predict({'input1': X_test_graph})
+    assert isinstance(out, dict)  # the old type(out == dict) check was always truthy
+    assert(len(out) == 1)
+
+    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
+    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph})
+    assert(loss < 2.5)
+
+    # test serialization
+    config = graph.to_json()
+    del graph
+    graph = model_from_json(config)
+
+
+def test_count_params():
+    """Check Graph.count_params() against a hand-computed total, before and after compile."""
+
+    nb_units = 100
+    nb_classes = 2
+
+    graph = Graph()
+    graph.add_input(name='input1', input_shape=(32,))
+    graph.add_input(name='input2', input_shape=(32,))
+    graph.add_node(Dense(nb_units),
+                   name='dense1', input='input1')
+    graph.add_node(Dense(nb_classes),
+                   name='dense2', input='input2')
+    graph.add_node(Dense(nb_classes),
+                   name='dense3', input='dense1')
+    graph.add_output(name='output', inputs=['dense2', 'dense3'],
+                     merge_mode='sum')
+
+    n = 32 * nb_units + nb_units  # dense1: 32 inputs -> nb_units (weights + biases)
+    n += 32 * nb_classes + nb_classes  # dense2: 32 inputs -> nb_classes (weights + biases)
+    n += nb_units * nb_classes + nb_classes  # dense3: nb_units -> nb_classes (weights + biases)
+
+    assert(n == graph.count_params())
+
+    graph.compile('rmsprop', {'output': 'binary_crossentropy'})
+
+    assert(n == graph.count_params())  # compiling must not change the count
+
+
+if __name__ == '__main__':  # when executed as a script, run this file's tests through pytest
+    pytest.main([__file__])
@@ -1,3 +1,4 @@
+import pytest
 import numpy as np

 from keras import objectives
@@ -29,3 +30,7 @@ def test_objective_shapes_2d():
    for obj in allobj:
        objective_output = obj(y_a, y_b)
        assert K.eval(objective_output).shape == (6,)
+
+
+if __name__ == "__main__":  # when executed as a script, run this file's tests through pytest
+    pytest.main([__file__])
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import pytest
+import os
 import numpy as np
 np.random.seed(1337)

@@ -11,8 +12,6 @@ from keras.layers import containers
 from keras.utils import np_utils
 from keras.utils.test_utils import get_test_data

-import os
-

 input_dim = 32
 nb_hidden = 16
@@ -37,10 +36,6 @@ def _get_test_data():
    return (X_train, y_train), (X_test, y_test)


-####################
-# SEQUENTIAL TEST  #
-####################
-
 def test_sequential_fit_generator():
    (X_train, y_train), (X_test, y_test) = _get_test_data()

@@ -69,6 +64,10 @@ def test_sequential_fit_generator():
    model.fit_generator(data_generator(True), len(X_train), nb_epoch, show_accuracy=True)
    model.fit_generator(data_generator(True), len(X_train), nb_epoch, show_accuracy=False, validation_data=(X_test, y_test))
    model.fit_generator(data_generator(True), len(X_train), nb_epoch, show_accuracy=True, validation_data=(X_test, y_test))
+    model.fit_generator(data_generator(True), len(X_train), nb_epoch, show_accuracy=False,
+                        validation_data=data_generator(False), nb_val_samples=batch_size * 3)
+    model.fit_generator(data_generator(True), len(X_train), nb_epoch, show_accuracy=True,
+                        validation_data=data_generator(False), nb_val_samples=batch_size * 3)

    loss = model.evaluate(X_train, y_train, verbose=0)
    assert(loss < 0.9)
@@ -77,6 +76,21 @@ def test_sequential_fit_generator():
 def test_sequential():
    (X_train, y_train), (X_test, y_test) = _get_test_data()

+    # TODO: factor out (presumably duplicates the data_generator used by test_sequential_fit_generator; confirm)
+    def data_generator(train):  # endlessly yields (X, y) batches of batch_size from the train or test split
+        if train:
+            max_batch_index = len(X_train) // batch_size
+        else:
+            max_batch_index = len(X_test) // batch_size
+        i = 0
+        while 1:
+            if train:
+                yield (X_train[i * batch_size: (i + 1) * batch_size], y_train[i * batch_size: (i + 1) * batch_size])
+            else:
+                yield (X_test[i * batch_size: (i + 1) * batch_size], y_test[i * batch_size: (i + 1) * batch_size])
+            i += 1
+            i = i % max_batch_index  # wrap to the first batch; only full batches are yielded
+
    model = Sequential()
    model.add(Dense(nb_hidden, input_shape=(input_dim,)))
    model.add(Activation('relu'))
@@ -94,6 +108,66 @@ def test_sequential():

    model.train_on_batch(X_train[:32], y_train[:32])

+    gen_loss = model.evaluate_generator(data_generator(True), 256, verbose=0)
+    assert(gen_loss < 0.8)
+
+    loss = model.evaluate(X_test, y_test, verbose=0)
+    assert(loss < 0.8)
+
+    model.predict(X_test, verbose=0)
+    model.predict_classes(X_test, verbose=0)
+    model.predict_proba(X_test, verbose=0)
+    model.get_config(verbose=0)
+
+    fname = 'test_sequential_temp.h5'
+    model.save_weights(fname, overwrite=True)
+    model = Sequential()
+    model.add(Dense(nb_hidden, input_shape=(input_dim,)))
+    model.add(Activation('relu'))
+    model.add(Dense(nb_class))
+    model.add(Activation('softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+    model.load_weights(fname)
+    os.remove(fname)
+
+    nloss = model.evaluate(X_test, y_test, verbose=0)
+    assert(loss == nloss)
+
+    # test json serialization
+    json_data = model.to_json()
+    model = model_from_json(json_data)
+
+    # test yaml serialization
+    yaml_data = model.to_yaml()
+    model = model_from_yaml(yaml_data)
+
+
+def test_nested_sequential():
+    (X_train, y_train), (X_test, y_test) = _get_test_data()
+
+    inner = Sequential()
+    inner.add(Dense(nb_hidden, input_shape=(input_dim,)))
+    inner.add(Activation('relu'))
+    inner.add(Dense(nb_class))
+
+    middle = Sequential()
+    middle.add(inner)
+
+    model = Sequential()
+    model.add(middle)
+    model.add(Activation('softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+    model.summary()
+
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True, verbose=1, validation_data=(X_test, y_test))
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=False, verbose=2, validation_data=(X_test, y_test))
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True, verbose=2, validation_split=0.1)
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=False, verbose=1, validation_split=0.1)
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=0)
+    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1, shuffle=False)
+
+    model.train_on_batch(X_train[:32], y_train[:32])
+
    loss = model.evaluate(X_test, y_test, verbose=0)
    assert(loss < 0.8)

@@ -102,12 +176,19 @@ def test_sequential():
    model.predict_proba(X_test, verbose=0)
    model.get_config(verbose=0)

-    fname = 'test_sequential_temp.h5'
+    fname = 'test_nested_sequential_temp.h5'
    model.save_weights(fname, overwrite=True)
+
+    inner = Sequential()
+    inner.add(Dense(nb_hidden, input_shape=(input_dim,)))
+    inner.add(Activation('relu'))
+    inner.add(Dense(nb_class))
+
+    middle = Sequential()
+    middle.add(inner)
+
    model = Sequential()
-    model.add(Dense(nb_hidden, input_shape=(input_dim,)))
-    model.add(Activation('relu'))
-    model.add(Dense(nb_class))
+    model.add(middle)
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.load_weights(fname)
@@ -430,7 +511,7 @@ def test_lambda():
    g = Graph()
    g.add_input(name='input_a', input_shape=(2,))
    g.add_input(name='input_b', input_shape=(2,))
-    g.add_node(Lambda(difference),
+    g.add_node(Lambda(difference, output_shape=(2,)),
               inputs=['input_a', 'input_b'],
               merge_mode='join',
               name='d')
@@ -582,384 +663,5 @@ def test_siamese_2():
    assert(loss == nloss)


-###############
-# GRAPH TEST  #
-###############
-
-(X_train_graph, y_train_graph), (X_test_graph, y_test_graph) = get_test_data(nb_train=1000,
-                                                                             nb_test=200,
-                                                                             input_shape=(32,),
-                                                                             classification=False,
-                                                                             output_shape=(4,))
-(X2_train_graph, y2_train_graph), (X2_test_graph, y2_test_graph) = get_test_data(nb_train=1000,
-                                                                                 nb_test=200,
-                                                                                 input_shape=(32,),
-                                                                                 classification=False,
-                                                                                 output_shape=(1,))
-
-
-def test_graph_fit_generator():
-    def data_generator_graph(train):
-        while 1:
-            if train:
-                yield {'input1': X_train_graph, 'output1': y_train_graph}
-            else:
-                yield {'input1': X_test_graph, 'output1': y_test_graph}
-
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(4), name='dense3', input='dense1')
-
-    graph.add_output(name='output1',
-                     inputs=['dense2', 'dense3'],
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4)
-    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4)
-    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4, validation_data={'input1': X_test_graph, 'output1': y_test_graph})
-    graph.fit_generator(data_generator_graph(True), 1000, nb_epoch=4, validation_data={'input1': X_test_graph, 'output1': y_test_graph})
-
-    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph}, verbose=0)
-    assert(loss < 3.)
-
-
-def test_1o_1i():
-    # test a non-sequential graph with 1 input and 1 output
-    np.random.seed(1337)
-
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(4), name='dense3', input='dense1')
-
-    graph.add_output(name='output1',
-                     inputs=['dense2', 'dense3'],
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph}, verbose=0)
-    assert(loss < 2.5)
-
-    # test validation split
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
-              validation_split=0.2, nb_epoch=1)
-    # test validation data
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
-              validation_data={'input1': X_train_graph, 'output1': y_train_graph},
-              nb_epoch=1)
-
-
-def test_1o_1i_2():
-    # test a more complex non-sequential graph with 1 input and 1 output
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2-0', input='input1')
-    graph.add_node(Activation('relu'), name='dense2', input='dense2-0')
-
-    graph.add_node(Dense(16), name='dense3', input='dense2')
-    graph.add_node(Dense(4), name='dense4', inputs=['dense1', 'dense3'],
-                   merge_mode='sum')
-
-    graph.add_output(name='output1', inputs=['dense2', 'dense4'],
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_train_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph})
-    assert(loss < 2.5)
-
-    graph.get_config(verbose=1)
-    graph.summary()
-
-
-def test_1o_2i():
-    # test a non-sequential graph with 2 inputs and 1 output
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_input(name='input2', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input2')
-    graph.add_node(Dense(4), name='dense3', input='dense1')
-
-    graph.add_output(name='output1', inputs=['dense2', 'dense3'],
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    assert(loss < 3.0)
-
-    graph.get_config(verbose=1)
-
-
-def test_siamese_3():
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_input(name='input2', input_shape=(32,))
-
-    graph.add_shared_node(Dense(16), name='shared', inputs=['input1', 'input2'], merge_mode='sum')
-    graph.add_node(Dense(4), name='dense1', input='shared')
-    graph.add_node(Dense(4), name='dense2', input='dense1')
-
-    graph.add_output(name='output1', input='dense2')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    assert(loss < 3.0)
-
-    graph.get_config(verbose=1)
-
-
-def test_siamese_4():
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_input(name='input2', input_shape=(32,))
-
-    graph.add_shared_node(Dense(16), name='shared1', inputs=['input1', 'input2'])
-    graph.add_shared_node(Dense(4), name='shared2', inputs=['shared1'])
-    graph.add_shared_node(Dense(4), name='shared3', inputs=['shared2'], merge_mode='sum')
-    graph.add_node(Dense(4), name='dense', input='shared3')
-
-    graph.add_output(name='output1', input='dense',
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    assert(loss < 3.0)
-
-    graph.get_config(verbose=1)
-
-
-def test_siamese_5():
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_input(name='input2', input_shape=(32,))
-
-    graph.add_shared_node(Dense(16), name='shared1', inputs=['input1', 'input2'])
-    graph.add_shared_node(Dense(4), name='shared2', inputs=['shared1'])
-    graph.add_shared_node(Dense(4), name='shared3', inputs=['shared2'], outputs=['shared_output1','shared_output2'])
-    graph.add_node(Dense(4), name='dense1',  input='shared_output1')
-    graph.add_node(Dense(4), name='dense2',  input='shared_output2')
-
-    graph.add_output(name='output1', inputs=['dense1', 'dense2'],
-                     merge_mode='sum')
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'input2': X2_train_graph, 'output1': y_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph, 'input2': X2_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'input2': X2_test_graph, 'output1': y_test_graph})
-    assert(loss < 3.0)
-
-    graph.get_config(verbose=1)
-
-
-def test_2o_1i_weights():
-    # test a non-sequential graph with 1 input and 2 outputs
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(1), name='dense3', input='dense1')
-
-    graph.add_output(name='output1', input='dense2')
-    graph.add_output(name='output2', input='dense3')
-    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
-              nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 2)
-    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
-    assert(loss < 4.)
-
-    # test weight saving
-    fname = 'test_2o_1i_weights_temp.h5'
-    graph.save_weights(fname, overwrite=True)
-
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(1), name='dense3', input='dense1')
-    graph.add_output(name='output1', input='dense2')
-    graph.add_output(name='output2', input='dense3')
-    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
-    graph.load_weights('test_2o_1i_weights_temp.h5')
-    os.remove(fname)
-
-    nloss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph})
-    assert(loss == nloss)
-
-
-def test_2o_1i_sample_weights():
-    # test a non-sequential graph with 1 input and 2 outputs with sample weights
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(1), name='dense3', input='dense1')
-
-    graph.add_output(name='output1', input='dense2')
-    graph.add_output(name='output2', input='dense3')
-
-    weights1 = np.random.uniform(size=y_train_graph.shape[0])
-    weights2 = np.random.uniform(size=y2_train_graph.shape[0])
-    weights1_test = np.random.uniform(size=y_test_graph.shape[0])
-    weights2_test = np.random.uniform(size=y2_test_graph.shape[0])
-
-    graph.compile('rmsprop', {'output1': 'mse', 'output2': 'mse'})
-
-    graph.fit({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
-              nb_epoch=10,
-              sample_weight={'output1': weights1, 'output2': weights2})
-    out = graph.predict({'input1': X_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 2)
-    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph, 'output2': y2_test_graph},
-                               sample_weight={'output1': weights1_test, 'output2': weights2_test})
-    loss = graph.train_on_batch({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
-                                sample_weight={'output1': weights1, 'output2': weights2})
-    loss = graph.evaluate({'input1': X_train_graph, 'output1': y_train_graph, 'output2': y2_train_graph},
-                          sample_weight={'output1': weights1, 'output2': weights2})
-
-
-def test_recursive():
-    # test layer-like API
-
-    graph = containers.Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(4), name='dense3', input='dense1')
-    graph.add_output(name='output1', inputs=['dense2', 'dense3'],
-                     merge_mode='sum')
-
-    seq = Sequential()
-    seq.add(Dense(32, input_shape=(32,)))
-    seq.add(graph)
-    seq.add(Dense(4))
-
-    seq.compile('rmsprop', 'mse')
-
-    seq.fit(X_train_graph, y_train_graph, batch_size=10, nb_epoch=10)
-    loss = seq.evaluate(X_test_graph, y_test_graph)
-    assert(loss < 2.5)
-
-    loss = seq.evaluate(X_test_graph, y_test_graph, show_accuracy=True)
-    seq.predict(X_test_graph)
-    seq.get_config(verbose=1)
-
-
-def test_create_output():
-    # test create_output argument
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-
-    graph.add_node(Dense(16), name='dense1', input='input1')
-    graph.add_node(Dense(4), name='dense2', input='input1')
-    graph.add_node(Dense(4), name='dense3', input='dense1')
-    graph.add_node(Dense(4), name='output1', inputs=['dense2', 'dense3'],
-                   merge_mode='sum', create_output=True)
-    graph.compile('rmsprop', {'output1': 'mse'})
-
-    history = graph.fit({'input1': X_train_graph, 'output1': y_train_graph},
-                        nb_epoch=10)
-    out = graph.predict({'input1': X_test_graph})
-    assert(type(out == dict))
-    assert(len(out) == 1)
-
-    loss = graph.test_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.train_on_batch({'input1': X_test_graph, 'output1': y_test_graph})
-    loss = graph.evaluate({'input1': X_test_graph, 'output1': y_test_graph})
-    assert(loss < 2.5)
-
-
-def test_count_params():
-    # test count params
-
-    nb_units = 100
-    nb_classes = 2
-
-    graph = Graph()
-    graph.add_input(name='input1', input_shape=(32,))
-    graph.add_input(name='input2', input_shape=(32,))
-    graph.add_node(Dense(nb_units),
-                   name='dense1', input='input1')
-    graph.add_node(Dense(nb_classes),
-                   name='dense2', input='input2')
-    graph.add_node(Dense(nb_classes),
-                   name='dense3', input='dense1')
-    graph.add_output(name='output', inputs=['dense2', 'dense3'],
-                     merge_mode='sum')
-
-    n = 32 * nb_units + nb_units
-    n += 32 * nb_classes + nb_classes
-    n += nb_units * nb_classes + nb_classes
-
-    assert(n == graph.count_params())
-
-    graph.compile('rmsprop', {'output': 'binary_crossentropy'})
-
-    assert(n == graph.count_params())
-
-
 if __name__ == '__main__':
-    # pytest.main([__file__])
-    test_lambda()
+    pytest.main([__file__])
@@ -37,33 +37,85 @@ y_test = np_utils.to_categorical(y_test, nb_classes=nb_class)
                                                                     output_shape=(1,))


-@pytest.mark.skipif(K._BACKEND=='tensorflow', reason="currently not working with TensorFlow")
-def test_keras_classifier():
+def build_fn_clf(hidden_dims=50):  # builds and returns a compiled Sequential model; hidden_dims sizes the added hidden layer
    model = Sequential()
    model.add(Dense(input_dim, input_shape=(input_dim,)))
    model.add(Activation('relu'))
+    model.add(Dense(hidden_dims))  # hidden layer whose width is the tunable parameter
+    model.add(Activation('relu'))
    model.add(Dense(nb_class))
    model.add(Activation('softmax'))
-
-    sklearn_clf = KerasClassifier(model, optimizer=optim, loss=loss,
-                                  train_batch_size=batch_size,
-                                  test_batch_size=batch_size,
-                                  nb_epoch=nb_epoch)
-    sklearn_clf.fit(X_train, y_train)
-    sklearn_clf.score(X_test, y_test)
+    model.compile(optimizer='sgd', loss='categorical_crossentropy',
+                  class_mode='binary')  # NOTE(review): softmax over nb_class units with a categorical loss - confirm 'binary' is intended
+    return model


-@pytest.mark.skipif(K._BACKEND=='tensorflow', reason="currently not working with TensorFlow")
-def test_keras_regressor():
+class Class_build_fn_clf(object):  # plain callable object that delegates to build_fn_clf
+    def __call__(self, hidden_dims):
+        return build_fn_clf(hidden_dims)
+
+
+class Inherit_class_build_fn_clf(KerasClassifier):  # KerasClassifier subclass whose __call__ delegates to build_fn_clf
+    def __call__(self, hidden_dims):
+        return build_fn_clf(hidden_dims)
+
+
+def build_fn_reg(hidden_dims=50):  # builds and returns a compiled Sequential model with one output unit
    model = Sequential()
    model.add(Dense(input_dim, input_shape=(input_dim,)))
    model.add(Activation('relu'))
+    model.add(Dense(hidden_dims))  # hidden layer whose width is the tunable parameter
+    model.add(Activation('relu'))
    model.add(Dense(1))
-    model.add(Activation('softmax'))
+    model.add(Activation('linear'))
+    model.compile(optimizer='sgd', loss='mean_absolute_error')
+    return model

-    sklearn_regressor = KerasRegressor(model, optimizer=optim, loss=loss,
-                                       train_batch_size=batch_size,
-                                       test_batch_size=batch_size,
-                                       nb_epoch=nb_epoch)
-    sklearn_regressor.fit(X_train_reg, y_train_reg)
-    sklearn_regressor.score(X_test_reg, y_test_reg)
+
+class Class_build_fn_reg(object):  # plain callable object that delegates to build_fn_reg
+    def __call__(self, hidden_dims):
+        return build_fn_reg(hidden_dims)
+
+
+class Inherit_class_build_fn_reg(KerasRegressor):  # KerasRegressor subclass whose __call__ delegates to build_fn_reg
+    def __call__(self, hidden_dims):
+        return build_fn_reg(hidden_dims)
+
+for fn in [build_fn_clf, Class_build_fn_clf(), Inherit_class_build_fn_clf]:  # a function, a callable instance, a wrapper subclass
+    if fn is Inherit_class_build_fn_clf:  # the subclass is instantiated directly, with build_fn=None
+        classifier = Inherit_class_build_fn_clf(
+            build_fn=None, hidden_dims=50, batch_size=batch_size, nb_epoch=nb_epoch)
+    else:
+        classifier = KerasClassifier(
+            build_fn=fn, hidden_dims=50, batch_size=batch_size, nb_epoch=nb_epoch)
+
+    classifier.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch)
+    score = classifier.score(X_train, y_train, batch_size=batch_size)  # NOTE: runs at module import; results are not asserted
+    preds = classifier.predict(X_test, batch_size=batch_size)
+    proba = classifier.predict_proba(X_test, batch_size=batch_size)
+
+
+for fn in [build_fn_reg, Class_build_fn_reg(), Inherit_class_build_fn_reg]:  # a function, a callable instance, a wrapper subclass
+    if fn is Inherit_class_build_fn_reg:  # the subclass is instantiated directly, with build_fn=None
+        regressor = Inherit_class_build_fn_reg(
+            build_fn=None, hidden_dims=50, batch_size=batch_size, nb_epoch=nb_epoch)
+    else:
+        regressor = KerasRegressor(
+            build_fn=fn, hidden_dims=50, batch_size=batch_size, nb_epoch=nb_epoch)
+
+    regressor.fit(X_train_reg, y_train_reg,
+                  batch_size=batch_size, nb_epoch=nb_epoch)
+    score = regressor.score(X_train_reg, y_train_reg, batch_size=batch_size)  # NOTE: runs at module import; results are not asserted
+    preds = regressor.predict(X_test_reg, batch_size=batch_size)  # regression test split, matching the *_reg training data
+
+
+# Usage of sklearn's grid_search
+# from sklearn import grid_search
+# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128], nb_epoch=[2], verbose=[0])
+# classifier = Inherit_class_build_fn_clf()
+# clf = grid_search.GridSearchCV(classifier, parameters)
+# clf.fit(X_train, y_train)
+# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128], nb_epoch=[2], verbose=[0])
+# regressor = Inherit_class_build_fn_reg()
+# reg = grid_search.GridSearchCV(regressor, parameters, scoring='mean_squared_error', n_jobs=1, cv=2, verbose=2)
+# reg.fit(X_train_reg, y_train_reg)
@@ -15,6 +15,7 @@ def check_layer_output_shape(layer, input_data):

    function = K.function([layer.input], [layer.get_output()])
    output = function([input_data])[0]
+
    assert output.shape[1:] == expected_output_shape


@@ -36,6 +37,7 @@ def test_Reshape():
    layer = Reshape(dims=(2, -1))
    check_layer_output_shape(layer, input_data)

+
 def test_Permute():
    layer = Permute(dims=(1, 3, 2))
    input_data = np.random.random((2, 2, 4, 3))
@@ -86,11 +88,11 @@ def test_Convolution1D():

 def test_Convolution2D():
    for border_mode in ['same', 'valid']:
-        for nb_row, nb_col in [(2, 2), (3, 3)]:
-            for subsample in [(1, 1), (2, 2)]:
-                if (subsample[0] > 1 or subsample[1] > 1) and border_mode == 'same':
+        for nb_row, nb_col in [(3, 3), (4, 4), (3, 4)]:
+            for subsample in [(1, 1), (2, 2), (3, 3)]:
+                if (subsample[0] > nb_row or subsample[1] > nb_col) and border_mode == 'same':
                    continue
-                for input_data_shape in [(2, 1, 3, 3), (2, 1, 4, 4)]:
+                for input_data_shape in [(2, 1, 5, 5), (2, 1, 6, 6)]:
                    layer = Convolution2D(nb_filter=1, nb_row=nb_row,
                                          nb_col=nb_row,
                                          border_mode=border_mode,
@@ -99,7 +101,7 @@ def test_Convolution2D():
                    input_data = np.random.random(input_data_shape)
                    check_layer_output_shape(layer, input_data)

-                for input_data_shape in [(2, 3, 3, 1)]:
+                for input_data_shape in [(2, 5, 5, 1)]:
                    layer = Convolution2D(nb_filter=1, nb_row=nb_row,
                                          nb_col=nb_row,
                                          border_mode=border_mode,