Addition of MobileNet to application (#7009)

* Add MobileNet to application * Add support for 1001 classes in imagenet utils * Revert a mistake in the tests * Setup application test for mobilenet to run only when on tensorflow * Correct pytest.mark.skipif explanation for skipping tests if not on tensorflow * Corrected mobilenet to support 1000 classes and reverted imagenet_utils to prior state * Fix tensorflow test * Restrict mobilenets to data format "channels_last" * Add review fixes * PEP8 fix * Add relu6 to activations.py * Corrected imports in mobilenet.py * Rolled back activation relu6 and inlined it to mobilenet.py * Refactored DepthwiseConv2D and other corrections * Fixed tests * PEP8 correction * Add docs to private functions and other fixes * Fix failed test where input shape is None * Fix value of size for model name
2017-06-27 13:11:48 -05:00
commit 49a7c7376d
@@ -3,3 +3,4 @@ from .vgg19 import VGG19
 from .resnet50 import ResNet50
 from .inception_v3 import InceptionV3
 from .xception import Xception
+from .mobilenet import MobileNet
@@ -58,7 +58,7 @@ def decode_predictions(preds, top=5):
    if len(preds.shape) != 2 or preds.shape[1] != 1000:
        raise ValueError('`decode_predictions` expects '
                         'a batch of predictions '
-                         '(i.e. a 2D array of shape (samples, 1000)). '
+                         '(i.e. a 2D array of shape (samples, 1000)).'
                         'Found array with shape: ' + str(preds.shape))
    if CLASS_INDEX is None:
        fpath = get_file('imagenet_class_index.json',
@@ -0,0 +1,616 @@
+"""MobileNet v1 models for Keras.
+
+MobileNet is a general architecture and can be used for multiple use cases.
+Depending on the use case, it can use different input layer size and
+different width factors. This allows different width models to reduce
+the number of multiply-adds and thereby reduce inference cost on mobile devices.
+
+MobileNets support any input size greater than 32 x 32, with larger image sizes
+offering better performance. The number of parameters and number of multiply-adds
+can be modified by using the `alpha` parameter, which increases/decreases the number
+of filters in each layer. By altering the image size and `alpha` parameter,
+all 16 models from the paper can be built, with ImageNet weights provided.
+
+The paper demonstrates the performance of MobileNets using `alpha` values of
+1.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25. For each of these `alpha`
+values, weights for 4 different input image sizes are provided (224, 192, 160, 128).
+
+The following table describes the size and accuracy of the 100% MobileNet on size 224 x 224
+-----------------------------------------------------------------------------------------
+Width Multiplier (alpha) | ImageNet Accuracy | Million Multiply-Adds | Million Parameters
+-----------------------------------------------------------------------------------------
+|   1.0 MobileNet-224    |      70.6 %       |          529          |        4.2       |
+|   0.75 MobileNet-224   |      68.4 %       |          325          |        2.6       |
+|   0.50 MobileNet-224   |      63.7 %       |          149          |        1.3       |
+|   0.25 MobileNet-224   |      50.6 %       |          41           |        0.5       |
+-----------------------------------------------------------------------------------------
+
+The following table describes the performance of the 100 % MobileNet on various input sizes
+-----------------------------------------------------------------------------------------
+        Resolution       | ImageNet Accuracy | Million Multiply-Adds | Million Parameters
+-----------------------------------------------------------------------------------------
+|   1.0 MobileNet-224    |      70.6 %       |          529          |        4.2       |
+|   1.0 MobileNet-192    |      69.1 %       |          529          |        4.2       |
+|   1.0 MobileNet-160    |      67.2 %       |          529          |        4.2       |
+|   1.0 MobileNet-128    |      64.4 %       |          529          |        4.2       |
+-----------------------------------------------------------------------------------------
+
+The weights for all 16 models are obtained and translated from Tensorflow checkpoints
+found in https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.md
+
+# Reference
+- [MobileNets: Efficient Convolutional Neural Networks for
+   Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf))
+"""
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+from ..models import Model
+from ..layers import Input
+from ..layers import Activation
+from ..layers import Dropout
+from ..layers import Reshape
+from ..layers import BatchNormalization
+from ..layers import Convolution2D
+from ..layers import GlobalAveragePooling2D
+from ..layers import GlobalMaxPooling2D
+from ..layers import Conv2D
+from .. import initializers
+from .. import regularizers
+from .. import constraints
+from ..utils import conv_utils
+from ..utils.data_utils import get_file
+from ..engine.topology import get_source_inputs
+from ..engine import InputSpec
+from ..applications.imagenet_utils import _obtain_input_shape
+from ..applications.imagenet_utils import preprocess_input
+from ..applications.imagenet_utils import decode_predictions
+from .. import backend as K
+
+
+BASE_WEIGHT_PATH = 'https://github.com/titu1994/MobileNetworks/releases/download/v1.0/'
+
+
+def relu6(x):
+    return K.relu(x, max_value=6)
+
+
+class DepthwiseConv2D(Conv2D):
+    """Depthwise separable 2D convolution.
+    Depthwise Separable convolutions consists in performing
+    just the first step in a depthwise spatial convolution
+    (which acts on each input channel separately).
+    The `depth_multiplier` argument controls how many
+    output channels are generated per input channel in the depthwise step.
+
+    # Arguments
+        kernel_size: An integer or tuple/list of 2 integers, specifying the
+            width and height of the 2D convolution window.
+            Can be a single integer to specify the same value for
+            all spatial dimensions.
+        strides: An integer or tuple/list of 2 integers,
+            specifying the strides of the convolution along the width and height.
+            Can be a single integer to specify the same value for
+            all spatial dimensions.
+            Specifying any stride value != 1 is incompatible with specifying
+            any `dilation_rate` value != 1.
+        padding: one of `"valid"` or `"same"` (case-insensitive).
+        depth_multiplier: The number of depthwise convolution output channels
+            for each input channel.
+            The total number of depthwise convolution output
+            channels will be equal to `filters_in * depth_multiplier`.
+        data_format: A string,
+            one of `channels_last` (default) or `channels_first`.
+            The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape
+            `(batch, height, width, channels)` while `channels_first`
+            corresponds to inputs with shape
+            `(batch, channels, height, width)`.
+            It defaults to the `image_data_format` value found in your
+            Keras config file at `~/.keras/keras.json`.
+            If you never set it, then it will be "channels_last".
+        activation: Activation function to use
+            (see [activations](../activations.md)).
+            If you don't specify anything, no activation is applied
+            (ie. "linear" activation: `a(x) = x`).
+        use_bias: Boolean, whether the layer uses a bias vector.
+        depthwise_initializer: Initializer for the depthwise kernel matrix
+            (see [initializers](../initializers.md)).
+        bias_initializer: Initializer for the bias vector
+            (see [initializers](../initializers.md)).
+        depthwise_regularizer: Regularizer function applied to
+            the depthwise kernel matrix
+            (see [regularizer](../regularizers.md)).
+        bias_regularizer: Regularizer function applied to the bias vector
+            (see [regularizer](../regularizers.md)).
+        activity_regularizer: Regularizer function applied to
+            the output of the layer (its "activation").
+            (see [regularizer](../regularizers.md)).
+        depthwise_constraint: Constraint function applied to
+            the depthwise kernel matrix
+            (see [constraints](../constraints.md)).
+        bias_constraint: Constraint function applied to the bias vector
+            (see [constraints](../constraints.md)).
+
+    # Input shape
+        4D tensor with shape:
+        `[batch, channels, rows, cols]` if data_format='channels_first'
+        or 4D tensor with shape:
+        `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+        or 4D tensor with shape:
+        `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to padding.
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 strides=(1, 1),
+                 padding='valid',
+                 depth_multiplier=1,
+                 data_format=None,
+                 activation=None,
+                 use_bias=True,
+                 depthwise_initializer='glorot_uniform',
+                 bias_initializer='zeros',
+                 depthwise_regularizer=None,
+                 bias_regularizer=None,
+                 activity_regularizer=None,
+                 depthwise_constraint=None,
+                 bias_constraint=None,
+                 **kwargs):
+        super(DepthwiseConv2D, self).__init__(
+            filters=None,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            activation=activation,
+            use_bias=use_bias,
+            bias_regularizer=bias_regularizer,
+            activity_regularizer=activity_regularizer,
+            bias_constraint=bias_constraint,
+            **kwargs)
+        self.depth_multiplier = depth_multiplier
+        self.depthwise_initializer = initializers.get(depthwise_initializer)
+        self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+        self.depthwise_constraint = constraints.get(depthwise_constraint)
+        self.bias_initializer = initializers.get(bias_initializer)
+
+    def build(self, input_shape):
+        if len(input_shape) < 4:
+            raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                             'Received input shape:', str(input_shape))
+        if self.data_format == 'channels_first':
+            channel_axis = 1
+        else:
+            channel_axis = 3
+        if input_shape[channel_axis] is None:
+            raise ValueError('The channel dimension of the inputs to '
+                             '`DepthwiseConv2D` '
+                             'should be defined. Found `None`.')
+        input_dim = int(input_shape[channel_axis])
+        depthwise_kernel_shape = (self.kernel_size[0],
+                                  self.kernel_size[1],
+                                  input_dim,
+                                  self.depth_multiplier)
+
+        self.depthwise_kernel = self.add_weight(
+            shape=depthwise_kernel_shape,
+            initializer=self.depthwise_initializer,
+            name='depthwise_kernel',
+            regularizer=self.depthwise_regularizer,
+            constraint=self.depthwise_constraint)
+
+        if self.use_bias:
+            self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
+                                        initializer=self.bias_initializer,
+                                        name='bias',
+                                        regularizer=self.bias_regularizer,
+                                        constraint=self.bias_constraint)
+        else:
+            self.bias = None
+        # Set input spec.
+        self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+        self.built = True
+
+    def call(self, inputs, training=None):
+        outputs = K.depthwise_conv2d(
+            inputs,
+            self.depthwise_kernel,
+            strides=self.strides,
+            padding=self.padding,
+            dilation_rate=self.dilation_rate,
+            data_format=self.data_format)
+
+        if self.bias:
+            outputs = K.bias_add(
+                outputs,
+                self.bias,
+                data_format=self.data_format)
+
+        if self.activation is not None:
+            return self.activation(outputs)
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        if self.data_format == 'channels_first':
+            rows = input_shape[2]
+            cols = input_shape[3]
+            out_filters = input_shape[1] * self.depth_multiplier
+        elif self.data_format == 'channels_last':
+            rows = input_shape[1]
+            cols = input_shape[2]
+            out_filters = input_shape[3] * self.depth_multiplier
+
+        rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                             self.padding,
+                                             self.strides[0])
+        cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                             self.padding,
+                                             self.strides[1])
+
+        if self.data_format == 'channels_first':
+            return (input_shape[0], out_filters, rows, cols)
+        elif self.data_format == 'channels_last':
+            return (input_shape[0], rows, cols, out_filters)
+
+    def get_config(self):
+        config = super(DepthwiseConv2D, self).get_config()
+        config.pop('filters')
+        config.pop('kernel_initializer')
+        config.pop('kernel_regularizer')
+        config.pop('kernel_constraint')
+        config['depth_multiplier'] = self.depth_multiplier
+        config['depthwise_initializer'] = initializers.serialize(self.depthwise_initializer)
+        config['depthwise_regularizer'] = regularizers.serialize(self.depthwise_regularizer)
+        config['depthwise_constraint'] = constraints.serialize(self.depthwise_constraint)
+        return config
+
+
+def MobileNet(input_shape=None,
+              alpha=1.0,
+              depth_multiplier=1,
+              dropout=1e-3,
+              include_top=True,
+              weights='imagenet',
+              input_tensor=None,
+              pooling=None,
+              classes=1000):
+    """
+    Instantiate the MobileNet architecture.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+
+    To load a MobileNet model via `load_model`, import the custom
+    objects `relu6` and `DepthwiseConv2D` and pass them to the
+    `custom_objects` parameter.
+    E.g.
+    model = load_model('mobilenet.h5', custom_objects={
+                       'relu6': mobilenet.relu6,
+                       'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` (with `channels_last` data format)
+            or (3, 224, 224) (with `channels_first` data format).
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(200, 200, 3)` would be one valid value.
+        alpha: controls the width of the network.
+            - If `alpha` < 1.0, proportionally decreases the number
+                of filters in each layer.
+            - If `alpha` > 1.0, proportionally increases the number
+                of filters in each layer.
+            - If `alpha` = 1, default number of filters from the paper
+                 are used at each layer.
+        depth_multiplier: depth multiplier for depthwise convolution
+            (also called the resolution multiplier)
+        dropout: dropout rate
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+
+    # Returns
+        A Keras model instance.
+
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+
+    if K.backend() != 'tensorflow':
+        raise RuntimeError('Only Tensorflow backend is currently supported, '
+                           'as other backends do not support depthwise convolution.')
+
+    if weights not in {'imagenet', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
+
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top`'
+                         ' as true, `classes` should be 1000')
+
+    if weights == 'imagenet' and depth_multiplier != 1:
+        if depth_multiplier != 1:
+            raise ValueError('If imagenet weights are being loaded, '
+                             'depth multiplier must be 1')
+
+        if alpha not in [0.25, 0.50, 0.75, 1.0]:
+            raise ValueError('If imagenet weights are being loaded, alpha can be one of'
+                             '`0.25`, `0.50`, `0.75` or `1.0` only.')
+
+        rows, cols = (0, 1) if K.image_data_format() == 'channels_last' else (1, 2)
+        rows = int(input_shape[rows])
+        cols = int(input_shape[cols])
+
+        if rows != cols or rows not in [128, 160, 192, 224]:
+            raise ValueError('If imagenet weights are being loaded,'
+                             'image must have a square shape (one of '
+                             '(128,128), (160,160), (192,192), or (224, 224)).'
+                             'Image shape provided = (%d, %d)' % (rows, cols))
+
+    if K.image_data_format() != 'channels_last':
+        warnings.warn('The MobileNet family of models is only available '
+                      'for the input data format "channels_last" '
+                      '(width, height, channels). '
+                      'However your settings specify the default '
+                      'data format "channels_first" (channels, width, height). '
+                      'You should set `image_data_format="channels_last"` in your Keras '
+                      'config located at ~/.keras/keras.json. '
+                      'The model being returned right now will expect inputs '
+                      'to follow the "channels_last" data format.')
+        K.set_image_data_format('channels_last')
+        old_data_format = 'channels_first'
+    else:
+        old_data_format = None
+
+    # Determine proper input shape. Note, include_top is False by default, as
+    # input shape can be anything larger than 32x32 and the same number of parameters
+    # will be used.
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=224,
+                                      min_size=32,
+                                      data_format=K.image_data_format(),
+                                      include_top=False)
+
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+
+    x = _conv_block(img_input, 32, alpha, strides=(2, 2))
+    x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, id=1)
+
+    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, strides=(2, 2), id=2)
+    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, id=3)
+
+    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, strides=(2, 2), id=4)
+    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, id=5)
+
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, strides=(2, 2), id=6)
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, id=7)
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, id=8)
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, id=9)
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, id=10)
+    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, id=11)
+
+    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, strides=(2, 2), id=12)
+    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, id=13)
+
+    if include_top:
+        if K.image_data_format() == 'channels_first':
+            shape = (int(1024 * alpha), 1, 1)
+        else:
+            shape = (1, 1, int(1024 * alpha))
+
+        x = GlobalAveragePooling2D()(x)
+        x = Reshape(shape, name='reshape_1')(x)
+        x = Dropout(dropout, name='dropout')(x)
+        x = Convolution2D(classes, (1, 1), padding='same', name='conv_preds')(x)
+        x = Activation('softmax', name='act_softmax')(x)
+        x = Reshape((classes,), name='reshape_2')(x)
+    else:
+        if pooling == 'avg':
+            x = GlobalAveragePooling2D()(x)
+        elif pooling == 'max':
+            x = GlobalMaxPooling2D()(x)
+
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+
+    rows, cols = (0, 1) if K.image_data_format() == 'channels_last' else (1, 2)
+    if input_shape[rows] is None:
+        size = 'None'
+    else:
+        size = str(input_shape[rows])
+
+    # Create model.
+    model = Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, size))
+
+    # load weights
+    if weights == 'imagenet':
+        if K.image_data_format() == 'channels_first':
+            raise AttributeError('Weights for Channels Last format are not available')
+
+        if alpha == 1.0:
+            alpha_text = '1_0'
+        elif alpha == 0.75:
+            alpha_text = '7_5'
+        elif alpha == 0.50:
+            alpha_text = '5_0'
+        else:
+            alpha_text = '2_5'
+
+        if include_top:
+            model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)
+            weigh_path = BASE_WEIGHT_PATH + model_name
+            weights_path = get_file(model_name,
+                                    weigh_path,
+                                    cache_subdir='models')
+        else:
+            model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)
+            weigh_path = BASE_WEIGHT_PATH + model_name
+            weights_path = get_file(model_name,
+                                    weigh_path,
+                                    cache_subdir='models')
+
+        model.load_weights(weights_path)
+
+    if old_data_format:
+        K.set_image_data_format(old_data_format)
+
+    return model
+
+
+def _conv_block(input, filters, alpha, kernel=(3, 3), strides=(1, 1)):
+    """Adds an initial convolution layer (with batch normalization and relu6).
+
+    # Arguments
+        input: Input tensor of shape `(rows, cols, 3)`
+            (with `channels_last` data format) or
+            (3, rows, cols) (with `channels_first` data format).
+            It should have exactly 3 inputs channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        filters: Integer, the dimensionality of the output space
+            (i.e. the number output of filters in the convolution).
+        alpha: controls the width of the network.
+            - If `alpha` < 1.0, proportionally decreases the number
+                of filters in each layer.
+            - If `alpha` > 1.0, proportionally increases the number
+                of filters in each layer.
+            - If `alpha` = 1, default number of filters from the paper
+                 are used at each layer.
+        kernel: An integer or tuple/list of 2 integers, specifying the
+            width and height of the 2D convolution window.
+            Can be a single integer to specify the same value for
+            all spatial dimensions.
+        strides: An integer or tuple/list of 2 integers,
+            specifying the strides of the convolution along the width and height.
+            Can be a single integer to specify the same value for
+            all spatial dimensions.
+            Specifying any stride value != 1 is incompatible with specifying
+            any `dilation_rate` value != 1.
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to stride.
+    """
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    filters = int(filters * alpha)
+
+    x = Convolution2D(filters, kernel, padding='same', use_bias=False, strides=strides,
+                      name='conv1')(input)
+    x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
+    x = Activation(relu6, name='conv1_relu')(x)
+
+    return x
+
+
+def _depthwise_conv_block(input, pointwise_conv_filters, alpha,
+                          depth_multiplier=1, strides=(1, 1), id=1):
+    """
+    Adds a depthwise convolution block (depthwise conv, batch normalization, relu6,
+    pointwise convolution, batch normalization and relu6).
+
+    # Arguments
+        input: Input tensor of shape `(rows, cols, channels)`
+            (with `channels_last` data format) or
+            (channels, rows, cols) (with `channels_first` data format).
+        pointwise_conv_filters: Integer, the dimensionality of the output space
+            (i.e. the number output of filters in the pointwise convolution).
+        alpha: controls the width of the network.
+            - If `alpha` < 1.0, proportionally decreases the number
+                of filters in each layer.
+            - If `alpha` > 1.0, proportionally increases the number
+                of filters in each layer.
+            - If `alpha` = 1, default number of filters from the paper
+                 are used at each layer.
+        depth_multiplier: The number of depthwise convolution output channels
+            for each input channel.
+            The total number of depthwise convolution output
+            channels will be equal to `filters_in * depth_multiplier`.
+        strides: An integer or tuple/list of 2 integers,
+            specifying the strides of the convolution along the width and height.
+            Can be a single integer to specify the same value for
+            all spatial dimensions.
+            Specifying any stride value != 1 is incompatible with specifying
+            any `dilation_rate` value != 1.
+        id: Integer, a unique identification designating  the block number.
+
+    # Input shape
+        4D tensor with shape:
+        `(batch, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(batch, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to stride.
+    """
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+    pointwise_conv_filters = int(pointwise_conv_filters * alpha)
+
+    x = DepthwiseConv2D(kernel_size=(3, 3), padding='same', depth_multiplier=depth_multiplier,
+                        strides=strides, use_bias=False, name='conv_dw_%d' % id)(input)
+    x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % id)(x)
+    x = Activation(relu6, name='conv_dw_%d_relu' % id)(x)
+
+    x = Convolution2D(pointwise_conv_filters, (1, 1), padding='same', use_bias=False, strides=(1, 1),
+                      name='conv_pw_%d' % id)(x)
+    x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % id)(x)
+    x = Activation(relu6, name='conv_pw_%d_relu' % id)(x)
+
+    return x
@@ -1352,6 +1352,16 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid',
    return _postprocess_conv2d_output(x, data_format)


+def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1),
+                     padding='valid', data_format=None, dilation_rate=(1, 1)):
+    raise NotImplementedError
+
+
+def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
+                     data_format=None, dilation_rate=(1, 1)):
+    raise NotImplementedError
+
+
 def conv3d(x, kernel, strides=(1, 1, 1), padding='valid',
           data_format=None, dilation_rate=(1, 1, 1)):
    if data_format is None:
@@ -3211,6 +3211,41 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1),
    return _postprocess_conv2d_output(x, data_format)


+def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
+                     data_format=None, dilation_rate=(1, 1)):
+    """2D convolution with separable filters.
+
+    # Arguments
+        x: input tensor
+        depthwise_kernel: convolution kernel for the depthwise convolution.
+        strides: strides tuple (length 2).
+        padding: string, `"same"` or `"valid"`.
+        data_format: string, `"channels_last"` or `"channels_first"`.
+        dilation_rate: tuple of integers,
+            dilation rates for the separable convolution.
+
+    # Returns
+        Output tensor.
+
+    # Raises
+        ValueError: if `data_format` is neither `channels_last` or `channels_first`.
+    """
+    if data_format is None:
+        data_format = image_data_format()
+    if data_format not in {'channels_first', 'channels_last'}:
+        raise ValueError('Unknown data_format ' + str(data_format))
+
+    x = _preprocess_conv2d_input(x, data_format)
+    padding = _preprocess_padding(padding)
+    strides = (1,) + strides + (1,)
+
+    x = tf.nn.depthwise_conv2d(x, depthwise_kernel,
+                               strides=strides,
+                               padding=padding,
+                               rate=dilation_rate)
+    return _postprocess_conv2d_output(x, data_format)
+
+
 def conv3d(x, kernel, strides=(1, 1, 1), padding='valid',
           data_format=None, dilation_rate=(1, 1, 1)):
    """3D convolution.
@@ -1919,6 +1919,11 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1),
    raise NotImplementedError


+def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
+                     data_format=None, dilation_rate=(1, 1)):
+    raise NotImplementedError
+
+
 def conv3d(x, kernel, strides=(1, 1, 1),
           padding='valid', data_format=None,
           dilation_rate=(1, 1, 1)):
@@ -1,5 +1,8 @@
 import pytest
 from keras.utils.test_utils import keras_test
+from keras.utils.test_utils import layer_test
+from keras.utils.generic_utils import CustomObjectScope
+from keras.models import Sequential
 from keras import applications
 from keras import backend as K

@@ -166,5 +169,73 @@ def test_inceptionv3_pooling():
    assert model.output_shape == (None, 2048)


+@keras_test
+@pytest.mark.skipif((K.backend() != 'tensorflow'),
+                    reason="MobileNets are supported only on Tensorflow")
+def test_mobilenet():
+    model = applications.MobileNet(weights=None)
+    assert model.output_shape == (None, 1000)
+
+
+@keras_test
+@pytest.mark.skipif((K.backend() != 'tensorflow'),
+                    reason="MobileNets are supported only on Tensorflow")
+def test_mobilenet_no_top():
+    model = applications.MobileNet(weights=None, include_top=False)
+    assert model.output_shape == (None, None, None, 1024)
+
+
+@keras_test
+@pytest.mark.skipif((K.backend() != 'tensorflow'),
+                    reason="MobileNets are supported only on Tensorflow")
+def test_mobilenet_pooling():
+    model = applications.MobileNet(weights=None, include_top=False, pooling='avg')
+    assert model.output_shape == (None, 1024)
+
+
+@pytest.mark.skipif(K.backend() != 'tensorflow', reason='Requires TF backend')
+@keras_test
+def test_depthwise_conv_2d():
+    _convolution_paddings = ['valid', 'same']
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    with CustomObjectScope({'relu6': applications.mobilenet.relu6,
+                            'DepthwiseConv2D': applications.mobilenet.DepthwiseConv2D}):
+        for padding in _convolution_paddings:
+            for strides in [(1, 1), (2, 2)]:
+                for multiplier in [1, 2]:
+                    if padding == 'same' and strides != (1, 1):
+                        continue
+
+                    layer_test(applications.mobilenet.DepthwiseConv2D,
+                               kwargs={'kernel_size': (3, 3),
+                                       'padding': padding,
+                                       'strides': strides,
+                                       'depth_multiplier': multiplier},
+                               input_shape=(num_samples, num_row, num_col, stack_size))
+
+        layer_test(applications.mobilenet.DepthwiseConv2D,
+                   kwargs={'kernel_size': 3,
+                           'padding': padding,
+                           'data_format': 'channels_first',
+                           'activation': None,
+                           'depthwise_regularizer': 'l2',
+                           'bias_regularizer': 'l2',
+                           'activity_regularizer': 'l2',
+                           'depthwise_constraint': 'unit_norm',
+                           'strides': strides,
+                           'depth_multiplier': multiplier},
+                   input_shape=(num_samples, stack_size, num_row, num_col))
+
+        # Test invalid use case
+        with pytest.raises(ValueError):
+            model = Sequential([applications.mobilenet.DepthwiseConv2D(kernel_size=3,
+                                                                       padding=padding,
+                                                                       batch_input_shape=(None, None, 5, None))])
+
+
 if __name__ == '__main__':
    pytest.main([__file__])