# Source code for cntk.contrib.deeprl.agent.shared.customized_models

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Customized Q function or (unnormalized) log of policy function.

If the models from cntk.contrib.deeprl.agent.shared.models are not adequate,
write your own model as a function that takes two required arguments,
'shape_of_inputs' and 'number_of_outputs', and two optional arguments,
'loss_function' and 'use_placeholder_for_input', and returns a dictionary
containing 'inputs', 'outputs', 'f' and 'loss'. In the config file, set
QRepresentation or PolicyRepresentation to the path
(module_name.function_name) of the function. QLearning/PolicyGradient will
then automatically search for it.
"""

import cntk as C
import numpy as np


def conv_dqn(shape_of_inputs,
             number_of_outputs,
             loss_function=None,
             use_placeholder_for_input=False):
    """Example convolutional neural network for approximating the Q value function.

    Modeled on the network used in the original DQN paper
    https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf.

    The network is three convolutional layers followed by a 512-unit dense
    layer and a linear dense layer with one output per action. ReLU is the
    default activation for every layer except the last one.

    Args:
        shape_of_inputs: tuple of array (input) dimensions.
        number_of_outputs: dimension of output, equals the number of
            possible actions.
        loss_function: callable invoked as ``loss_function(q, outputs)``
            that returns the loss. If not specified (None), squared loss
            is used by default.
        use_placeholder_for_input: if true, 'inputs' is created as a
            placeholder and has to be replaced later with an actual
            input_variable; otherwise a float32 input_variable is created.

    Returns: a Python dictionary with string-valued keys:
        'inputs': the input variable (or placeholder) fed to the network.
        'outputs': a float32 input variable of shape (number_of_outputs,)
            that the loss compares the network output against.
        'f': the network output (Q values).
        'loss': the loss computed from 'f' and 'outputs'.
    """
    # input/output
    if use_placeholder_for_input:
        inputs = C.ops.placeholder(shape=shape_of_inputs)
    else:
        inputs = C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32)
    # 'outputs' is itself an input variable: it carries the target values
    # that are passed to the loss together with the network output.
    outputs = C.ops.input_variable(
        shape=(number_of_outputs,), dtype=np.float32)

    # network structure
    # Shift by 128 and divide by 256 before the first layer.
    # NOTE(review): presumably the inputs are raw 8-bit pixel values in
    # [0, 255], which this maps to roughly [-0.5, 0.5) -- confirm.
    centered_inputs = inputs - 128
    scaled_inputs = centered_inputs / 256

    # ReLU is the default activation inside this block; the final Dense
    # layer overrides it with activation=None to produce linear outputs.
    with C.layers.default_options(activation=C.ops.relu):
        # NOTE(review): the referenced paper describes the third
        # convolution with stride 1, while stride 2 is used here. Left
        # unchanged to preserve the existing architecture -- confirm intent.
        q = C.layers.Sequential([
            C.layers.Convolution((8, 8), 32, strides=4),
            C.layers.Convolution((4, 4), 64, strides=2),
            C.layers.Convolution((3, 3), 64, strides=2),
            C.layers.Dense((512,)),
            C.layers.Dense(number_of_outputs, activation=None)
        ])(scaled_inputs)

    if loss_function is None:
        loss = C.losses.squared_error(q, outputs)
    else:
        loss = loss_function(q, outputs)

    return {
        'inputs': inputs,
        'outputs': outputs,
        'f': q,
        'loss': loss
    }