Source code for cntk.contrib.deeprl.agent.shared.policy_gradient_parameters

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Policy Gradient parameters."""

import configparser


class PolicyGradientParameters:
    """Parameters used by Policy Gradient algorithms."""

    def __init__(self, config_file):
        """Read parameter values from config_file.

        Use the default value if a parameter is not present.
        """
        self.config = configparser.ConfigParser()
        self.config.optionxform = str
        self.config.read(config_file)

        # Discount factor.
        self.gamma = self.config.getfloat(
            'General', 'Gamma', fallback=0.95)

        # Name of the class that does preprocessing.
        self.preprocessing = self.config.get(
            'General', 'PreProcessing', fallback='')

        # Arguments of preprocessing (except the first argument, input_shape)
        # as a tuple.
        self.preprocessing_args = self.config.get(
            'General', 'PreProcessingArgs', fallback='()')

        # If true, policy pi and value function V share all non-output
        # layers. PolicyRepresentation (and/or PolicyNetworkHiddenLayers)
        # defines the structure of all non-output layers; the policy then has
        # one softmax output layer, and the value function has one linear
        # output layer. If false, all non-output layers of the policy are
        # still specified by PolicyRepresentation, which is equivalent to
        # defining the unnormalized log of policy pi. The value function,
        # however, is completely specified by ValueFunctionRepresentation
        # (and/or ValueNetworkHiddenLayers), which outputs a scalar.
        self.shared_representation = self.config.getboolean(
            'PolicyGradient', 'SharedRepresentation', fallback=False)

        # Representation of the policy.
        self.policy_representation = self.config.get(
            'PolicyGradient', 'PolicyRepresentation', fallback='nn')

        # Suppose the gradient of the policy network is g and the gradient of
        # the value network is g_v. During each update, the policy network is
        # updated as \theta <- \theta + \eta * g, where \eta is the learning
        # rate, and the value network is updated as
        # \theta_v <- \theta_v + \eta * relative_step_size * g_v. This allows
        # the policy network and the value network to be updated at different
        # learning rates. Alternatively, it can be viewed as the relative
        # weight between the policy loss and the value function loss.
        self.relative_step_size = self.config.getfloat(
            'PolicyGradient', 'RelativeStepSize', fallback=0.5)

        # Weight of the regularization term.
        self.regularization_weight = self.config.getfloat(
            'PolicyGradient', 'RegularizationWeight', fallback=0.001)

        # Number of nodes in each hidden layer of the policy network.
        self.policy_network_hidden_layers = self.config.get(
            'NetworkModel', 'PolicyNetworkHiddenLayerNodes', fallback='[10]')

        # Representation of the value function.
        self.value_function_representation = self.config.get(
            'PolicyGradient', 'ValueFunctionRepresentation', fallback='nn')

        # Number of nodes in each hidden layer of the value network.
        self.value_network_hidden_layers = self.config.get(
            'NetworkModel', 'ValueNetworkHiddenLayerNodes', fallback='[10]')

        # Initial value of eta, the learning rate for gradient descent.
        self.initial_eta = self.config.getfloat(
            'Optimization', 'InitialEta', fallback=0.001)

        # Number of steps before eta reaches its minimum value.
        self.eta_decay_step_count = self.config.getint(
            'Optimization', 'EtaDecayStepCount', fallback=100000)

        # Minimum value of eta. Since Adam is used as the optimizer, a good
        # starting point is to set EtaMinimum equal to InitialEta, which is
        # equivalent to using a constant global learning rate while Adam
        # continuously adapts the effective per-parameter step sizes.
        self.eta_minimum = self.config.getfloat(
            'Optimization', 'EtaMinimum', fallback=0.001)

        # Momentum used by Adam.
        self.momentum = self.config.getfloat(
            'Optimization', 'Momentum', fallback=0.95)

        # Update frequency for the policy network and the value network, in
        # number of time steps.
        self.update_frequency = self.config.getint(
            'PolicyGradient', 'UpdateFrequency', fallback=64)

        # Name of a file containing a model with the same structure as the
        # policy network (unnormalized log of policy pi), where the model is
        # obtained through other methods (e.g. supervised learning) and saved
        # by cntk.ops.functions.Function.save(). Random initialization is
        # performed if the value is empty.
        self.initial_policy_network = self.config.get(
            'PolicyGradient', 'InitialPolicy', fallback='')

    def save(self, config_file):
        """Write the current parameter values back to config_file."""
        with open(config_file, 'w') as c:
            self.config.write(c)
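The parameters above correspond to an INI-style file read with configparser. The snippet below is a minimal usage sketch, not part of the module: the section and option names are exactly the ones queried in __init__, while the file name and the specific values are illustrative. Any option left out of the file falls back to the default shown above.

from cntk.contrib.deeprl.agent.shared.policy_gradient_parameters import \
    PolicyGradientParameters

# Example config text; the values here are illustrative only.
example_config = """\
[General]
Gamma = 0.99

[PolicyGradient]
SharedRepresentation = True
RelativeStepSize = 0.5
UpdateFrequency = 32

[NetworkModel]
PolicyNetworkHiddenLayerNodes = [20, 20]
ValueNetworkHiddenLayerNodes = [20, 20]

[Optimization]
InitialEta = 0.001
EtaMinimum = 0.001
EtaDecayStepCount = 100000
"""

with open('pg_example.ini', 'w') as f:  # hypothetical file name
    f.write(example_config)

params = PolicyGradientParameters('pg_example.ini')
print(params.gamma)                  # 0.99, taken from the file
print(params.policy_representation)  # 'nn', the fallback (key not in file)

# save() writes the parsed configuration back out to a new file.
params.save('pg_example_copy.ini')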
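The eta-related options describe a learning-rate schedule that is applied by the agent, not by this class. The sketch below shows one plausible reading of them, assuming a linear decay from InitialEta to EtaMinimum over EtaDecayStepCount steps; the schedule actually used during training is defined elsewhere in the agent code.

def decayed_eta(step, params):
    # Assumed linear interpolation from initial_eta down to eta_minimum;
    # after eta_decay_step_count steps the value stays at eta_minimum.
    fraction = min(step / float(params.eta_decay_step_count), 1.0)
    return params.initial_eta + fraction * (params.eta_minimum - params.initial_eta)

# RelativeStepSize scales the value-network update relative to the policy
# update, matching the comment in __init__:
#   theta   <- theta   + eta * g                         (policy network)
#   theta_v <- theta_v + eta * relative_step_size * g_v  (value network)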