TD3 code implementation

Posted by poppy on Wed, 06 Oct 2021 00:29:52 +0200

Code and explanation

0. Operating environment

Equipment / package        Version
Python                     3.7.11
Graphics card              GTX 1050
CUDA                       10.2
cuDNN                      7.6.5
cudatoolkit                10.0.130
tensorflow-gpu             2.2.0
tensorlayer                2.2.3
tensorflow-probability     0.9.0

1. Package introduction and parameter setting

import argparse
import os
import random
import time

import gym
import numpy as np
import tensorflow as tf

import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model

from matplotlib import animation
import matplotlib.pyplot as plt

Normal = tfp.distributions.Normal

parser = argparse.ArgumentParser()
# boolean flags: pass e.g. --train on the command line to enable them
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--render', dest='render', action='store_true', default=False)
parser.add_argument('--save_gif', dest='save_gif', action='store_true', default=False)

parser.add_argument('--train_episodes', type=int, default=2000)
parser.add_argument('--test_episodes', type=int, default=10)
# during training, one episode lasts at most max_steps steps
parser.add_argument('--max_steps', type=int, default=200)
# for the first explore_steps steps of training, actions are chosen by random sampling
parser.add_argument('--explore_steps', type=int, default=500)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--replay_buffer_size', type=int, default=int(5e5))  # int default (argparse does not apply type to defaults)

# Hidden layer dimension
parser.add_argument('--hidden_dim', type=int, default=64)
# number of gradient updates performed per environment step
parser.add_argument('--update_itr', type=int, default=3)
# delayed policy update: the policy and target networks are updated once every delayed_update_itr Q updates
parser.add_argument('--delayed_update_itr', type=int, default=3)
parser.add_argument('--q_lr', type=float, default=3e-4)
parser.add_argument('--policy_lr', type=float, default=3e-4)
parser.add_argument('--gamma', type=float, default=0.95)
# soft update coefficient tau, used when updating the target networks
parser.add_argument('--tau', type=float, default=0.01)
# scale of the exploration noise added when interacting with the environment
parser.add_argument('--explore_noise_scale', type=float, default=1.0)
# scale of the target-policy-smoothing noise used when computing TD targets
parser.add_argument('--eval_noise_scale', type=float, default=0.5)
parser.add_argument('--reward_scale', type=float, default=1.0)

args = parser.parse_args()

ALG_NAME = 'TD3'
ENV_ID = 'Pendulum-v0'  # environment id
RANDOM_SEED = 2  # random seed
  • Normal = tfp.distributions.Normal is the normal-distribution class from TensorFlow Probability; a concrete distribution object is created with normal = Normal(0, 1) (see the sketch below).
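A minimal sketch of how such a distribution object behaves (not part of the original script, just the tensorflow-probability API):

normal = Normal(0., 1.)           # standard normal distribution N(0, 1)
samples = normal.sample([3])      # tensor of three independent samples
log_p = normal.log_prob(0.0)      # log density at 0, roughly -0.9189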

2.class ReplayBuffer

  • The implementation is the same as in DQN (the code and explanation are at the end).
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = int((self.position + 1) % self.capacity)

    def sample(self, batch_size = args.batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
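A minimal usage sketch of the buffer (shapes assume Pendulum-v0, i.e. state_dim=3 and action_dim=1; this snippet is not part of the original script):

buffer = ReplayBuffer(capacity=1000)
for _ in range(200):
    s = np.random.randn(3).astype(np.float32)                 # dummy state
    a = np.random.uniform(-2, 2, size=1).astype(np.float32)   # dummy action
    buffer.push(s, a, 0.0, s, 0)                               # (state, action, reward, next_state, done)
state, action, reward, next_state, done = buffer.sample(batch_size=64)
print(state.shape, action.shape, reward.shape)                 # (64, 3) (64, 1) (64,)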

3.class QNetwork(Model)

  • QNetwork is used to approximate the action-value function Q(s, a) (see the shape sketch after this list).

  • QNetwork needs to implement two methods.

    • __init__: neural network initialization.
    • forward: links the input layer, hidden layers and output layer so that information is passed forward through the network. It must be overridden; it is the core of the model and defines how the individual layers are connected.
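For Pendulum-v0 (state_dim=3, action_dim=1) the network therefore maps a concatenated (state, action) batch of shape (batch, 4) to Q values of shape (batch, 1). A quick shape check (a sketch only, assuming the class defined below):

q_net = QNetwork(state_dim=3, action_dim=1, hidden_dim=64)
q_net.train()                                                   # TensorLayer models must be put in train/eval mode before the first call
q_in = tf.concat([tf.zeros([8, 3]), tf.zeros([8, 1])], axis=1)  # batch of 8 concatenated (s, a) pairs
print(q_net(q_in).shape)                                        # (8, 1)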
3.1.__init__
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
    super(QNetwork, self).__init__()
    input_dim = state_dim + action_dim
    w_init = tf.random_uniform_initializer(-init_w, init_w)

    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2')
    self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3')
3.2.forward
def forward(self, input):
    x = self.linear1(input)
    x = self.linear2(x)
    x = self.linear3(x)
    return x

4.class PolicyNetwork(Model)

  • PolicyNetwork is used to approximate the deterministic policy a = π(s).

  • The PolicyNetwork class needs to implement five methods.

    • __init__: neural network initialization.

    • forward: forward propagation.

    • evaluate: produces the action used when computing gradients. This action is never executed in the environment; it is typically the next action a_{t+1} given by the target policy.

    • get_action: produces the action used when interacting with the environment.

    • sample_action: samples an action at random.

4.1.__init__
def __init__(self, state_dim, action_dim, hidden_dim, action_range=1., init_w=3e-3):
    super(PolicyNetwork, self).__init__()
    w_init = tf.random_uniform_initializer(-init_w, init_w)

    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=state_dim, name='policy1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2')
    self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3')
    self.output_linear = Dense(
        n_units=action_dim, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
        in_channels=hidden_dim, name='policy_output'
    )
    self.action_range = action_range
    self.action_dim = action_dim
4.2.forward
def forward(self, state):
    x = self.linear1(state)
    x = self.linear2(x)
    x = self.linear3(x)
    output = tf.nn.tanh(self.output_linear(x))  # unit range output [-1, 1]
    return output
4.3.evaluate
def evaluate(self, state, eval_noise_scale):
    """
    generate action with state for calculating gradients;
    eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions.
        """
    state = state.astype(np.float32)
    action = self.forward(state)

    action = self.action_range * action

    # add noise
    normal = Normal(0, 1)
    noise = normal.sample(action.shape) * eval_noise_scale
    eval_noise_clip = 2 * eval_noise_scale
    noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip)
    action = action + noise
    return action
  • The evaluate method is used inside update(), where gradients are computed. We do not want actions that are too extreme and too far from what the target policy would actually output, so the noise is clipped; otherwise there would be a small probability of sampling very large or very small perturbations.
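For example, with the default eval_noise_scale of 0.5 the perturbation is always kept inside [-1, 1] (a small sketch of the clipping step, reusing the imports above):

noise_scale = 0.5                                        # eval_noise_scale
noise = Normal(0., 1.).sample([64, 1]) * noise_scale
noise = tf.clip_by_value(noise, -2 * noise_scale, 2 * noise_scale)
# every element now lies in [-1, 1], so the perturbed target action
# cannot drift arbitrarily far from the target policy's output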
4.4.get_action
def get_action(self, state, explore_noise_scale, greedy=False):
    """ generate action with state for interaction with environment """
    action = self.forward([state])
    action = self.action_range * action.numpy()[0]
    if greedy:
        return action
    # add noise
    normal = Normal(0, 1)
    noise = normal.sample(action.shape) * explore_noise_scale
    action += noise
    return action.numpy()
  • The get_action method is used when interacting with the environment. Here the noise is not clipped, which increases the agent's ability to explore.
4.5.sample_action
def sample_action(self):
    """ generate random actions for exploration """
    a = tf.random.uniform([self.action_dim], -1, 1)
    return self.action_range * a.numpy()

5.class TD3

  • The TD3 algorithm class implements six methods:
    • __init__: neural network initialization.
    • target_ini: target network initialization.
    • target_soft_update: soft update of the target networks.
    • update: update all network parameters.
    • saveModel: save the model.
    • loadModel: load the model.
5.1.__init__
  • Six networks need to be initialized: two different Q networks and one policy network, together with their corresponding target networks.
def __init__(
    self, state_dim, action_dim, action_range, hidden_dim, replay_buffer, policy_target_update_interval=1,
    q_lr=3e-4, policy_lr=3e-4
):
    self.replay_buffer = replay_buffer

    # initialize all networks
    self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    print('Q Network (1,2): ', self.q_net1)
    print('Policy Network: ', self.policy_net)

    # initialize weights of target networks
    self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1)
    self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2)
    self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net)

    # set train mode
    self.q_net1.train()
    self.q_net2.train()
    self.target_q_net1.eval()
    self.target_q_net2.eval()
    self.policy_net.train()
    self.target_policy_net.eval()

    self.update_cnt = 0
    self.policy_target_update_interval = policy_target_update_interval

    self.q_optimizer1 = tf.optimizers.Adam(q_lr)
    self.q_optimizer2 = tf.optimizers.Adam(q_lr)
    self.policy_optimizer = tf.optimizers.Adam(policy_lr)
5.2.target_ini
  • Hard update: directly copy the parameters of the neural network to the corresponding target network
def target_ini(self, net, target_net):
    """ hard-copy update for initializing target networks """
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
        target_param.assign(param)
    return target_net
5.3.target_soft_update
  • Soft update (Polyak averaging): θ⁻ ← (1 − τ)·θ⁻ + τ·θ. With τ = 0.01, each update moves the target parameters only 1% of the way toward the online parameters.
def target_soft_update(self, net, target_net, soft_tau):
    """ soft update the target net with Polyak averaging """
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
        target_param.assign(  # copy weight value into target parameters
            target_param * (1.0 - soft_tau) + param * soft_tau
        )
    return target_net
5.4.update
  • The update method does the following.
    • Sample batch_size transitions from the replay buffer.
    • Compute the next action with the target policy network (plus clipped smoothing noise).
      • As in DQN with Target, the target networks are used both to select the next action and to evaluate its value.
    • Normalize the rewards with the batch mean and standard deviation.
    • Take the smaller of the two target Q values and compute the TD target (written out after this list).
    • Update both Q networks from the TD error.
    • Update the policy network by gradient ascent on Q, delayed to once every policy_target_update_interval updates.
    • Soft update the target networks.
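Written out, the TD target that the code below computes is

    a' = π⁻(s') + clip(ε, -c, c),   ε ~ Normal(0, eval_noise_scale²),   c = 2 · eval_noise_scale
    y  = reward + (1 - done) · γ · min(Q1⁻(s', a'), Q2⁻(s', a'))

where π⁻, Q1⁻ and Q2⁻ are the target networks and reward is the batch-normalized, scaled reward; both online Q networks are then regressed toward the same y with a mean-squared-error loss.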
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """ update all networks in TD3 """
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

    reward = reward[:, np.newaxis]  # expand dim
    done = done[:, np.newaxis]

    new_next_action = self.target_policy_net.evaluate(
        next_state, eval_noise_scale=eval_noise_scale
    )  # clipped normal noise
    reward = reward_scale * (reward - np.mean(reward, axis=0)) / (
        np.std(reward, axis=0) + 1e-6
    )  # normalize with batch mean and std; plus a small number to prevent numerical problem

    # Training Q Function
    target_q_input = tf.concat([next_state, new_next_action], 1)  # the dim 0 is number of samples
    target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input))

    #TD target
    target_q_value = reward + (1 - done) * gamma * target_q_min  # if done==1, only reward
    q_input = tf.concat([state, action], 1)  # input of q_net

    with tf.GradientTape() as q1_tape:
        predicted_q_value1 = self.q_net1(q_input)
        q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value))
    q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights)
    self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights))

    with tf.GradientTape() as q2_tape:
        predicted_q_value2 = self.q_net2(q_input)
        q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value))
    q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights)
    self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights))

    # Training Policy Function
    if self.update_cnt % self.policy_target_update_interval == 0:
        with tf.GradientTape() as p_tape:
            # when updating the actor we do not add noise: the actor should find the action that maximizes Q, so adding noise here would serve no purpose
            new_action = self.policy_net.evaluate(
                state, eval_noise_scale=0.0
            )  # no noise, deterministic policy gradients
            new_q_input = tf.concat([state, new_action], 1)
            # """ implementation 1 """
            # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input))
            """ implementation 2 """
            predicted_new_q_value = self.q_net1(new_q_input)
            policy_loss = -tf.reduce_mean(predicted_new_q_value)
        p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
        self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))

        # Soft update the target nets
        self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
        self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
        self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)
5.5.saveModel
def saveModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if not os.path.exists(path):
        os.makedirs(path)
    extend_path = lambda s: os.path.join(path, s)
    tl.files.save_npz(self.q_net1.trainable_weights, extend_path('model_q_net1.npz'))
    tl.files.save_npz(self.q_net2.trainable_weights, extend_path('model_q_net2.npz'))
    tl.files.save_npz(self.target_q_net1.trainable_weights, extend_path('model_target_q_net1.npz'))
    tl.files.save_npz(self.target_q_net2.trainable_weights, extend_path('model_target_q_net2.npz'))
    tl.files.save_npz(self.policy_net.trainable_weights, extend_path('model_policy_net.npz'))
    tl.files.save_npz(self.target_policy_net.trainable_weights, extend_path('model_target_policy_net.npz'))
    print('Saved weights.')

5.6.loadModel
def loadModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if os.path.exists(path):
        print('Load TD3 Network parameters ...')
        extend_path = lambda s: os.path.join(path, s)
        tl.files.load_and_assign_npz(extend_path('model_q_net1.npz'), self.q_net1)
        tl.files.load_and_assign_npz(extend_path('model_q_net2.npz'), self.q_net2)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net1.npz'), self.target_q_net1)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net2.npz'), self.target_q_net2)
        tl.files.load_and_assign_npz(extend_path('model_policy_net.npz'), self.policy_net)
        tl.files.load_and_assign_npz(extend_path('model_target_policy_net.npz'), self.target_policy_net)
        print('Load weights!')
    else: 
        print("No model file find, please train model first...")

6. Main program

6.1. Convert frame image into gif function
def display_frames_as_gif(frames, path):
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    anim.save(path, writer='pillow', fps=30)
6.2.main function
if __name__ == '__main__':
    # initialization of env
    env = gym.make(ENV_ID)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = env.action_space.high  # scale action, [-action_range, action_range]

    # reproducible
    env.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # initialization of buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # initialization of trainer
    agent = TD3(
        state_dim, action_dim, action_range, args.hidden_dim, replay_buffer, args.delayed_update_itr, args.q_lr, args.policy_lr
    )
    t0 = time.time()

    # training loop
    agent.loadModel()
    if args.train:
        frame_idx = 0
        all_episode_reward = []

        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])
        agent.target_policy_net([state])

        for episode in range(args.train_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0

            for step in range(args.max_steps):
                if args.render:
                    env.render()
                if frame_idx > args.explore_steps:
                    action = agent.policy_net.get_action(state, args.explore_noise_scale)
                else:
                    action = agent.policy_net.sample_action()

                next_state, reward, done, _ = env.step(action)
                next_state = next_state.astype(np.float32)
                done = 1 if done is True else 0

                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                frame_idx += 1

                if len(replay_buffer) > args.batch_size:
                    for i in range(args.update_itr):
                        agent.update(args.batch_size, args.eval_noise_scale, args.reward_scale, args.gamma, args.tau)
                if done:
                    break
            if episode == 0:
                all_episode_reward.append(episode_reward)
            else:
                all_episode_reward.append(all_episode_reward[-1] * 0.9 + episode_reward * 0.1)
            print(
                'Training  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                    episode + 1, args.train_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # save the model every 100 episodes
            if episode % 100 == 0:
                agent.saveModel()

        plt.plot(all_episode_reward)
        if not os.path.exists('image'):
            os.makedirs('image')
        plt.savefig(os.path.join('image', '_'.join([ALG_NAME, ENV_ID])))
    else:
        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])

        for episode in range(args.test_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0
            frames = []
            for step in range(args.max_steps):
                env.render()
                frames.append(env.render(mode='rgb_array'))

                action = agent.policy_net.get_action(state, args.explore_noise_scale, greedy=True)
                state, reward, done, info = env.step(action)
                state = state.astype(np.float32)
                episode_reward += reward
                if done:
                    break
            print(
                'Testing  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                    episode + 1, args.test_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # Save this game as gif
            if args.save_gif:
                dir_path = os.path.join('testVideo', '_'.join([ALG_NAME, ENV_ID]))
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                display_frames_as_gif(frames, os.path.join(dir_path, str(episode) + ".gif"))

    env.close()

Training results

Reward curve after 2000 training episodes.


DQN with Target code implementation

Topics: neural networks, TensorFlow, Deep Learning