# [Baidu AI reinforcement learning series] IV. Solving RL with the policy gradient (solving Pong with PG). In reinforcement learning there are two families of methods: value-based and policy-based. Policy Gradient (PG) belongs to the latter. Unlike DQN, PG does not need to fit a Q value; it directly outputs the agent's next action, like an end-to-end neural network with no intermediate step, as shown in the figure below. ## Formula derivation

### Expected return

In an episode, the activity of an agent is described by a trajectory $\tau = \{s_1, a_1, s_2, a_2, \dots, s_T, a_T\}$, where $s_t$ is the state at step $t$ and $a_t$ is the action taken at step $t$. $\pi_\theta(a_t \mid s_t)$ is the probability, predicted by the network, that the agent takes action $a_t$ in state $s_t$.

### Optimization objective and policy gradient ③ Here $\rho(s_1)$ and $\rho(s_{t+1} \mid s_t, a_t)$ do not depend on $\theta$, so they drop out when taking the derivative with respect to $\theta$, which yields formula ④.

# Project exercise (PG solving Pong)

Courses and exercises are provided by Baidu AI's 7-day check-in training camp (course links).

## practical combat requirement

The score gradually rises and converges starting from -21. The Test reward score can converge to above 0 (indicating the opponent has been defeated); the higher the score, the better.

## network structure

At the time of writing this article, I was still debugging crazily. With reference to other solutions, it is estimated that most of the solutions converged to better results after several rounds of training. I want to continue to try and see what can be done to speed up the convergence. Then put your own code out. Now I post the teacher's network structure:

class Model(parl.Model):
    """Policy network: two ReLU hidden layers and a softmax output over actions.

    Layer sizes follow the tutorial: 256 -> 64 -> act_dim.
    """

    def __init__(self, act_dim):
        # FIX: the original had a no-op local assignment `act_dim = act_dim`;
        # the parameter is used directly below, so the line is removed.
        hid1_size = 256
        hid2_size = 64

        self.fc1 = layers.fc(size=hid1_size, act='relu')
        self.fc2 = layers.fc(size=hid2_size, act='relu')
        # Softmax so the output is a probability distribution over actions.
        self.fc3 = layers.fc(size=act_dim, act='softmax')

    def forward(self, obs):
        """Return the action-probability distribution for observation `obs`."""
        h1 = self.fc1(obs)
        h2 = self.fc2(h1)
        out = self.fc3(h2)
        return out


Agent:

class Agent(parl.Agent):
    """Policy-gradient agent: builds fluid programs for prediction and learning."""

    def __init__(self, algorithm, obs_dim, act_dim):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        """Build the prediction and learning computation graphs."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Prediction graph: obs -> action probabilities.
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs)

        # Learning graph: update the policy network from (obs, act, reward).
        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            # FIX: the original line was `shape=,` (a syntax error). Each
            # action is a single int64 index (learn() expands it to shape
            # (batch, 1)), so the per-sample feature shape is [1].
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs, act, reward)

    def sample(self, obs):
        """Sample an action stochastically from the predicted distribution."""
        obs = np.expand_dims(obs, axis=0)  # add a batch dimension
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])
        act_prob = np.squeeze(act_prob, axis=0)  # drop the batch dimension
        # Draw an action according to the probability distribution.
        act = np.random.choice(range(self.act_dim), p=act_prob)
        return act

    def predict(self, obs):
        """Greedy action: pick the highest-probability action (for evaluation)."""
        obs = np.expand_dims(obs, axis=0)
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])
        act_prob = np.squeeze(act_prob, axis=0)
        act = np.argmax(act_prob)  # choose the most probable action
        return act

    def learn(self, obs, act, reward):
        """Run one policy-gradient update; returns the training cost."""
        act = np.expand_dims(act, axis=-1)  # (batch,) -> (batch, 1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int64'),
            'reward': reward.astype('float32')
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])
        return cost


Training settings:

def run_episode(env, agent):
    """Play one episode with stochastic sampling.

    Returns three parallel lists: preprocessed observations, the actions
    sampled for them, and the rewards received after each action.
    """
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    done = False
    while not done:
        # Raw (210, 160, 3) frame -> flat float vector for the network.
        processed = preprocess(obs)
        obs_list.append(processed)

        action = agent.sample(processed)
        action_list.append(action)

        obs, reward, done, info = env.step(action)
        reward_list.append(reward)
    return obs_list, action_list, reward_list

# Evaluate the agent: run 5 episodes and average the total rewards.
def evaluate(env, agent, render=False):
    """Greedy evaluation over 5 episodes; returns the mean episode reward."""
    eval_reward = []
    for _ in range(5):
        obs = env.reset()
        episode_reward = 0
        is_over = False
        while not is_over:
            # Raw (210, 160, 3) frame -> flat float vector for the network.
            frame = preprocess(obs)
            obs, reward, is_over, _ = env.step(agent.predict(frame))
            episode_reward += reward
            if render:
                env.render()
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)

def preprocess(image):
    """Preprocess a 210x160x3 uint8 Atari frame into a 6400-d (80x80) float vector.

    Crops the play area, downsamples by 2, erases the two background colors,
    and binarizes everything else to 1.
    """
    image = image[35:195]  # crop to the 160-row play area
    image = image[::2, ::2, 0]  # downsample by 2, keep one channel -> 80x80
    # FIX: the slices above are views into the caller's frame; copy before the
    # in-place masked assignments so we don't mutate the input array.
    image = image.copy()
    image[image == 144] = 0  # erase background type 1
    image[image == 109] = 0  # erase background type 2
    image[image != 0] = 1  # binarize: everything non-background -> 1
    # FIX: np.float was removed in NumPy 1.24; builtin float == float64.
    return image.astype(float).ravel()

def calc_reward_to_go(reward_list, gamma=0.99):
    """Compute normalized discounted returns G_t for one episode.

    Args:
        reward_list: per-step rewards, in time order.
        gamma: discount factor.

    Returns:
        float64 array of the same length, normalized to zero mean and
        (when non-constant) unit standard deviation.
    """
    # FIX: force float64 — with an int reward list, np.array() would build an
    # int array and the in-place `+=` below would truncate the discounted sums.
    reward_arr = np.array(reward_list, dtype=np.float64)
    for i in range(len(reward_arr) - 2, -1, -1):
        # G_t = r_t + gamma * G_{t+1}
        reward_arr[i] += gamma * reward_arr[i + 1]
    # Normalize episode returns to reduce gradient variance.
    reward_arr -= np.mean(reward_arr)
    std = np.std(reward_arr)
    if std > 0:  # FIX: guard against constant returns (division by zero -> NaN)
        reward_arr /= std
    return reward_arr

def main():
    """Train a PG agent on Pong for 1000 episodes, evaluating every 100."""
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80  # preprocess() output size (80x80 binary frame)
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent on top of the parl framework.
    model = Model(act_dim=act_dim)
    # FIX: `alg` was referenced but never created (and `model` was unused),
    # which raises NameError. Construct the PolicyGradient algorithm here.
    # NOTE(review): lr=1e-3 follows the PARL tutorial default — confirm.
    alg = parl.algorithms.PolicyGradient(model, lr=1e-3)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')



## Teacher solutions Github

There are several other practical projects in it (very useful), Practical project teacher solution You can go in and have a star!

Tags: network github

Posted on Thu, 25 Jun 2020 23:42:04 -0400 by laeelin