[Baidu AI Reinforcement Learning Series] IV. Solving RL with Policy Gradients (Solving Pong with PG)

Contents

Policy Gradient


In reinforcement learning there are two families of methods: value-based and policy-based. Policy Gradient (PG) belongs to the latter. Unlike DQN, PG does not need to fit a Q value; it directly fits the Agent's next action, much like an end-to-end neural network with no intermediate step, as shown in the figure below.
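To make the contrast concrete (my own summary, not from the course material): a value-based method such as DQN learns an action-value function $Q_\theta(s,a)$ and picks the action with the largest value, while a policy-based method learns the action distribution $\pi_\theta(a \mid s)$ directly and samples from it:

$$a = \arg\max_{a} Q_\theta(s, a) \quad \text{(value-based)} \qquad\qquad a \sim \pi_\theta(a \mid s) \quad \text{(policy-based)}$$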

Formula derivation

Expected return

In one episode, the Agent's trajectory is the sequence $\tau = \{s_1, a_1, s_2, a_2, \dots, s_T, a_T\}$, where $s_t$ is the state at step $t$ and $a_t$ is the action taken at step $t$.


$\pi_\theta(a_t \mid s_t)$ is the probability, predicted by the network, that the Agent takes action $a_t$ in state $s_t$.
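The expected-return formulas themselves were images in the original post; a reconstruction of the standard expressions they correspond to:

$$p_\theta(\tau) = \rho(s_1)\prod_{t=1}^{T}\pi_\theta(a_t \mid s_t)\,\rho(s_{t+1} \mid s_t, a_t), \qquad \bar{R}_\theta = \mathbb{E}_{\tau \sim p_\theta(\tau)}\big[R(\tau)\big] = \sum_{\tau} R(\tau)\,p_\theta(\tau)$$

where $R(\tau)=\sum_{t=1}^{T} r_t$ is the total reward collected along the trajectory, $\rho(s_1)$ is the initial-state distribution, and $\rho(s_{t+1} \mid s_t, a_t)$ is the environment's transition probability.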

Optimization objective and policy gradient


In formula ③, the terms $\rho(s_1)$ and $\rho(s_{t+1} \mid s_t, a_t)$ do not depend on $\theta$, so they drop out when taking the gradient with respect to $\theta$, which yields formula ④.
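Formulas ① to ④ were likewise images in the original post; the derivation they summarize is the standard log-derivative trick (a reconstruction, not a copy of the slides):

$$\nabla_\theta \bar{R}_\theta = \sum_{\tau} R(\tau)\,\nabla_\theta p_\theta(\tau) = \sum_{\tau} R(\tau)\, p_\theta(\tau)\, \nabla_\theta \log p_\theta(\tau) = \mathbb{E}_{\tau \sim p_\theta(\tau)}\Big[R(\tau)\sum_{t=1}^{T}\nabla_\theta \log \pi_\theta(a_t \mid s_t)\Big]$$

In practice the expectation is approximated with sampled episodes, which is exactly what the training code below does: play an episode, compute the discounted and normalized returns, and use them to weight $\nabla_\theta \log \pi_\theta(a_t \mid s_t)$.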

Project exercise (PG solving Pong)

The course and exercises are provided by Baidu AI's 7-day check-in bootcamp (course link).

Exercise requirements

The score should rise gradually from -21 and converge. The Test reward should converge above 0 (meaning the agent beats the built-in opponent); the higher, the better.

Network structure

At the time of writing I am still busy debugging. Judging from other people's solutions, most of them only converge to a good score after quite a few rounds of training. I want to keep experimenting to see what can speed up convergence, and I will post my own code afterwards. For now, here is the instructor's network structure:

# Imports needed by the code in this post (PARL 1.x with the PaddlePaddle fluid API)
import os
import gym
import numpy as np
import parl
from parl import layers                     # thin wrappers around fluid layers
from paddle import fluid
from parl.utils import logger
from parl.algorithms import PolicyGradient  # PARL's built-in PG algorithm

LEARNING_RATE = 1e-3  # assumed value; the constant is used in main() but not shown in the post


class Model(parl.Model):
    def __init__(self, act_dim):
        hid1_size = 256
        hid2_size = 64

        # two ReLU hidden layers followed by a softmax over the action space
        self.fc1 = layers.fc(size=hid1_size, act='relu')
        self.fc2 = layers.fc(size=hid2_size, act='relu')
        self.fc3 = layers.fc(size=act_dim, act='softmax')

    def forward(self, obs):
        h1 = self.fc1(obs)
        h2 = self.fc2(h1)
        out = self.fc3(h2)
        return out
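The algorithm class that ties this Model to the Agent is not shown in the post (it ships with PARL and is instantiated as `PolicyGradient` in main() below). As a rough sketch of what such an algorithm computes under the fluid API used above — an approximation for illustration, not PARL's actual source:

class MyPolicyGradient(parl.Algorithm):
    def __init__(self, model, lr):
        self.model = model
        self.lr = lr

    def predict(self, obs):
        return self.model(obs)  # action probabilities pi_theta(a|s)

    def learn(self, obs, action, reward):
        act_prob = self.model(obs)  # shape [N, act_dim]
        # pick out -log pi_theta(a_t|s_t) with a one-hot mask over the actions
        action_onehot = layers.one_hot(action, act_prob.shape[1])
        neg_log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * action_onehot, dim=1)
        # weighting -log pi by the return and minimizing the mean is gradient
        # ascent on the policy-gradient objective derived above
        cost = layers.reduce_mean(neg_log_prob * reward)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost

The sketch is only meant to show where the loss comes from; the training script below uses PARL's own PolicyGradient class.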

Agent:

class Agent(parl.Agent):
    def __init__(self, algorithm, obs_dim, act_dim):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):  # Build the prediction graph: define the obs input and fetch the action probabilities
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs)

        with fluid.program_guard(
                self.learn_program):  # Build the learning graph: define the obs/act/reward inputs and the cost used to update the policy network
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs, act, reward)

    def sample(self, obs):
        obs = np.expand_dims(obs, axis=0)  # Add one dimension
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        act_prob = np.squeeze(act_prob, axis=0)  # Reduce one dimension
        act = np.random.choice(range(self.act_dim), p=act_prob)  # Select actions according to action probability
        return act

    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        act_prob = np.squeeze(act_prob, axis=0)
        act = np.argmax(act_prob)  # Select the action with the highest probability according to the action probability
        return act

    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int64'),
            'reward': reward.astype('float32')
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
        return cost

Training settings:

def run_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
        obs_list.append(obs)
        action = agent.sample(obs)
        action_list.append(action)

        obs, reward, done, info = env.step(action)
        reward_list.append(reward)

        if done:
            break
    return obs_list, action_list, reward_list


# Evaluate the agent: run 5 episodes and average the total reward
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
            action = agent.predict(obs)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


def preprocess(image):
    """ Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1-D float vector """
    image = image[35:195]  # crop out the scoreboard and bottom border
    image = image[::2, ::2, 0]  # downsample by a factor of 2 and keep one color channel
    image[image == 144] = 0  # erase background (type 1)
    image[image == 109] = 0  # erase background (type 2)
    image[image != 0] = 1  # everything else (paddles, ball) is set to 1
    return image.astype(float).ravel()
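A quick sanity check of the preprocessing shapes (a small sketch using a dummy frame, not part of the original code):

dummy_frame = np.zeros((210, 160, 3), dtype=np.uint8)  # stand-in for one raw Atari observation
vec = preprocess(dummy_frame)
print(vec.shape)  # (6400,), which matches obs_dim = 80 * 80 in main()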


def calc_reward_to_go(reward_list, gamma=0.99):
    """calculate discounted reward"""
    reward_arr = np.array(reward_list)
    for i in range(len(reward_arr) - 2, -1, -1):
        # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
        reward_arr[i] += gamma * reward_arr[i + 1]
    # normalize episode rewards
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr
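A small worked example (my own illustration): for the reward list [0, 0, 1] with gamma = 0.99, the backward pass yields returns [0.9801, 0.99, 1.0], which are then standardized to zero mean and unit variance:

returns = calc_reward_to_go([0.0, 0.0, 1.0])
# pre-normalization values: [0.9801, 0.99, 1.0]; the returned array is the standardized version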


def main():
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Building agent based on parl framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load model
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
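The post's script ends here; to run it directly one still needs the usual entry point (my addition, not from the original):

if __name__ == '__main__':
    main()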

Instructor's solutions on GitHub

There are several other hands-on projects in it (very useful): Practical project instructor solutions. Go check it out and give it a star!
