Saturday, February 17, 2018

Reinforcement learning of Atari (Breakout)





First, install the dependencies: gym (with the Atari environments), keras, keras-rl, Pillow, numpy, and h5py.
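For example, with pip (the exact package names and versions may differ in your environment; this just covers what the script below imports, plus h5py for saving weights): "pip install numpy gym[atari] keras keras-rl Pillow h5py".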

This is the code for reinforcement learning on Atari Breakout. Save it as "atari.py":
from __future__ import division
import argparse
import numpy as np
import gym
from gym import wrappers
import os.path
import pickle

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K
from PIL import Image
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

ENV_NAME = 'BreakoutDeterministic-v4'
INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4
weights_filename = 'dqn_weights.h5f'

class AtariProcessor(Processor):
    def process_observation(self, observation):
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        img = img.resize(INPUT_SHAPE).convert('L')  # resize and convert to grayscale
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_state_batch(self, batch):
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)

# Create the Breakout environment; the Monitor wrapper records episode videos and
# statistics to ./breakout (force=True overwrites the results of any previous run).
env = gym.make(ENV_NAME)
env = wrappers.Monitor(env, './breakout', force=True)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
def create_model(input_shape):
    model = Sequential()
    # The input arrives as (WINDOW_LENGTH, height, width); Permute reorders it to
    # (height, width, WINDOW_LENGTH) for a channels-last (TensorFlow) backend.
    model.add(Permute((2, 3, 1), input_shape=input_shape))
    model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    return model

model = create_model(input_shape)
print(model.summary())


# Experience replay buffer: keeps the last 1,000,000 transitions and serves them
# back as stacks of WINDOW_LENGTH (4) consecutive frames.
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

processor = AtariProcessor()
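# Anneal the epsilon-greedy exploration rate linearly from 1.0 down to 0.1 over the
# first 1,000,000 steps; use a fixed epsilon of 0.05 during testing.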
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000)
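# DQN hyperparameters: fill the replay memory for 50,000 steps before learning starts,
# discount future rewards with gamma=0.99, update the target network every 10,000 steps,
# run one training update every 4 steps, and clip the error term (delta) at 1.0.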
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

log_filename = 'dqn_{}_log.json'.format(ENV_NAME)

parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'test'], default='train')
args = parser.parse_args()

if args.mode == 'test':
    if os.path.isfile(weights_filename):
        dqn.load_weights(weights_filename)  # load the trained weights before testing
    dqn.test(env, nb_episodes=10, visualize=True)
else:
    checkpoint_weights_filename = 'dqn_weights_{step}.h5f'
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
    callbacks += [FileLogger(log_filename, interval=100000)]
    if os.path.isfile(weights_filename):
        print('\n\n\n\nSaved parameters found. Loading weights from\n' + weights_filename + '\n\n\n\n')
        dqn.load_weights(weights_filename)
    else:
        print('\n\n\n\nSaved parameters not found. Creating a new one...\n\n\n\n')
    dqn.fit(env, callbacks=callbacks, nb_steps=4000000, log_interval=50000, visualize=True, verbose=1)
    dqn.save_weights(weights_filename, overwrite=True)

To train this model, run "python3.5 atari.py".
To test your trained model, run "python3.5 atari.py --mode test".

Also, as of May 2018, keras-rl does not seem to provide a way to save training state such as the replay memory and accumulated rewards. Even if you save the weights, reloading them will not reproduce the exact state from before saving, so you need to keep the process running for the agent to keep improving. To lengthen training, increase nb_steps in the call to fit.
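
If you still want to experiment with persisting the replay memory yourself, one rough, untested workaround is to pickle the memory object: load it before constructing the DQNAgent and dump it again after fit returns. This is only a sketch; it assumes SequentialMemory can be pickled as-is, the file can easily reach several gigabytes, and the agent's step counter and annealing schedule are still not restored. The script already imports pickle and os.path, so no extra imports are needed:

# Hypothetical workaround: serialize the replay memory with pickle.
# This is not a keras-rl feature; it just dumps the SequentialMemory object.
MEMORY_FILENAME = 'dqn_memory.pkl'

if os.path.isfile(MEMORY_FILENAME):
    with open(MEMORY_FILENAME, 'rb') as f:
        memory = pickle.load(f)  # resume with the previously saved replay memory
else:
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

# ... build and fit the DQNAgent with this memory object as before ...

with open(MEMORY_FILENAME, 'wb') as f:
    pickle.dump(memory, f)  # save the replay memory again after training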

If you want something that can save & load weights, use this:
https://noteoneverything.blogspot.jp/2018/05/reinforcement-learning-with-tensorflow.html

References


pathway's comment
https://github.com/keras-rl/keras-rl/issues/186

matthiasplappert/keras-rl examples (latest commit 3dcd547 on Nov 30, 2017)
https://github.com/matthiasplappert/keras-rl/tree/master/examples
Visited Feb 17, 2018