From a07b9913b86bf9c41c883c861a6b8ba10ba7b3b5 Mon Sep 17 00:00:00 2001
From: AGKhalil
Date: Thu, 18 Jul 2019 04:47:33 +0100
Subject: [PATCH] hand-crafted curriculum. Also plotting is much better

---
 automatic_cl.py |  95 ++++++++++++++++++++++++++++++++--
 train_agent.py  | 132 +++++++++++++++++++++++++++++++++++-------------
 2 files changed, 188 insertions(+), 39 deletions(-)

diff --git a/automatic_cl.py b/automatic_cl.py
index 0416d96..599a8d5 100644
--- a/automatic_cl.py
+++ b/automatic_cl.py
@@ -18,14 +18,103 @@ from stable_baselines.results_plotter import load_results, ts2xy
 import xml.etree.ElementTree as ET
 
+best_mean_reward, n_steps = -np.inf, 0
+
+def callback(_locals, _globals):
+    """
+    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
+    :param _locals: (dict)
+    :param _globals: (dict)
+    """
+    global n_steps, best_mean_reward
+    # Print stats every 1000 calls
+    if (n_steps + 1) % 1000 == 0:
+        # Evaluate policy training performance
+        x, y = ts2xy(load_results(log_dir), 'timesteps')
+        if len(x) > 0:
+            mean_reward = np.mean(y[-100:])
+            print(x[-1], 'timesteps')
+            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
+
+            # New best model, you could save the agent here
+            if mean_reward > best_mean_reward:
+                best_mean_reward = mean_reward
+                # Example for saving best model
+                print("Saving new best model")
+                _locals['self'].save(log_dir + 'best_model_prof.pkl')
+    n_steps += 1
+    # Returning False will stop training early
+    return True
+
+
+def moving_average(values, window):
+    """
+    Smooth values by doing a moving average
+    :param values: (numpy array)
+    :param window: (int)
+    :return: (numpy array)
+    """
+    weights = np.repeat(1.0, window) / window
+    return np.convolve(values, weights, 'valid')
+
+
+def plot_results(log_folder, model_name, plt_dir, title='Learning Curve'):
+    """
+    Plot the results
+
+    :param log_folder: (str) the save location of the results to plot
+    :param title: (str) the title of the task to plot
+    """
+    m_name_csv = model_name + ".csv"
+    old_file_name = os.path.join(log_folder, "monitor.csv")
+    new_file_name = os.path.join(log_folder, m_name_csv)
+    save_name = os.path.join(plt_dir, model_name)
+
+    x, y = ts2xy(load_results(log_folder), 'timesteps')
+    shutil.copy(old_file_name, new_file_name)
+    y = moving_average(y, window=1)
+    # Truncate x
+    x = x[len(x) - len(y):]
+
+    fig = plt.figure(title)
+    plt.plot(x, y)
+    plt.xlabel('Number of Timesteps')
+    plt.ylabel('Rewards')
+    plt.title(title + " Smoothed")
+    print('Saving plot at:', save_name)
+    plt.savefig(save_name + ".png")
+    plt.savefig(save_name + ".eps")
+    print("plots saved...")
+
+models_dir = os.path.join(os.path.dirname(
+    os.path.realpath(__file__)), "prof/models/")
+models_tmp_dir = os.path.join(os.path.dirname(
+    os.path.realpath(__file__)), "prof/models_tmp/")
+log_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "prof/tmp")
+gif_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "prof/tmp_gif/")
+plt_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "prof/plot")
+os.makedirs(log_dir, exist_ok=True)
+os.makedirs(gif_dir, exist_ok=True)
+os.makedirs(models_dir, exist_ok=True)
+os.makedirs(models_tmp_dir, exist_ok=True)
+os.makedirs(plt_dir, exist_ok=True)
+
+step_total = 1000
 env_name = 'CurriculumLearning-v0'
 save_path = os.path.dirname(
     os.path.realpath(__file__))
 env = gym.make(env_name)
 env.save_path = save_path
 n_cpu = 8
-# env = Monitor(env, log_dir, allow_early_resets=True)
-# env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
+env = Monitor(env, log_dir, allow_early_resets=True)
 env = DummyVecEnv([lambda: env])
 model = PPO2(MlpPolicy, env, verbose=1)
-model.learn(total_timesteps=100000)
+model.learn(total_timesteps=step_total)
+
+stamp = ' {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
+model_name = "Prof" + "_" + \
+    str(step_total) + "_" + stamp
+model_loc = os.path.join(models_dir, model_name)
+model.save(model_loc)
+
+plot_results(log_dir, model_name, plt_dir)
\ No newline at end of file
diff --git a/train_agent.py b/train_agent.py
index 2d0aab3..69c8ed6 100644
--- a/train_agent.py
+++ b/train_agent.py
@@ -18,13 +18,13 @@ import xml.etree.ElementTree as ET
 best_mean_reward, n_steps, old_steps, total_gif_time = -np.inf, 0, 0, 0
-step_total = 50000
+step_total = 250000
 if step_total >= 1000000:
-    n_gifs = 5
+    n_gifs = 2
 else:
     n_gifs = 2
-log_incs = np.round((step_total / n_gifs) * 60 / 60000)
+log_incs = np.round((step_total / n_gifs) / 2560)
 env_name = 'Real-v0'
 
 ##############################################Functions###################
@@ -126,6 +126,7 @@ def plot_results(log_folder, model_name, plt_dir, title='Learning Curve'):
 
 def alter_leg(leg_length):
     xml_path = os.path.join(gym_real.__path__[0], "envs/assets/real.xml")
+    print(xml_path)
     tree = ET.parse(xml_path)
     root = tree.getroot()
 
@@ -134,7 +135,7 @@ def alter_leg(leg_length):
         print(geom.get("fromto"))
 
     for pos in root.findall("worldbody/body/[@name='torso']"):
-        pos.set("pos", "0 0 " + str(abs(leg_length) + 0.7))
+        pos.set("pos", "-10.0 0 " + str(abs(leg_length) + 0.7))
         print(pos.get('pos'))
 
     tree.write(xml_path)
@@ -150,53 +151,112 @@ def alter_leg(leg_length):
 log_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp")
 gif_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp_gif/")
 plt_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "plot")
+# tensor_dir = os.path.join(os.path.dirname(
+#     os.path.realpath(__file__)), "tensorboard/")
 os.makedirs(log_dir, exist_ok=True)
 os.makedirs(gif_dir, exist_ok=True)
 os.makedirs(models_dir, exist_ok=True)
 os.makedirs(models_tmp_dir, exist_ok=True)
 os.makedirs(plt_dir, exist_ok=True)
+# os.makedirs(tensor_dir, exist_ok=True)
+
+# print(tensor_dir)
+
+# alter_leg(-0.1)
+
 
 # Create and wrap the environment
 env = gym.make(env_name)
 # env = Monitor(env, log_dir, allow_early_resets=True)
 # env = DummyVecEnv([lambda: env])
-
 # multiprocess environment
-n_cpu = 8
+n_cpu = 20
 env = Monitor(env, log_dir, allow_early_resets=True)
 env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
 
 # Add some param noise for exploration
-alter_leg(-5.0)
-
-model = PPO2(MlpPolicy, env, verbose=1)
-start = time.time()
-model.learn(total_timesteps=step_total, callback=callback)
-end = time.time()
-
-# del model
-
-alter_leg(-0.3)
-
-model = PPO2(MlpPolicy, env, verbose=1)
-start = time.time()
-model.learn(total_timesteps=step_total, callback=callback)
-end = time.time()
+# alter_leg(-5.0)
+lengths = [i * -0.1 for i in range(1, 10)]
+model_created = False
+
+print(lengths)
+counter = 0
+all_x = []
+all_y = []
+vert_x = []
+
+for i in lengths:
+    counter += 1
+    alter_leg(i)
+    env = gym.make(env_name)
+    n_cpu = 20
+    env = Monitor(env, log_dir, allow_early_resets=True)
+    env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
+    if not model_created:
+        # , tensorboard_log="./a2c_cartpole_tensorboard/
+        model = PPO2(MlpPolicy, env, verbose=1)
+    else:
+        model = PPO2.load(model_loc, env=env)
+    start = time.time()
+
+    model.learn(total_timesteps=step_total)
+    model_loc = os.path.join(models_dir, 'hand')
+
+    x, y = ts2xy(load_results(log_dir), 'timesteps')
+    y = moving_average(y, window=50)
+    x = x[len(x) - len(y):]
+    for i in x:
+        if model_created:
+            all_x.append(i + vert_x[-1])
+            appended_val = x[-1] + vert_x[-1]
+        else:
+            all_x.append(i)
+            appended_val = x[-1]
+
+    vert_x.append(appended_val)
+    for i in y:
+        all_y.append(i)
+    os.remove(os.path.join(log_dir, "monitor.csv"))
+
+    model.save(model_loc)
+    env.close()
+    model_created = True
+    del env
+    del model
+    end = time.time()
 training_time = end - start - total_gif_time
-
-stamp = ' {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
-model_name = "PPO2_" + env_name + "_" + \
-    str(step_total) + "_" + stamp + "_" + str(training_time)
-model_loc = os.path.join(models_dir, model_name)
-print(model_loc)
-model.save(model_loc)
-
-print("Training time:", training_time)
-print("model saved as: " + model_name)
-
-plot_results(log_dir, model_name, plt_dir)
-
-del model  # remove to demonstrate saving and loading
+print(counter)
+print(lengths)
+print(all_x)
+print(all_y)
+print(vert_x)
+
+save_name = os.path.join(plt_dir, 'hand' + str(step_total))
+
+fig = plt.figure('hand' + str(step_total))
+plt.plot(all_x, all_y)
+for i in vert_x:
+    plt.axvline(x=i, linestyle='--', color='#ccc5c6', label='leg increment')
+plt.xlabel('Number of Timesteps')
+plt.ylabel('Rewards')
+plt.title('hand' + " Smoothed")
+plt.savefig(save_name + ".png")
+plt.savefig(save_name + ".eps")
+print("plots saved...")
+plt.show()
+
+# stamp = ' {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
+# model_name = "PPO2_" + env_name + "_" + \
+#     str(step_total) + "_" + stamp + "_" + str(training_time)
+# model_loc = os.path.join(models_dir, model_name)
+# print(model_loc)
+# model.save(model_loc)
+
+# print("Training time:", training_time)
+# print("model saved as: " + model_name)
+
+# plot_results(log_dir, 'hand', plt_dir)
+
+# del model  # remove to demonstrate saving and loading
 env = gym.make(env_name)
 
 # Enjoy trained agent
@@ -206,5 +266,5 @@ def alter_leg(leg_length):
 print("********************************************************************")
 while watch_agent == "y" or "Y":
     subprocess.Popen(
-        '''export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-410/libGL.so; python load_agent.py '%s' '%s' ''' % (env_name, model_name), shell=True)
+        '''export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-410/libGL.so; python load_agent.py '%s' '%s' ''' % (env_name, 'hand'), shell=True)
     watch_agent = input("Do you want to watch your sick gaits? (Y/n):")
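
Note on the stitched learning curve: the curriculum loop in train_agent.py offsets each stage's timesteps by the previous stage boundary (all_x, all_y, vert_x) so the per-leg-length monitor results form one continuous curve with dashed markers at each leg increment. The sketch below is a minimal, self-contained illustration of that stitching logic only, not part of the patch: synthetic reward data stands in for ts2xy(load_results(log_dir), 'timesteps'), and the stage count, episode count, and smoothing window are arbitrary.

import numpy as np
import matplotlib.pyplot as plt


def moving_average(values, window):
    # Simple moving average, as defined in the patch.
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


all_x, all_y, vert_x = [], [], []

# Four synthetic curriculum stages stand in for the per-leg-length training runs.
for stage in range(4):
    # Fake (timesteps, rewards) in place of the monitor.csv results.
    x = np.cumsum(np.random.randint(100, 200, size=300))
    y = np.random.randn(300) + stage
    y = moving_average(y, window=50)
    x = x[len(x) - len(y):]  # truncate x to match the smoothed y

    # Offset this stage's timesteps by the previous stage boundary so the
    # stitched curve keeps increasing, then record the new boundary.
    offset = vert_x[-1] if vert_x else 0
    all_x.extend(x + offset)
    all_y.extend(y)
    vert_x.append(x[-1] + offset)

plt.plot(all_x, all_y)
for boundary in vert_x:
    plt.axvline(x=boundary, linestyle='--', color='#ccc5c6')
plt.xlabel('Number of Timesteps')
plt.ylabel('Rewards')
plt.title('Stitched curriculum learning curve (synthetic data)')
plt.show()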