From 52f82cf2cff2d7811ec4998edc318d82fb766566 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Mon, 15 Mar 2021 15:55:13 +0200 Subject: [PATCH 01/16] trying to create a zoo. this can also be used for unit testing. logging to wandb the config file used to run the experiment as well as the ALL config file. removed a bug with calculating discounted rewards. --- requirements.txt | 1 + squiRL/a2c/a2c.py | 4 ++- squiRL/a2c/config_file.json | 13 ++++++++ squiRL/ppo/config_file.json | 15 +++++++++ squiRL/ppo/ppo.py | 12 +++---- squiRL/vpg/config_file.json | 66 ++++--------------------------------- squiRL/vpg/vpg.py | 3 +- train.py | 7 +++- 8 files changed, 53 insertions(+), 68 deletions(-) create mode 100644 squiRL/a2c/config_file.json create mode 100644 squiRL/ppo/config_file.json diff --git a/requirements.txt b/requirements.txt index 9fe83cd..f629f32 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ torchtext==0.4.0 torchvision==0.7.0 typing==3.7.4.1 typing-extensions==3.7.4.1 +wandb==0.10.22 diff --git a/squiRL/a2c/a2c.py b/squiRL/a2c/a2c.py index 5ec2cfb..d072aa2 100644 --- a/squiRL/a2c/a2c.py +++ b/squiRL/a2c/a2c.py @@ -50,6 +50,9 @@ def __init__(self, hparams: argparse.Namespace) -> None: self.actor = reg_policies[self.hparams.policy](obs_size, n_actions) self.critic = reg_policies[self.hparams.policy](obs_size, 1) + if hparams.logger: + hparams.logger.watch(self.actor) + hparams.logger.watch(self.critic) self.replay_buffer = RolloutCollector(self.hparams.episodes_per_batch) self.agent = Agent(self.env, self.replay_buffer) @@ -120,7 +123,6 @@ def a2c_loss( actions] discounted_rewards = reward_to_go(rewards, self.gamma) - discounted_rewards = torch.tensor(discounted_rewards).float() advantage = discounted_rewards - values advantage = advantage.type_as(log_probs) diff --git a/squiRL/a2c/config_file.json b/squiRL/a2c/config_file.json new file mode 100644 index 0000000..070c04c --- /dev/null +++ b/squiRL/a2c/config_file.json @@ -0,0 +1,13 @@ +{ + "project": "squirl", + "algorithm": "A2C", + "policy": "MLP", + "env": "CartPole-v0", + "seed": 42, + "lr_actor": 0.0005, + "lr_critic": 0.0005, + "gamma": 0.99, + "episodes_per_batch": 5, + "num_envs": 5, + "max_epochs": 500 +} diff --git a/squiRL/ppo/config_file.json b/squiRL/ppo/config_file.json new file mode 100644 index 0000000..53f1654 --- /dev/null +++ b/squiRL/ppo/config_file.json @@ -0,0 +1,15 @@ +{ + "project": "squirl", + "algorithm": "PPO", + "policy": "MLP", + "env": "CartPole-v0", + "actor_updates_per_iter": 10, + "clip_rt": 0.1, + "seed": 42, + "lr_actor": 0.0005, + "lr_critic": 0.0005, + "gamma": 0.99, + "episodes_per_batch": 5, + "num_envs": 5, + "max_epochs": 500 +} diff --git a/squiRL/ppo/ppo.py b/squiRL/ppo/ppo.py index 0b3dcaa..f57a3eb 100644 --- a/squiRL/ppo/ppo.py +++ b/squiRL/ppo/ppo.py @@ -45,12 +45,17 @@ def __init__(self, hparams: argparse.Namespace) -> None: env_kwargs={"id": self.hparams.env}) self.gamma = self.hparams.gamma self.eps = self.hparams.eps + self.actor_updates_per_iter = self.hparams.actor_updates_per_iter obs_size = self.env.ob_space.size n_actions = self.env.ac_space.eltype.n self.actor = reg_policies[self.hparams.policy](obs_size, n_actions) self.new_actor = reg_policies[self.hparams.policy](obs_size, n_actions) self.critic = reg_policies[self.hparams.policy](obs_size, 1) + if hparams.logger: + hparams.logger.watch(self.actor) + hparams.logger.watch(self.new_actor) + hparams.logger.watch(self.critic) self.replay_buffer = RolloutCollector(self.hparams.episodes_per_batch) self.agent = 
Agent(self.env, self.replay_buffer) @@ -70,10 +75,6 @@ def add_model_specific_args( type=str, default='MLP', help="NN policy used by agent") - parser.add_argument("--custom_optimizers", - type=bool, - default=True, - help="this value must not be changed") parser.add_argument("--actor_updates_per_iter", type=int, default=10, @@ -131,13 +132,12 @@ def ppo_loss(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], dim=-1).squeeze(0)[range(len(actions)), actions] discounted_rewards = reward_to_go(rewards, self.gamma) - discounted_rewards = torch.tensor(discounted_rewards).float() advantage = discounted_rewards - values advantage = advantage.type_as(log_probs) criterion = torch.nn.MSELoss() critic_loss = criterion(discounted_rewards, values.view(-1).float()) - for _ in range(self.hparams.actor_updates_per_iter): + for _ in range(self.actor_updates_per_iter): actor_optimizer.zero_grad() new_action_logits = self.new_actor(states.float()) new_log_probs = F.log_softmax( diff --git a/squiRL/vpg/config_file.json b/squiRL/vpg/config_file.json index d4a8d3e..5ba0419 100644 --- a/squiRL/vpg/config_file.json +++ b/squiRL/vpg/config_file.json @@ -1,64 +1,12 @@ { - "save_config": true, - "load_config": null, - "seed": 42, - "debug": false, - "algorithm": "VPG", "project": "squirl", - "env": "CartPole-v0", - "observation_space": 4, - "action_space": 2, - "max_reward": 195.0, + "algorithm": "VPG", "policy": "MLP", - "lr": 0.0009, - "eps": 1.1920928955078125e-07, + "env": "CartPole-v0", + "seed": 42, + "lr": 0.0005, "gamma": 0.99, - "num_workers": 20, - "episodes_per_batch": 1, - "num_envs": 1, - "checkpoint_callback": true, - "default_root_dir": null, - "gradient_clip_val": 0, - "process_position": 0, - "num_nodes": 1, - "num_processes": 1, - "auto_select_gpus": false, - "log_gpu_memory": null, - "progress_bar_refresh_rate": 1, - "overfit_batches": 0.0, - "track_grad_norm": -1, - "check_val_every_n_epoch": 1, - "fast_dev_run": false, - "accumulate_grad_batches": 1, - "max_epochs": 1000, - "min_epochs": 1, - "max_steps": null, - "min_steps": null, - "limit_train_batches": 1.0, - "limit_val_batches": 1.0, - "limit_test_batches": 1.0, - "val_check_interval": 1.0, - "flush_logs_every_n_steps": 100, - "log_every_n_steps": 50, - "accelerator": null, - "sync_batchnorm": false, - "precision": 32, - "weights_summary": "top", - "weights_save_path": null, - "num_sanity_val_steps": 2, - "truncated_bptt_steps": null, - "resume_from_checkpoint": null, - "profiler": null, - "benchmark": false, - "deterministic": false, - "reload_dataloaders_every_epoch": false, - "auto_lr_find": false, - "replace_sampler_ddp": true, - "terminate_on_nan": false, - "auto_scale_batch_size": false, - "prepare_data_per_node": true, - "amp_backend": "native", - "amp_level": "O2", - "distributed_backend": null, - "automatic_optimization": true + "episodes_per_batch": 5, + "num_envs": 5, + "max_epochs": 500 } diff --git a/squiRL/vpg/vpg.py b/squiRL/vpg/vpg.py index 3696bcb..8e68f1a 100644 --- a/squiRL/vpg/vpg.py +++ b/squiRL/vpg/vpg.py @@ -47,6 +47,8 @@ def __init__(self, hparams: argparse.Namespace) -> None: n_actions = self.env.ac_space.eltype.n self.net = reg_policies[self.hparams.policy](obs_size, n_actions) + if hparams.logger: + hparams.logger.watch(self.net) self.replay_buffer = RolloutCollector(self.hparams.episodes_per_batch) self.agent = Agent(self.env, self.replay_buffer) @@ -114,7 +116,6 @@ def vpg_loss( actions] discounted_rewards = reward_to_go(rewards, self.gamma) - discounted_rewards = torch.tensor(discounted_rewards) 
advantage = (discounted_rewards - discounted_rewards.mean()) / ( discounted_rewards.std() + self.eps) advantage = advantage.type_as(log_probs) diff --git a/train.py b/train.py index 57e3ad5..355ea5b 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ import os import json import argparse +from shutil import copyfile from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.utilities.seed import seed_everything import pytorch_lightning as pl @@ -39,12 +40,16 @@ def train(hparams) -> None: os.mkdir(path) path = os.path.join(path, hparams.logger.version) if hparams.save_config: - with open(path + '.json', 'wt') as f: + with open(path + '_init.json', 'wt') as f: config = vars(hparams).copy() config.pop("logger") config.pop("gpus") config.pop("tpu_cores") json.dump(config, f, indent=4) + copyfile(path + '_init.json', + hparams.logger.save_dir + "/config_all.json") + copyfile(hparams.load_config, + hparams.logger.save_dir + "/config_init.json") seed_everything(hparams.seed) algorithm = squiRL.reg_algorithms[hparams.algorithm](hparams) From 5a05c5e1968ea03680e7938cee5a3104c6c0eec6 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Mon, 29 Mar 2021 16:45:52 +0200 Subject: [PATCH 02/16] ready to test performance checker. check that a pull request doesnt affect algorithm performance. will log all runs under their respective git commit on project squirl on wandb. squirl will be an open project on wandb --- requirements.txt | 1 + squiRL/a2c/config_file.json | 13 ------------- squiRL/ppo/config_file.json | 15 --------------- squiRL/vpg/config_file.json | 12 ------------ train.py | 11 ++++++++++- 5 files changed, 11 insertions(+), 41 deletions(-) delete mode 100644 squiRL/a2c/config_file.json delete mode 100644 squiRL/ppo/config_file.json delete mode 100644 squiRL/vpg/config_file.json diff --git a/requirements.txt b/requirements.txt index f629f32..79cf04a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ torchvision==0.7.0 typing==3.7.4.1 typing-extensions==3.7.4.1 wandb==0.10.22 +python-git-info==0.6.1 diff --git a/squiRL/a2c/config_file.json b/squiRL/a2c/config_file.json deleted file mode 100644 index 070c04c..0000000 --- a/squiRL/a2c/config_file.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "project": "squirl", - "algorithm": "A2C", - "policy": "MLP", - "env": "CartPole-v0", - "seed": 42, - "lr_actor": 0.0005, - "lr_critic": 0.0005, - "gamma": 0.99, - "episodes_per_batch": 5, - "num_envs": 5, - "max_epochs": 500 -} diff --git a/squiRL/ppo/config_file.json b/squiRL/ppo/config_file.json deleted file mode 100644 index 53f1654..0000000 --- a/squiRL/ppo/config_file.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "project": "squirl", - "algorithm": "PPO", - "policy": "MLP", - "env": "CartPole-v0", - "actor_updates_per_iter": 10, - "clip_rt": 0.1, - "seed": 42, - "lr_actor": 0.0005, - "lr_critic": 0.0005, - "gamma": 0.99, - "episodes_per_batch": 5, - "num_envs": 5, - "max_epochs": 500 -} diff --git a/squiRL/vpg/config_file.json b/squiRL/vpg/config_file.json deleted file mode 100644 index 5ba0419..0000000 --- a/squiRL/vpg/config_file.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "project": "squirl", - "algorithm": "VPG", - "policy": "MLP", - "env": "CartPole-v0", - "seed": 42, - "lr": 0.0005, - "gamma": 0.99, - "episodes_per_batch": 5, - "num_envs": 5, - "max_epochs": 500 -} diff --git a/train.py b/train.py index 355ea5b..0e6022b 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ import os import json import argparse +import gitinfo from shutil import copyfile from pytorch_lightning.loggers 
import WandbLogger from pytorch_lightning.utilities.seed import seed_everything @@ -58,12 +59,20 @@ def train(hparams) -> None: if __name__ == '__main__': - __spec__ = "ModuleSpec(name='builtins', loader=)" + # enables pdb debugging + __spec__ = '''ModuleSpec(name='builtins', loader=)''' + parser = argparse.ArgumentParser(add_help=False) group_prog = parser.add_argument_group("program_args") group_env = parser.add_argument_group("environment_args") # add PROGRAM level args + parser.add_argument( + '--git_commit', + type=str, + default=gitinfo.get_git_info()['commit'], + help='current git commit') parser.add_argument( '--save_config', type=bool, From 894481aa3988655bca68830e7955dbc9c02a910f Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Mon, 29 Mar 2021 16:47:12 +0200 Subject: [PATCH 03/16] moved config files to config folder. all configs here will be run automatically to test for performance. --- configs/cartpole_a2c.json | 12 ++++++++++++ configs/cartpole_ppo.json | 14 ++++++++++++++ configs/cartpole_vpg.json | 11 +++++++++++ 3 files changed, 37 insertions(+) create mode 100644 configs/cartpole_a2c.json create mode 100644 configs/cartpole_ppo.json create mode 100644 configs/cartpole_vpg.json diff --git a/configs/cartpole_a2c.json b/configs/cartpole_a2c.json new file mode 100644 index 0000000..c95e89d --- /dev/null +++ b/configs/cartpole_a2c.json @@ -0,0 +1,12 @@ +{ + "project": "squirl", + "algorithm": "A2C", + "policy": "MLP", + "env": "CartPole-v0", + "lr_actor": 0.0005, + "lr_critic": 0.0005, + "gamma": 0.99, + "episodes_per_batch": 5, + "num_envs": 5, + "max_epochs": 500 +} diff --git a/configs/cartpole_ppo.json b/configs/cartpole_ppo.json new file mode 100644 index 0000000..c15d51d --- /dev/null +++ b/configs/cartpole_ppo.json @@ -0,0 +1,14 @@ +{ + "project": "squirl", + "algorithm": "PPO", + "policy": "MLP", + "env": "CartPole-v0", + "actor_updates_per_iter": 10, + "clip_rt": 0.1, + "lr_actor": 0.0005, + "lr_critic": 0.0005, + "gamma": 0.99, + "episodes_per_batch": 1, + "num_envs": 1, + "max_epochs": 500 +} diff --git a/configs/cartpole_vpg.json b/configs/cartpole_vpg.json new file mode 100644 index 0000000..794c118 --- /dev/null +++ b/configs/cartpole_vpg.json @@ -0,0 +1,11 @@ +{ + "project": "squirl", + "algorithm": "VPG", + "policy": "MLP", + "env": "CartPole-v0", + "lr": 0.0005, + "gamma": 0.99, + "episodes_per_batch": 5, + "num_envs": 5, + "max_epochs": 500 +} From 1c840065fd3c5a6869d46d23bd57b7343ff6a2df Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Mon, 29 Mar 2021 17:08:48 +0200 Subject: [PATCH 04/16] moving perf_breaker to this branch --- .github/workflows/perf_breaker.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/perf_breaker.yml diff --git a/.github/workflows/perf_breaker.yml b/.github/workflows/perf_breaker.yml new file mode 100644 index 0000000..03ced3e --- /dev/null +++ b/.github/workflows/perf_breaker.yml @@ -0,0 +1,27 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + pull_request: + branches: [ labeled ] + +jobs: + build: + if: ${{ github.event.label.name == 'breaker' }} + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade 
pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test performance of all algorithms to ensure nothing broke + run: | + python ~/train.py --load_config ~/configs/* From 8a1fb5dcbaf5d7df9b14ee6207b1fd0ebb05b3fc Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Tue, 30 Mar 2021 14:57:12 +0200 Subject: [PATCH 05/16] modified git_commit arg to work with github workflow --- train.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 0e6022b..35ea4f9 100644 --- a/train.py +++ b/train.py @@ -34,6 +34,8 @@ def train(hparams) -> None: profiler = None cwd = os.getcwd() path = os.path.join(cwd, 'models') + if hparams.git_commit is None: + args.git_commit = gitinfo.get_git_info()['commit'] if not os.path.exists(path): os.mkdir(path) path = os.path.join(path, hparams.logger.version) @@ -68,11 +70,10 @@ def train(hparams) -> None: group_env = parser.add_argument_group("environment_args") # add PROGRAM level args - parser.add_argument( - '--git_commit', - type=str, - default=gitinfo.get_git_info()['commit'], - help='current git commit') + parser.add_argument('--git_commit', + type=str, + default=None, + help='current git commit') parser.add_argument( '--save_config', type=bool, From aa4c10179f335e04f22d87e68376c1eb13ea3e52 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 12:03:03 +0200 Subject: [PATCH 06/16] changing config file --- configs/cartpole_ppo.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/cartpole_ppo.json b/configs/cartpole_ppo.json index c15d51d..de8db7e 100644 --- a/configs/cartpole_ppo.json +++ b/configs/cartpole_ppo.json @@ -3,7 +3,7 @@ "algorithm": "PPO", "policy": "MLP", "env": "CartPole-v0", - "actor_updates_per_iter": 10, + "actor_updates_per_iter": 15, "clip_rt": 0.1, "lr_actor": 0.0005, "lr_critic": 0.0005, From 7a0b09811e4d94d4d647870bcfc3996278e03563 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 14:19:46 +0200 Subject: [PATCH 07/16] updated ppo config. training is much more stable now --- configs/cartpole_ppo.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/cartpole_ppo.json b/configs/cartpole_ppo.json index de8db7e..5c0ce2c 100644 --- a/configs/cartpole_ppo.json +++ b/configs/cartpole_ppo.json @@ -3,9 +3,9 @@ "algorithm": "PPO", "policy": "MLP", "env": "CartPole-v0", - "actor_updates_per_iter": 15, - "clip_rt": 0.1, - "lr_actor": 0.0005, + "actor_updates_per_iter": 10, + "clip_rt": 0.05, + "lr_actor": 0.0001, "lr_critic": 0.0005, "gamma": 0.99, "episodes_per_batch": 1, From 15d967d0976dcdf5078f14d9d6f83ac9922ab9cb Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 15:39:12 +0200 Subject: [PATCH 08/16] making seeds random. 
updating ppo config --- configs/cartpole_ppo.json | 6 +++--- train.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/cartpole_ppo.json b/configs/cartpole_ppo.json index 5c0ce2c..4e4b6d8 100644 --- a/configs/cartpole_ppo.json +++ b/configs/cartpole_ppo.json @@ -3,9 +3,9 @@ "algorithm": "PPO", "policy": "MLP", "env": "CartPole-v0", - "actor_updates_per_iter": 10, - "clip_rt": 0.05, - "lr_actor": 0.0001, + "actor_updates_per_iter": 20, + "clip_rt": 0.1, + "lr_actor": 0.0005, "lr_critic": 0.0005, "gamma": 0.99, "episodes_per_batch": 1, diff --git a/train.py b/train.py index 35ea4f9..71d5982 100644 --- a/train.py +++ b/train.py @@ -8,6 +8,7 @@ """ import os import json +import random import argparse import gitinfo from shutil import copyfile @@ -84,7 +85,7 @@ def train(hparams) -> None: help='Load from json file. Command line override.') group_prog.add_argument('--seed', type=int, - default=42, + default=random.randint(0, 1000), help="experiment seed") group_prog.add_argument( '--debug', From 057f0ca82645ce84c1463301e17eab5bf5d0e752 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 17:06:13 +0200 Subject: [PATCH 09/16] a python script for testing performance thresholds --- performance_checker.py | 46 +++++++++++++++++++++++++++++++++++++++++ performance_thresh.json | 3 +++ 2 files changed, 49 insertions(+) create mode 100644 performance_checker.py create mode 100644 performance_thresh.json diff --git a/performance_checker.py b/performance_checker.py new file mode 100644 index 0000000..5bc901c --- /dev/null +++ b/performance_checker.py @@ -0,0 +1,46 @@ +import wandb +import json +import os +import numpy as np + +with open("performance_thresh.json", 'rt') as f: + thresh = json.load(f) + print("Thresholds on file:") + print(thresh) + +api = wandb.Api() +failures = {} +alg_means = {} +data = {} +for model in os.listdir("models"): + config_file = "models/" + model + "/" + model + "_init.json" + data[model] = {} + with open(config_file, 'rt') as f: + data[model] = json.load(f) + algorithm = data[model]['algorithm'] + if not data[model]['algorithm'] in alg_means: + alg_means[algorithm] = {} + failures[algorithm] = {} + failures[algorithm][model] = {} + run = api.run("agkhalil/squiRL/" + model) + mean_reward = np.mean( + run.history(keys=["mean_episode_reward"]) + ["mean_episode_reward"].tolist()[-100:]) + print(model, mean_reward) + alg_means[data[model]['algorithm']][model] = mean_reward + if mean_reward < thresh[data[model]['env']]: + failures[algorithm][model]["env"] = data[model]['env'] + failures[algorithm][model]["threshold"] = thresh[data[model]['env']] + failures[algorithm][model]["mean_last_100_steps"] = mean_reward + +alg_failures = {} +for k, v in alg_means.items(): + means = [] + for nv in v.values(): + means.append(nv) + alg_mean = np.mean(means) + if alg_mean < thresh[data[model]['env']]: + alg_failures[k] = alg_mean + +assert not bool( + alg_failures) == True, "The following algorithms have failed:\n" + str(alg_failures.keys()) + "\nHere are all failed runs of each algorithm:\n" + str(failures) diff --git a/performance_thresh.json b/performance_thresh.json new file mode 100644 index 0000000..c94ca4d --- /dev/null +++ b/performance_thresh.json @@ -0,0 +1,3 @@ +{ + "CartPole-v0": 170 +} From b5199168ff76aa28d05126e0c85e4eddfbbbc812 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 17:49:27 +0200 Subject: [PATCH 10/16] changing how data is viewed from wandb cause github actions is a pain --- performance_checker.py | 12 ++++++++---- 1 
file changed, 8 insertions(+), 4 deletions(-) diff --git a/performance_checker.py b/performance_checker.py index 5bc901c..712ed9c 100644 --- a/performance_checker.py +++ b/performance_checker.py @@ -23,14 +23,16 @@ failures[algorithm] = {} failures[algorithm][model] = {} run = api.run("agkhalil/squiRL/" + model) + wandb_mean_rewards = run.history(keys=['mean_episode_reward'], + pandas=False) mean_reward = np.mean( - run.history(keys=["mean_episode_reward"]) - ["mean_episode_reward"].tolist()[-100:]) + [i['mean_episode_reward'] for i in wandb_mean_rewards]) print(model, mean_reward) alg_means[data[model]['algorithm']][model] = mean_reward if mean_reward < thresh[data[model]['env']]: failures[algorithm][model]["env"] = data[model]['env'] - failures[algorithm][model]["threshold"] = thresh[data[model]['env']] + failures[algorithm][model]["threshold"] = thresh[data[model] + ['env']] failures[algorithm][model]["mean_last_100_steps"] = mean_reward alg_failures = {} @@ -43,4 +45,6 @@ alg_failures[k] = alg_mean assert not bool( - alg_failures) == True, "The following algorithms have failed:\n" + str(alg_failures.keys()) + "\nHere are all failed runs of each algorithm:\n" + str(failures) + alg_failures) == True, "The following algorithms have failed:\n" + str( + alg_failures.keys( + )) + "\nHere are all failed runs of each algorithm:\n" + str(failures) From 1dec5c05b099b0ee418030301586f8dc1f39412d Mon Sep 17 00:00:00 2001 From: Ahmed Khalil Date: Wed, 31 Mar 2021 18:18:20 +0200 Subject: [PATCH 11/16] Lowering threshhold --- performance_thresh.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/performance_thresh.json b/performance_thresh.json index c94ca4d..2bae26e 100644 --- a/performance_thresh.json +++ b/performance_thresh.json @@ -1,3 +1,3 @@ { - "CartPole-v0": 170 + "CartPole-v0": 150 } From 57f89dec7e17601e9b32e8a9cea6524cb48ea5b3 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 18:47:29 +0200 Subject: [PATCH 12/16] fixed bug --- performance_checker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/performance_checker.py b/performance_checker.py index 712ed9c..ee848df 100644 --- a/performance_checker.py +++ b/performance_checker.py @@ -40,9 +40,10 @@ means = [] for nv in v.values(): means.append(nv) - alg_mean = np.mean(means) - if alg_mean < thresh[data[model]['env']]: - alg_failures[k] = alg_mean + alg_mean = np.mean(means) + print(alg_mean) + if alg_mean < thresh[data[model]['env']]: + alg_failures[k] = alg_mean assert not bool( alg_failures) == True, "The following algorithms have failed:\n" + str( From 8ac4c94ad74902e36c3c088b02306b459a5f0bb5 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 18:50:11 +0200 Subject: [PATCH 13/16] only taking mean of the last 100 steps --- performance_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/performance_checker.py b/performance_checker.py index ee848df..fdcbab3 100644 --- a/performance_checker.py +++ b/performance_checker.py @@ -26,7 +26,7 @@ wandb_mean_rewards = run.history(keys=['mean_episode_reward'], pandas=False) mean_reward = np.mean( - [i['mean_episode_reward'] for i in wandb_mean_rewards]) + [i['mean_episode_reward'] for i in wandb_mean_rewards][-100:]) print(model, mean_reward) alg_means[data[model]['algorithm']][model] = mean_reward if mean_reward < thresh[data[model]['env']]: From 129770a763b78eadd098946c85c1eab5bb8f4a04 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Wed, 31 Mar 2021 23:22:08 +0200 Subject: [PATCH 14/16] entity name changed 
from agkhalil to squirl --- performance_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/performance_checker.py b/performance_checker.py index fdcbab3..2195440 100644 --- a/performance_checker.py +++ b/performance_checker.py @@ -22,7 +22,7 @@ alg_means[algorithm] = {} failures[algorithm] = {} failures[algorithm][model] = {} - run = api.run("agkhalil/squiRL/" + model) + run = api.run("squirl/squirl/" + model) wandb_mean_rewards = run.history(keys=['mean_episode_reward'], pandas=False) mean_reward = np.mean( From ee1cdd84a95c484dcc7e9dc78936a949f5da1826 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Thu, 1 Apr 2021 01:06:15 +0200 Subject: [PATCH 15/16] updated readme and organized files --- README.md | 13 +++++++++++++ .../cartpole/a2c.json} | 2 +- .../cartpole/ppo.json} | 0 .../cartpole/vpg.json} | 0 4 files changed, 14 insertions(+), 1 deletion(-) rename configs/{cartpole_a2c.json => openai_gym/cartpole/a2c.json} (90%) rename configs/{cartpole_ppo.json => openai_gym/cartpole/ppo.json} (100%) rename configs/{cartpole_vpg.json => openai_gym/cartpole/vpg.json} (100%) diff --git a/README.md b/README.md index a081b5c..d6bdda5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,19 @@ # squiRL An RL library in PyTorch embedded within the PyTorch Lightning framework. Aiming to provide a comprehensive platform for the development and testing of RL algorithms. +## Performance checker +DRL research is painful. Writing DRL code is even more so. Throughout development, this repo is bound to go through many changes, and some of those changes may break the performance of older code. + +To ensure major pull requests don't have undesirable consequences, and to build a comprehensive zoo of algorithms and envs, we introduce the `performance checker` feature. This is a GitHub workflow automatically triggered on a pull request if labelled `check_performance`. + +The workflow runs all experiments specified in the `configs` folder (5 random seeds each). It then compares the average `mean_episode_reward` of the 5 seeds against the respective `env` thresholds specified in `performance_thresh.json`. + +For example, `configs/openai_gym/cartpole/ppo.json` has the experiment configurations to run `PPO` on Gym's `CartPole-v0`. The workflow runs 5 random seeds. Getting a mean reward larger than `150` means the env is solved. This value, `150`, is saved in `performance_thresh.json` under the env name `CartPole-v0`. So the workflow knows that if the mean reward of the 5 seeds doesn't exceed `150`, something is wrong and an error is returned, including the specific runs that failed to meet the threshold. + +All runs can be found [here](https://wandb.ai/squirl/squirl). They are grouped under their respective git commits. + +We ask that any newly implemented algorithm be accompanied by a respective config file as a benchmark. Any pull request benchmarking a new env is also more than welcome.
+ ## Branch names Branches should be using one of these groups to start with: wip - Works in progress; stuff I know won't be finished soon (like a release) diff --git a/configs/cartpole_a2c.json b/configs/openai_gym/cartpole/a2c.json similarity index 90% rename from configs/cartpole_a2c.json rename to configs/openai_gym/cartpole/a2c.json index c95e89d..b3d5d5d 100644 --- a/configs/cartpole_a2c.json +++ b/configs/openai_gym/cartpole/a2c.json @@ -8,5 +8,5 @@ "gamma": 0.99, "episodes_per_batch": 5, "num_envs": 5, - "max_epochs": 500 + "max_epochs": 50 } diff --git a/configs/cartpole_ppo.json b/configs/openai_gym/cartpole/ppo.json similarity index 100% rename from configs/cartpole_ppo.json rename to configs/openai_gym/cartpole/ppo.json diff --git a/configs/cartpole_vpg.json b/configs/openai_gym/cartpole/vpg.json similarity index 100% rename from configs/cartpole_vpg.json rename to configs/openai_gym/cartpole/vpg.json From 2aabdec06f6d235b8568736463a86af04801b902 Mon Sep 17 00:00:00 2001 From: AGKhalil Date: Thu, 1 Apr 2021 01:39:00 +0200 Subject: [PATCH 16/16] fixed typo --- configs/openai_gym/cartpole/a2c.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/openai_gym/cartpole/a2c.json b/configs/openai_gym/cartpole/a2c.json index b3d5d5d..c95e89d 100644 --- a/configs/openai_gym/cartpole/a2c.json +++ b/configs/openai_gym/cartpole/a2c.json @@ -8,5 +8,5 @@ "gamma": 0.99, "episodes_per_batch": 5, "num_envs": 5, - "max_epochs": 50 + "max_epochs": 500 }
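For reference, a minimal local sketch of what the performance-check workflow is expected to run, assuming a wandb login, the post-rename config layout from PATCH 15/16, and one seed per config (the CI job launches 5 random seeds per config and may invoke train.py differently):

    pip install -r requirements.txt
    # hypothetical local loop: train one run per benchmark config;
    # each run logs mean_episode_reward to the squirl wandb project
    for cfg in configs/openai_gym/cartpole/*.json; do
        python train.py --load_config "$cfg"
    done
    # compare the logged rewards against the env thresholds in performance_thresh.json
    python performance_checker.py

performance_checker.py raises an assertion error listing the failing algorithms and their failing runs whenever an algorithm's averaged reward (the mean over its runs of each run's last 100 logged mean_episode_reward values) falls below the threshold for its env.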