Commit aab877bd authored by Matthias König

rl ppo

parent 5cac8e05
@@ -55,9 +55,9 @@ class PPOAgent(BaseAgent):
        )
        # init critic layer
        for i in range(len(self.critic.mlp) - 1):
-            layer_init(self.critic.mlp[i])
+            self.critic.mlp[i] = layer_init(self.critic.mlp[i])
        # init last critic layer
-        layer_init(self.critic.mlp[-2], std=1.0)
+        self.critic.mlp[-2] = layer_init(self.critic.mlp[-2], std=1.0)
        # actor model
        self.actor_mean = MLPNet(
@@ -67,21 +67,16 @@ class PPOAgent(BaseAgent):
        )
        # init actor model
        for i in range(len(self.actor_mean.mlp) - 1):
-            layer_init(self.actor_mean.mlp[i])
+            self.actor_mean.mlp[i] = layer_init(self.actor_mean.mlp[i])
        # init last actor layer
-        layer_init(self.actor_mean.mlp[-2], std=0.01)
+        self.actor_mean.mlp[-2] = layer_init(self.actor_mean.mlp[-2], std=0.01)
        self.actor_logstd = nn.Parameter(torch.zeros(1, self.act_dim))

    def get_value(self, x):
        return self.critic(x)

-    def get_action(self, obs):
-        with torch.no_grad():
-            action = self.actor_mean(obs).cpu()
-        return action
-
    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
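Note: the `layer_init` helper itself is not part of this diff. In CleanRL-style PPO code it usually applies orthogonal weight init plus a constant bias init and returns the same layer, in which case the new reassignment is equivalent to the old in-place call; the reassignment only changes behaviour if `layer_init` returns a new or wrapped module. A minimal sketch under that assumption (this is not the project's actual helper):

```python
# Sketch of a CleanRL-style layer_init (assumption, not the helper used in this repo).
import numpy as np
import torch
import torch.nn as nn

def layer_init(layer: nn.Linear, std: float = np.sqrt(2), bias_const: float = 0.0) -> nn.Linear:
    # orthogonal weights, constant bias; returns the layer so it can be reassigned into the MLP
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
```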
@@ -95,6 +90,14 @@ class PPOAgent(BaseAgent):
            probs.entropy().sum(1),
            self.critic(x),
        )

+    def get_action(self, x):
+        x = torch.FloatTensor(x)
+        with torch.no_grad():
+            action = self.actor_mean(x)
+        return action.cpu().numpy()
+
    def evaluate_agent(self, test_env, num_eval_episodes=5, max_step=2000):
        mean_ep_return = []
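The relocated `get_action` now takes a NumPy observation, wraps it in a `FloatTensor`, and returns a NumPy action, which suits evaluation loops that talk to the environment directly. A hypothetical usage sketch; the names `agent` and `test_env` are illustrative, and the reset/step API is assumed to follow the old gym 4-tuple convention used elsewhere in this file:

```python
# Hypothetical evaluation rollout using the new get_action (names are illustrative).
obs = test_env.reset()
done, ep_return = False, 0.0
while not done:
    action = agent.get_action(obs)                  # numpy in, numpy out, no gradients
    obs, reward, done, info = test_env.step(action)
    ep_return += reward
print("episode return:", ep_return)
```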
@@ -143,6 +146,7 @@ class PPOAgent(BaseAgent):
        num_eval_episodes=5,
        save_freq=10,
        seed=1,
+        torch_deterministic=True,
    ):
        batch_size = int(num_envs * num_steps)
@@ -153,6 +157,7 @@ class PPOAgent(BaseAgent):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
+        torch.backends.cudnn.deterministic = torch_deterministic

        device = self.device
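With the new `torch_deterministic` flag, seeding now covers Python's `random`, NumPy, and PyTorch, and additionally pins cuDNN to deterministic kernels. A compact sketch of the same seeding logic; the `seed_everything` name and the extra step of disabling cuDNN benchmarking are assumptions, not part of this commit, though the benchmark flag is usually also pinned for reproducible GPU runs:

```python
# Minimal seeding sketch; seed_everything is a hypothetical helper, not project code.
import random
import numpy as np
import torch

def seed_everything(seed: int, torch_deterministic: bool = True) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = torch_deterministic
    torch.backends.cudnn.benchmark = not torch_deterministic  # assumption: also pinned for reproducibility
```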
@@ -207,12 +212,12 @@ class PPOAgent(BaseAgent):
            with torch.no_grad():
                action, logprob, _, value = self.get_action_and_value(next_obs)
                values[step] = value.flatten()
-            actions[step] = action.to(device)
+            actions[step] = action
            logprobs[step] = logprob

            # execute step in env
            action = action.cpu().numpy()
-            action[0] = np.clip(action[0], -0.2, 1)
+            #action[0] = np.clip(action[0], -0.2, 1)
            next_obs, reward, done, info = envs.step(action)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(
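Commenting out the `np.clip` call means the first action dimension is no longer squashed into [-0.2, 1] before being sent to the simulator; actions now reach the environment exactly as produced by the policy. If bounding is still wanted, one option, not part of this commit and assuming the vectorized env exposes gym-style `action_space.low`/`high` bounds, is to clip against the declared action space instead of hard-coded limits:

```python
# Hypothetical alternative inside the rollout loop: clip to the env's declared bounds
# rather than fixed constants. `action`, `envs`, and `np` come from the surrounding code.
action = action.cpu().numpy()
action = np.clip(action, envs.action_space.low, envs.action_space.high)
next_obs, reward, done, info = envs.step(action)
```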
algo: PPO
seed: 42
model:
-  hidden_sizes: [128,128]
+  hidden_sizes: [64,64]
  use_lstm: False
  act_limit: 1.0
params:
@@ -20,13 +20,13 @@ params:
env:
  path: ../simulations/rc_car_foyer_track
-  num_envs: 8
+  num_envs: 4
  num_stacked_obs: 4
  num_features: 2
train:
  total_steps: 2000000
-  num_steps: 1024
+  num_steps: 2048
  num_minibatches: 32
  update_epochs: 10
  eval_every: 10
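With the updated values (`num_envs: 4`, `num_steps: 2048`), each PPO update still collects the same number of transitions as before (8 × 1024 = 4 × 2048 = 8192). The batch size follows the `batch_size = int(num_envs * num_steps)` line in the training code above; the minibatch size below assumes the usual `batch_size // num_minibatches` split, which is not shown in this diff:

```python
# Worked numbers for this config (the minibatch split is an assumption about the training loop).
num_envs, num_steps, num_minibatches = 4, 2048, 32

batch_size = int(num_envs * num_steps)            # 8192 transitions per update
minibatch_size = batch_size // num_minibatches    # 256 transitions per gradient step
print(batch_size, minibatch_size)
```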
algo: PPO
seed: 42
torch_deterministic: true
model:
  hidden_sizes: [64,64]
  use_lstm: False
  act_limit: 1.0
params:
  learning_rate: 0.0003
  anneal_lr: true
  ent_coef: 0.0
  gae: true
  gae_lambda: 0.95
  gamma: 0.99
  max_grad_norm: 0.5
  norm_adv: true
  target_kl: null
  vf_coef: 0.5
  clip_coef: 0.2
  clip_vloss: true
env:
  path: ../simulations/rc_car_foyer_track
  num_envs: 4
  num_stacked_obs: 4
  num_features: 2
train:
  total_steps: 2000000
  num_steps: 1024
  num_minibatches: 32
  update_epochs: 10
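How this YAML is consumed is not visible in the commit; a minimal loading sketch, assuming standard PyYAML and a hypothetical file path:

```python
# Minimal sketch using PyYAML; the path and the way the project reads configs are assumptions.
import yaml

with open("configs/ppo_rc_car_foyer_track.yaml") as f:  # hypothetical path
    cfg = yaml.safe_load(f)

print(cfg["algo"], cfg["train"]["num_steps"], cfg["params"]["learning_rate"])
```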