Commit aab877bd authored by Matthias König

rl ppo

parent 5cac8e05
@@ -55,9 +55,9 @@ class PPOAgent(BaseAgent):
        )
        # init critic layer
        for i in range(len(self.critic.mlp) - 1):
-            layer_init(self.critic.mlp[i])
+            self.critic.mlp[i] = layer_init(self.critic.mlp[i])
        # init last critic layer
-        layer_init(self.critic.mlp[-2], std=1.0)
+        self.critic.mlp[-2] = layer_init(self.critic.mlp[-2], std=1.0)
        # actor model
        self.actor_mean = MLPNet(
@@ -67,21 +67,16 @@ class PPOAgent(BaseAgent):
        )
        # init actor model
        for i in range(len(self.actor_mean.mlp) - 1):
-            layer_init(self.actor_mean.mlp[i])
+            self.actor_mean.mlp[i] = layer_init(self.actor_mean.mlp[i])
        # init last actor layer
-        layer_init(self.actor_mean.mlp[-2], std=0.01)
+        self.actor_mean.mlp[-2] = layer_init(self.actor_mean.mlp[-2], std=0.01)
        self.actor_logstd = nn.Parameter(torch.zeros(1, self.act_dim))

    def get_value(self, x):
        return self.critic(x)

-    def get_action(self, obs):
-        with torch.no_grad():
-            action = self.actor_mean(obs).cpu()
-        return action
-
    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
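Note: the `layer_init` helper itself is not part of this diff. In CleanRL-style PPO code it usually applies orthogonal weight init plus a constant bias init and returns the same layer, in which case the new reassignment is equivalent to the old in-place call; the reassignment only changes behaviour if `layer_init` returns a new or wrapped module. A minimal sketch under that assumption (this is not the project's actual helper):

```python
# Sketch of a CleanRL-style layer_init (assumption, not the helper used in this repo).
import numpy as np
import torch
import torch.nn as nn

def layer_init(layer: nn.Linear, std: float = np.sqrt(2), bias_const: float = 0.0) -> nn.Linear:
    # orthogonal weights, constant bias; returns the layer so it can be reassigned into the MLP
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
```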
@@ -95,6 +90,14 @@ class PPOAgent(BaseAgent):
            probs.entropy().sum(1),
            self.critic(x),
        )

+    def get_action(self, x):
+        x = torch.FloatTensor(x)
+        with torch.no_grad():
+            action = self.actor_mean(x)
+        return action.cpu().numpy()
+
    def evaluate_agent(self, test_env, num_eval_episodes=5, max_step=2000):
        mean_ep_return = []
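The relocated `get_action` now takes a NumPy observation, wraps it in a `FloatTensor`, and returns a NumPy action, which suits evaluation loops that talk to the environment directly. A hypothetical usage sketch; the names `agent` and `test_env` are illustrative, and the reset/step API is assumed to follow the old gym 4-tuple convention used elsewhere in this file:

```python
# Hypothetical evaluation rollout using the new get_action (names are illustrative).
obs = test_env.reset()
done, ep_return = False, 0.0
while not done:
    action = agent.get_action(obs)                  # numpy in, numpy out, no gradients
    obs, reward, done, info = test_env.step(action)
    ep_return += reward
print("episode return:", ep_return)
```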
@@ -143,6 +146,7 @@ class PPOAgent(BaseAgent):
        num_eval_episodes=5,
        save_freq=10,
        seed=1,
+        torch_deterministic=True,
    ):
        batch_size = int(num_envs * num_steps)
@@ -153,6 +157,7 @@ class PPOAgent(BaseAgent):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
+        torch.backends.cudnn.deterministic = torch_deterministic

        device = self.device
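With the new `torch_deterministic` flag, seeding now covers Python's `random`, NumPy, and PyTorch, and additionally pins cuDNN to deterministic kernels. A compact sketch of the same seeding logic; the `seed_everything` name and the extra step of disabling cuDNN benchmarking are assumptions, not part of this commit, though the benchmark flag is usually also pinned for reproducible GPU runs:

```python
# Minimal seeding sketch; seed_everything is a hypothetical helper, not project code.
import random
import numpy as np
import torch

def seed_everything(seed: int, torch_deterministic: bool = True) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = torch_deterministic
    torch.backends.cudnn.benchmark = not torch_deterministic  # assumption: also pinned for reproducibility
```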
@@ -207,12 +212,12 @@ class PPOAgent(BaseAgent):
            with torch.no_grad():
                action, logprob, _, value = self.get_action_and_value(next_obs)
                values[step] = value.flatten()
-            actions[step] = action.to(device)
+            actions[step] = action
            logprobs[step] = logprob

            # execute step in env
            action = action.cpu().numpy()
-            action[0] = np.clip(action[0], -0.2, 1)
+            #action[0] = np.clip(action[0], -0.2, 1)
            next_obs, reward, done, info = envs.step(action)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(
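Commenting out the `np.clip` call means the first action dimension is no longer squashed into [-0.2, 1] before being sent to the simulator; actions now reach the environment exactly as produced by the policy. If bounding is still wanted, one option, not part of this commit and assuming the vectorized env exposes gym-style `action_space.low`/`high` bounds, is to clip against the declared action space instead of hard-coded limits:

```python
# Hypothetical alternative inside the rollout loop: clip to the env's declared bounds
# rather than fixed constants. `action`, `envs`, and `np` come from the surrounding code.
action = action.cpu().numpy()
action = np.clip(action, envs.action_space.low, envs.action_space.high)
next_obs, reward, done, info = envs.step(action)
```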
algo: PPO
seed: 42
model:
-  hidden_sizes: [128,128]
+  hidden_sizes: [64,64]
  use_lstm: False
  act_limit: 1.0
params:
@@ -20,13 +20,13 @@ params:
env:
  path: ../simulations/rc_car_foyer_track
-  num_envs: 8
+  num_envs: 4
  num_stacked_obs: 4
  num_features: 2
train:
  total_steps: 2000000
-  num_steps: 1024
+  num_steps: 2048
  num_minibatches: 32
  update_epochs: 10
  eval_every: 10
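With the updated values (`num_envs: 4`, `num_steps: 2048`), each PPO update still collects the same number of transitions as before (8 × 1024 = 4 × 2048 = 8192). The batch size follows the `batch_size = int(num_envs * num_steps)` line in the training code above; the minibatch size below assumes the usual `batch_size // num_minibatches` split, which is not shown in this diff:

```python
# Worked numbers for this config (the minibatch split is an assumption about the training loop).
num_envs, num_steps, num_minibatches = 4, 2048, 32

batch_size = int(num_envs * num_steps)            # 8192 transitions per update
minibatch_size = batch_size // num_minibatches    # 256 transitions per gradient step
print(batch_size, minibatch_size)
```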
algo: PPO
seed: 42
torch_deterministic: true
model:
  hidden_sizes: [64,64]
  use_lstm: False
  act_limit: 1.0
params:
  learning_rate: 0.0003
  anneal_lr: true
  ent_coef: 0.0
  gae: true
  gae_lambda: 0.95
  gamma: 0.99
  max_grad_norm: 0.5
  norm_adv: true
  target_kl: null
  vf_coef: 0.5
  clip_coef: 0.2
  clip_vloss: true
env:
  path: ../simulations/rc_car_foyer_track
  num_envs: 4
  num_stacked_obs: 4
  num_features: 2
train:
  total_steps: 2000000
  num_steps: 1024
  num_minibatches: 32
  update_epochs: 10
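How this YAML is consumed is not visible in the commit; a minimal loading sketch, assuming standard PyYAML and a hypothetical file path:

```python
# Minimal sketch using PyYAML; the path and the way the project reads configs are assumptions.
import yaml

with open("configs/ppo_rc_car_foyer_track.yaml") as f:  # hypothetical path
    cfg = yaml.safe_load(f)

print(cfg["algo"], cfg["train"]["num_steps"], cfg["params"]["learning_rate"])
```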