From 12904eb868370dd4158600be89bf921f15125407 Mon Sep 17 00:00:00 2001
From: StoneT2000
Date: Sun, 21 Jan 2024 13:38:24 -0800
Subject: [PATCH] work

---
 .../cleanrl_ppo_liftcube_state_gpu.py | 10 +++++++---
 mani_skill2/envs/sapien_env.py        |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py b/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
index abf348413..8558ebce6 100644
--- a/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
+++ b/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
@@ -215,7 +215,7 @@ def get_action_and_value(self, x, action=None):
     rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
     dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
     values = torch.zeros((args.num_steps, args.num_envs)).to(device)
-
+
     # TRY NOT TO MODIFY: start the game
     global_step = 0
     start_time = time.time()
@@ -234,6 +234,7 @@ def get_action_and_value(self, x, action=None):
     def clip_action(action: torch.Tensor):
         return torch.clamp(action.detach(), action_space_low, action_space_high)
     for iteration in range(1, args.num_iterations + 1):
+        timeout_bonus = torch.zeros((args.num_steps, args.num_envs), device=device)
         with torch.inference_mode():
             if iteration % 25 == 1:
                 # evaluate
@@ -301,6 +302,9 @@ def clip_action(action: torch.Tensor):
                 # next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
                 if truncations.any():
                     # TODO make truncations a tensor, which should all be the same value really...
+                    final_obs = next_obs
+                    final_value = agent.get_value(final_obs)
+                    timeout_bonus[step] = final_value.flatten()
                     next_obs, _ = envs.reset()
                     # writer.add_scalar("charts/episodic_is_grasped", is_grasped.mean().cpu().numpy(), global_step)
                     # writer.add_scalar("charts/episodic_place_rew", place_rew.mean().cpu().numpy(), global_step)
@@ -316,11 +320,11 @@ def clip_action(action: torch.Tensor):
                         print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                         writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                         writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
-
         # bootstrap value if not done
         with torch.no_grad():
             next_value = agent.get_value(next_obs).reshape(1, -1)
             advantages = torch.zeros_like(rewards).to(device)
+            rewards_ = rewards + timeout_bonus
             lastgaelam = 0
             for t in reversed(range(args.num_steps)):
                 if t == args.num_steps - 1:
@@ -329,7 +333,7 @@ def clip_action(action: torch.Tensor):
                 else:
                     nextnonterminal = 1.0 - dones[t + 1]
                     nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
+                delta = rewards_[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                 advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
             returns = advantages + values

diff --git a/mani_skill2/envs/sapien_env.py b/mani_skill2/envs/sapien_env.py
index 33a1bffe3..f6851c570 100644
--- a/mani_skill2/envs/sapien_env.py
+++ b/mani_skill2/envs/sapien_env.py
@@ -685,6 +685,7 @@ def _set_scene_config(self):
         # NOTE (fxiang): smaller contact_offset is faster as less contacts are considered, but some contacts may be missed if distance changes too fast
         # NOTE (fxiang): solver iterations 15 is recommended to balance speed and accuracy. If stable grasps are necessary >= 20 is preferred.
         # NOTE (fxiang): can try using more cpu_workers as it may also make it faster if there are a lot of collisions, collision filtering is on CPU
+        # NOTE (fxiang): enable_enhanced_determinism is for CPU probably. If there are 10 far apart sub scenes, this being True makes it so they do not impact each other at all
         physx.set_scene_config(
             cpu_workers=0,
             enable_pcm=True,
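Note (not part of the patch): the PPO change above folds a value bootstrap into the reward at truncation (timeout) steps before computing GAE, so a time-limit reset is not treated as a true termination. Below is a minimal standalone sketch of that computation under the same tensor shapes; the function and argument names are illustrative only and do not exist in the repo, and the structure simply mirrors the GAE loop in the diff with the patch's `rewards_ = rewards + timeout_bonus` substitution.

# Illustrative sketch only -- not code from the repository.
import torch


def gae_with_timeout_bonus(
    rewards,          # (num_steps, num_envs) rewards collected in the rollout
    values,           # (num_steps, num_envs) V(s_t) predicted during the rollout
    dones,            # (num_steps, num_envs) 1.0 where an episode ended
    timeout_bonus,    # (num_steps, num_envs) V(final_obs) at truncation steps, else 0
    next_value,       # (num_envs,) V(s) for the observation after the last step
    next_done,        # (num_envs,) done flags for that observation
    gamma=0.99,       # illustrative defaults; the script uses args.gamma
    gae_lambda=0.95,  # and args.gae_lambda
):
    num_steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    # The patch's `rewards_ = rewards + timeout_bonus`: at steps where the
    # episode was cut off by the time limit, the critic's value of the final
    # observation is added to the reward so the advantage estimate does not
    # treat the timeout as a zero-value termination.
    rewards_ = rewards + timeout_bonus
    lastgaelam = torch.zeros_like(next_value)
    for t in reversed(range(num_steps)):
        if t == num_steps - 1:
            nextnonterminal = 1.0 - next_done
            nextvalues = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards_[t] + gamma * nextvalues * nextnonterminal - values[t]
        lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values
    return advantages, returns

During the rollout, `timeout_bonus[step]` would be filled the way the patch does it: when `truncations.any()` is true, evaluate the critic on the pre-reset observation and store the flattened value at that step.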