From 12904eb868370dd4158600be89bf921f15125407 Mon Sep 17 00:00:00 2001
From: StoneT2000
Date: Sun, 21 Jan 2024 13:38:24 -0800
Subject: [PATCH] work

---
 .../cleanrl_ppo_liftcube_state_gpu.py | 10 +++++++---
 mani_skill2/envs/sapien_env.py        |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py b/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
index abf348413..8558ebce6 100644
--- a/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
+++ b/examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
@@ -215,7 +215,7 @@ def get_action_and_value(self, x, action=None):
     rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
     dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
     values = torch.zeros((args.num_steps, args.num_envs)).to(device)
-
+
     # TRY NOT TO MODIFY: start the game
     global_step = 0
     start_time = time.time()
@@ -234,6 +234,7 @@ def get_action_and_value(self, x, action=None):
     def clip_action(action: torch.Tensor):
         return torch.clamp(action.detach(), action_space_low, action_space_high)
     for iteration in range(1, args.num_iterations + 1):
+        timeout_bonus = torch.zeros((args.num_steps, args.num_envs), device=device)
         with torch.inference_mode():
             if iteration % 25 == 1:
                 # evaluate
@@ -301,6 +302,9 @@ def clip_action(action: torch.Tensor):
                 # next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
                 if truncations.any():
                     # TODO make truncations a tensor, which should all be the same value really...
+                    final_obs = next_obs
+                    final_value = agent.get_value(final_obs)
+                    timeout_bonus[step] = final_value.flatten()
                     next_obs, _ = envs.reset()
                     # writer.add_scalar("charts/episodic_is_grasped", is_grasped.mean().cpu().numpy(), global_step)
                     # writer.add_scalar("charts/episodic_place_rew", place_rew.mean().cpu().numpy(), global_step)
@@ -316,11 +320,11 @@ def clip_action(action: torch.Tensor):
                         print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                         writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                         writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
-
         # bootstrap value if not done
         with torch.no_grad():
             next_value = agent.get_value(next_obs).reshape(1, -1)
             advantages = torch.zeros_like(rewards).to(device)
+            rewards_ = rewards + timeout_bonus
             lastgaelam = 0
             for t in reversed(range(args.num_steps)):
                 if t == args.num_steps - 1:
@@ -329,7 +333,7 @@ def clip_action(action: torch.Tensor):
                 else:
                     nextnonterminal = 1.0 - dones[t + 1]
                     nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
+                delta = rewards_[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                 advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
             returns = advantages + values

diff --git a/mani_skill2/envs/sapien_env.py b/mani_skill2/envs/sapien_env.py
index 33a1bffe3..f6851c570 100644
--- a/mani_skill2/envs/sapien_env.py
+++ b/mani_skill2/envs/sapien_env.py
@@ -685,6 +685,7 @@ def _set_scene_config(self):
         # NOTE (fxiang): smaller contact_offset is faster as less contacts are considered, but some contacts may be missed if distance changes too fast
         # NOTE (fxiang): solver iterations 15 is recommended to balance speed and accuracy. If stable grasps are necessary >= 20 is preferred.
         # NOTE (fxiang): can try using more cpu_workers as it may also make it faster if there are a lot of collisions, collision filtering is on CPU
+        # NOTE (fxiang): enable_enhanced_determinism is for CPU probably. If there are 10 far apart sub scenes, this being True makes it so they do not impact each other at all
         physx.set_scene_config(
             cpu_workers=0,
             enable_pcm=True,
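Note (not part of the patch): the PPO change above folds a value bootstrap into the reward at truncation (timeout) steps before computing GAE, so a time-limit reset is not treated as a true termination. Below is a minimal standalone sketch of that computation under the same tensor shapes; the function and argument names are illustrative only and do not exist in the repo, and the structure simply mirrors the GAE loop in the diff with the patch's `rewards_ = rewards + timeout_bonus` substitution.

# Illustrative sketch only -- not code from the repository.
import torch


def gae_with_timeout_bonus(
    rewards,          # (num_steps, num_envs) rewards collected in the rollout
    values,           # (num_steps, num_envs) V(s_t) predicted during the rollout
    dones,            # (num_steps, num_envs) 1.0 where an episode ended
    timeout_bonus,    # (num_steps, num_envs) V(final_obs) at truncation steps, else 0
    next_value,       # (num_envs,) V(s) for the observation after the last step
    next_done,        # (num_envs,) done flags for that observation
    gamma=0.99,       # illustrative defaults; the script uses args.gamma
    gae_lambda=0.95,  # and args.gae_lambda
):
    num_steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    # The patch's `rewards_ = rewards + timeout_bonus`: at steps where the
    # episode was cut off by the time limit, the critic's value of the final
    # observation is added to the reward so the advantage estimate does not
    # treat the timeout as a zero-value termination.
    rewards_ = rewards + timeout_bonus
    lastgaelam = torch.zeros_like(next_value)
    for t in reversed(range(num_steps)):
        if t == num_steps - 1:
            nextnonterminal = 1.0 - next_done
            nextvalues = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards_[t] + gamma * nextvalues * nextnonterminal - values[t]
        lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values
    return advantages, returns

During the rollout, `timeout_bonus[step]` would be filled the way the patch does it: when `truncations.any()` is true, evaluate the critic on the pre-reset observation and store the flattened value at that step.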