Commit

work
StoneT2000 committed Jan 24, 2024
1 parent edc4024 commit f327ce9
Showing 3 changed files with 9 additions and 19 deletions.
6 changes: 3 additions & 3 deletions examples/tutorials/reinforcement-learning/README.md
@@ -20,6 +20,6 @@ Pass in `--help` for more options (e.g. logging, number of parallel environments
python cleanrl_ppo_liftcube_state_gpu.py --gamma=0.8 --gae_lambda=0.95 --total_timesteps=10000000
```bash
python cleanrl_ppo_liftcube_state_gpu.py --num_envs=512 --gamma=0.8 --gae_lambda=0.9 --update_epochs=8 --target_kl=0.1 --num_minibatches=16 --env_id="PickCube-v0" --total_timesteps=100000000 --num_steps=100
```
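A quick sanity check on the batch sizes this command implies (illustrative arithmetic only, not part of the diff; the names mirror the script's `Args` fields):

```python
num_envs, num_steps = 512, 100
num_minibatches, update_epochs = 16, 8
total_timesteps = 100_000_000

batch_size = num_envs * num_steps                  # 51,200 transitions collected per rollout
minibatch_size = batch_size // num_minibatches     # 3,200 samples per gradient step, 8 epochs each
num_iterations = total_timesteps // batch_size     # ~1,953 rollout/update cycles in total
print(batch_size, minibatch_size, num_iterations)
```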
examples/tutorials/reinforcement-learning/cleanrl_ppo_liftcube_state_gpu.py
@@ -1,6 +1,6 @@
# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_continuous_actionpy
-# python cleanrl_ppo_liftcube_state_gpu.py --num_envs=512 --gamma=0.8 --gae_lambda=0.9 --update_epochs=1 --num_minibatches=128 --env_id="PickCube-v0" --total_timesteps=100000000
+# python cleanrl_ppo_liftcube_state_gpu.py --num_envs=512 --gamma=0.8 --gae_lambda=0.9 --update_epochs=8 --target_kl=0.1 --num_minibatches=16 --env_id="PickCube-v0" --total_timesteps=100000000 --num_steps=100
# python cleanrl_ppo_liftcube_state_gpu.py --num_envs=512 --gamma=0.8 --gae_lambda=0.9 --update_epochs=4 --num_minibatches=16 --env_id="PickCube-v0" --total_timesteps=100000000
# python cleanrl_ppo_liftcube_state_gpu.py --num_envs=2048 --gamma=0.8 --gae_lambda=0.9 --update_epochs=1 --num_minibatches=32 --env_id="PushCube-v0" --total_timesteps=100000000 --num-steps=12
# TODO: train shorter horizon to leverage parallelization more.
import os
import random
@@ -47,7 +47,7 @@ class Args:
"""the user or org name of the model repository from the Hugging Face Hub"""

# Algorithm specific arguments
env_id: str = "HalfCheetah-v4"
env_id: str = "PickCube-v1"
"""the id of the environment"""
total_timesteps: int = 10000000
"""total timesteps of the experiments"""
@@ -60,9 +60,9 @@ class Args:
"""the number of steps to run in each environment per policy rollout"""
anneal_lr: bool = False
"""Toggle learning rate annealing for policy and value networks"""
-gamma: float = 0.99
+gamma: float = 0.8
"""the discount factor gamma"""
-gae_lambda: float = 0.95
+gae_lambda: float = 0.9
"""the lambda for the general advantage estimation"""
num_minibatches: int = 32
"""the number of mini-batches"""
@@ -194,9 +194,6 @@ def get_action_and_value(self, x, action=None):
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

# env setup
-# envs = gym.vector.SyncVectorEnv(
-# [make_env(args.env_id, i, args.capture_video, run_name, args.gamma) for i in range(args.num_envs)]
-# )
import mani_skill2.envs
sapien.physx.set_gpu_memory_config(found_lost_pairs_capacity=2**26, max_rigid_patch_count=200000)
sim_freq, control_freq = 100, 20
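The environment-construction call itself is not visible in this hunk, but the two constants above fix the timing: each control step advances `sim_freq // control_freq` physics sub-steps, so a 100-step episode spans five seconds of simulated time (illustrative arithmetic only):

```python
# Timing implied by the constants above; not the script's env-construction code.
sim_freq, control_freq = 100, 20                 # physics Hz and control (policy) Hz
num_steps = 100                                  # control steps per rollout, per the run command

substeps_per_action = sim_freq // control_freq   # 5 physics steps per env.step()
episode_seconds = num_steps / control_freq       # 100 / 20 = 5.0 s of simulated time
print(substeps_per_action, episode_seconds)
```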
@@ -224,7 +221,6 @@ def get_action_and_value(self, x, action=None):
next_done = torch.zeros(args.num_envs, device=device)
eps_returns = torch.zeros(args.num_envs, dtype=torch.float, device=device)
eps_lens = np.zeros(args.num_envs)
-is_grasped = torch.zeros(args.num_envs, device=device)
place_rew = torch.zeros(args.num_envs, device=device)
print(f"####")
print(f"args.num_iterations={args.num_iterations} args.num_envs={args.num_envs} args.num_eval_envs={args.num_eval_envs}")
@@ -255,11 +251,9 @@ def clip_action(action: torch.Tensor):
eval_returns += reward
eval_eps_lens += 1
truncations = torch.ones_like(terminations) * truncations
-# eval_is_grasped += infos["is_grasped"]
if truncations.any():
# TODO make truncations a tensor, which should all be the same value really...
next_eval_obs, _ = eval_envs.reset()
# writer.add_scalar("charts/eval_is_grasped", eval_is_grasped.mean().cpu().numpy(), global_step)
writer.add_scalar("charts/eval_success_rate", infos["success"].float().mean().cpu().numpy(), global_step)
writer.add_scalar("charts/eval_episodic_return", eval_returns.mean().cpu().numpy(), global_step)
writer.add_scalar("charts/eval_episodic_length", eval_eps_lens.mean(), global_step)
@@ -306,14 +300,10 @@ def clip_action(action: torch.Tensor):
final_value = agent.get_value(final_obs)
timeout_bonus[step] = final_value.flatten()
next_obs, _ = envs.reset()
# writer.add_scalar("charts/episodic_is_grasped", is_grasped.mean().cpu().numpy(), global_step)
# writer.add_scalar("charts/episodic_place_rew", place_rew.mean().cpu().numpy(), global_step)
writer.add_scalar("charts/episodic_return", eps_returns.mean().cpu().numpy(), global_step)
writer.add_scalar("charts/episodic_length", eps_lens.mean(), global_step)
eps_returns = eps_returns * 0
eps_lens = eps_lens * 0
-place_rew = place_rew * 0
-is_grasped = is_grasped * 0
if "final_info" in infos:
for info in infos["final_info"]:
if info and "episode" in info:
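The `timeout_bonus` lines above apply the usual correction for episodes cut off by the time limit: the critic's value of the final observation is bootstrapped back into the return, since a truncated state is not truly terminal. A small self-contained sketch of the idea (the script's exact bookkeeping, including where the discount is applied, is not visible in this hunk):

```python
import torch

gamma = 0.8
rewards_last_step = torch.tensor([0.2, 0.5])   # rewards at the truncation step for 2 envs
value_of_final_obs = torch.tensor([1.3, 0.9])  # V(s_T) from the critic, e.g. agent.get_value(final_obs)
truncated = torch.tensor([1.0, 0.0])           # env 0 hit the time limit, env 1 did not

# One common formulation: fold the bootstrap into the last reward, so the
# return target becomes r_T + gamma * V(s_T) for truncated environments only.
adjusted_rewards = rewards_last_step + gamma * value_of_final_obs * truncated
print(adjusted_rewards)                        # tensor([1.2400, 0.5000])
```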
2 changes: 1 addition & 1 deletion mani_skill2/envs/sapien_env.py
@@ -624,7 +624,7 @@ def step(self, action: Union[None, np.ndarray, Dict]):
reward = self.get_reward(obs=obs, action=action, info=info)
terminated = info["success"]
if physx.is_gpu_enabled():
-return obs, reward, terminated, torch.Tensor([False]), info
+return obs, reward, terminated, False, info
else:
# On CPU sim mode, we always return numpy / python primitives without any batching.
return unbatch(
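For readers comparing the two branches: under GPU simulation the step outputs stay batched torch tensors, while the CPU path unbatches everything down to plain numpy / Python values. A rough, self-contained illustration of what that unbatching amounts to (this is not mani_skill2's actual `unbatch` implementation):

```python
import numpy as np
import torch

def unbatch_scalar(x):
    """Collapse a batch-of-one tensor/array to a plain Python value (illustrative only)."""
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    if isinstance(x, np.ndarray) and x.size == 1:
        return x.item()
    return x

reward = torch.tensor([0.5])
terminated = torch.tensor([False])
print(unbatch_scalar(reward), unbatch_scalar(terminated))   # 0.5 False
```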
