Commit
Add tutorials for benchmark experiment on customized dataset.
zjowowen committed Aug 13, 2024
1 parent 3694716 commit d97a2fb
Showing 4 changed files with 491 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ English | [简体中文(Simplified Chinese)](https://github.com/opendilab/Genera
- [Integrated Algorithms](#integrated-algorithms)
- [Installation](#installation)
- [Quick Start](#quick-start)
- [Tutorials](#tutorials)
- [Benchmark experiments](#benchmark-experiments)

## Features

2 changes: 2 additions & 0 deletions README.zh.md
@@ -21,6 +21,8 @@
- [已集成的生成式强化学习算法](#已集成的生成式强化学习算法)
- [安装](#安装)
- [快速开始](#快速开始)
- [教程](#教程)
- [基线实验](#基线实验)

## 特性

243 changes: 243 additions & 0 deletions grl_pipelines/benchmark/README.md
@@ -64,3 +64,246 @@ pip install -e .
pip install lockfile
pip install "Cython<3.0"
```

## Benchmark experiment for new datasets

GenerativeRL supports benchmarking on new or customized datasets that have not yet been integrated into the framework.
You can follow the steps below to run experiments on your dataset with the GMPO and GMPG algorithms.

### Step 1: Prepare the dataset

Prepare your dataset in the following format:

```python
import torch
import numpy as np

# Placeholder PyTorch tensors; replace [...] with your collected transitions
obs = torch.tensor([...])
action = torch.tensor([...])
next_obs = torch.tensor([...])
reward = torch.tensor([...])
done = torch.tensor([...])

# Convert tensors to numpy arrays
obs_np = obs.numpy()
action_np = action.numpy()
next_obs_np = next_obs.numpy()
reward_np = reward.numpy()
done_np = done.numpy()

# Save to a .npz file
np.savez('data.npz', obs=obs_np, action=action_np, next_obs=next_obs_np, reward=reward_np, done=done_np)
```

An example of a dataset for the LunarLanderContinuous-v2 environment is provided [here](https://drive.google.com/file/d/1YnT-Oeu9LPKuS_ZqNc5kol_pMlJ1DwyG/view?usp=drive_link).
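
Before training, it can help to sanity-check the saved file. The snippet below is a minimal sketch that assumes the keys written by the code above and transitions stacked along the first axis; it loads `data.npz` and prints each array's shape and dtype:

```python
import numpy as np

# Load the dataset saved in Step 1 and inspect its contents.
data = np.load("data.npz")
for key in ["obs", "action", "next_obs", "reward", "done"]:
    array = data[key]
    print(f"{key}: shape={array.shape}, dtype={array.dtype}")

# All arrays are expected to share the same first dimension (the number of transitions).
num_transitions = data["obs"].shape[0]
assert all(data[key].shape[0] == num_transitions for key in data.files)
```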

### Step 2: Run the benchmark experiment

Run the following script to start the benchmark experiment:

```python
import torch
from easydict import EasyDict

env_id = "LunarLanderContinuous-v2" #TODO: Specify the environment ID
action_size = 2 #TODO: Specify the action size
state_size = 8 #TODO: Specify the state size
algorithm_type = "GMPO" #TODO: Specify the algorithm type
solver_type = "ODESolver" #TODO: Specify the solver type
model_type = "DiffusionModel" #TODO: Specify the model type
generative_model_type = "GVP" #TODO: Specify the generative model type
path = dict(type="gvp") #TODO: Specify the path
model_loss_type = "flow_matching" #TODO: Specify the model loss type
data_path = "./data.npz" #TODO: Specify the data path
project_name = f"{env_id}-{algorithm_type}-{generative_model_type}"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
t_embedding_dim = 32
t_encoder = dict(
    type="GaussianFourierProjectionTimeEncoder",
    args=dict(
        embed_dim=t_embedding_dim,
        scale=30.0,
    ),
)
model = dict(
    device=device,
    x_size=action_size,
    solver=dict(
        type="ODESolver",
        args=dict(
            library="torchdiffeq",
        ),
    ),
    path=path,
    reverse_path=path,
    model=dict(
        type="velocity_function",
        args=dict(
            t_encoder=t_encoder,
            backbone=dict(
                type="TemporalSpatialResidualNet",
                args=dict(
                    hidden_sizes=[512, 256, 128],
                    output_dim=action_size,
                    t_dim=t_embedding_dim,
                    condition_dim=state_size,
                    condition_hidden_dim=32,
                    t_condition_hidden_dim=128,
                ),
            ),
        ),
    ),
)

config = EasyDict(
    train=dict(
        project=project_name,
        device=device,
        wandb=dict(project=f"IQL-{env_id}-{algorithm_type}-{generative_model_type}"),
        simulator=dict(
            type="GymEnvSimulator",
            args=dict(
                env_id=env_id,
            ),
        ),
        dataset=dict(
            type="GPCustomizedDataset",
            args=dict(
                env_id=env_id,
                numpy_data_path=data_path,
            ),
        ),
        model=dict(
            GPPolicy=dict(
                device=device,
                model_type=model_type,
                model_loss_type=model_loss_type,
                model=model,
                critic=dict(
                    device=device,
                    q_alpha=1.0,
                    DoubleQNetwork=dict(
                        backbone=dict(
                            type="ConcatenateMLP",
                            args=dict(
                                hidden_sizes=[action_size + state_size, 256, 256],
                                output_size=1,
                                activation="relu",
                            ),
                        ),
                    ),
                    VNetwork=dict(
                        backbone=dict(
                            type="MultiLayerPerceptron",
                            args=dict(
                                hidden_sizes=[state_size, 256, 256],
                                output_size=1,
                                activation="relu",
                            ),
                        ),
                    ),
                ),
            ),
            GuidedPolicy=dict(
                model_type=model_type,
                model=model,
            ),
        ),
        parameter=dict(
            algorithm_type=algorithm_type,
            behaviour_policy=dict(
                batch_size=4096,
                learning_rate=1e-4,
                epochs=0,
            ),
            t_span=32,
            critic=dict(
                batch_size=4096,
                epochs=2000,
                learning_rate=3e-4,
                discount_factor=0.99,
                update_momentum=0.005,
                tau=0.7,
                method="iql",
            ),
            guided_policy=dict(
                batch_size=4096,
                epochs=10000,
                learning_rate=1e-4,
                beta=1.0,
                weight_clamp=100,
            ),
            evaluation=dict(
                eval=True,
                repeat=5,
                interval=100,
            ),
            checkpoint_path=f"./{project_name}/checkpoint",
            checkpoint_freq=10,
        ),
    ),
    deploy=dict(
        device=device,
        env=dict(
            env_id=env_id,
            seed=0,
        ),
        t_span=32,
    ),
)

if __name__ == "__main__":

    import gym
    import d4rl
    import numpy as np

    from grl.algorithms.gmpg import GMPGAlgorithm
    from grl.utils.log import log

    def gp_pipeline(config):

        gp = GMPGAlgorithm(config)

        # ---------------------------------------
        # Customized train code ↓
        # ---------------------------------------
        gp.train()
        # ---------------------------------------
        # Customized train code ↑
        # ---------------------------------------

        # ---------------------------------------
        # Customized deploy code ↓
        # ---------------------------------------

        agent = gp.deploy()
        env = gym.make(config.deploy.env.env_id)
        total_reward_list = []
        for i in range(100):
            observation = env.reset()
            total_reward = 0
            while True:
                # env.render()
                observation, reward, done, _ = env.step(agent.act(observation))
                total_reward += reward
                if done:
                    observation = env.reset()
                    print(f"Episode {i}, total_reward: {total_reward}")
                    total_reward_list.append(total_reward)
                    break

        print(
            f"Average total reward: {np.mean(total_reward_list)}, std: {np.std(total_reward_list)}"
        )

        # ---------------------------------------
        # Customized deploy code ↑
        # ---------------------------------------

    log.info("config: \n{}".format(config))
    gp_pipeline(config)
```

For performance evaluation, the environment has to be registered in the `gym` library so that `gym.make(env_id)` can construct it during deployment. You can refer to the [gym documentation](https://www.gymlibrary.dev/) for more information.
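
As an illustration only, the sketch below registers a hypothetical custom environment with `gym`; the id `MyCustomEnv-v0`, the `my_envs` module, and the episode length are placeholders to adapt to your own environment:

```python
import gym
from gym.envs.registration import register

# Register a custom environment so that gym.make() can construct it by id.
# "my_envs:MyCustomEnv" is a placeholder entry point; point it at your own module and class.
register(
    id="MyCustomEnv-v0",
    entry_point="my_envs:MyCustomEnv",
    max_episode_steps=1000,
)

env = gym.make("MyCustomEnv-v0")
```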