Commit
Add tutorials for benchmark experiment on customized dataset.
zjowowen committed Aug 13, 2024
1 parent 3694716 commit d97a2fb
Showing 4 changed files with 491 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ English | [简体中文(Simplified Chinese)](https://github.com/opendilab/Genera
- [Integrated Algorithms](#integrated-algorithms)
- [Installation](#installation)
- [Quick Start](#quick-start)
- [Tutorials](#tutorials)
- [Benchmark experiments](#benchmark-experiments)

## Features

2 changes: 2 additions & 0 deletions README.zh.md
@@ -21,6 +21,8 @@
- [已集成的生成式强化学习算法](#已集成的生成式强化学习算法)
- [安装](#安装)
- [快速开始](#快速开始)
- [教程](#教程)
- [基线实验](#基线实验)

## 特性

243 changes: 243 additions & 0 deletions grl_pipelines/benchmark/README.md
@@ -64,3 +64,246 @@ pip install -e .
pip install lockfile
pip install "Cython<3.0"
```

## Benchmark experiment for new datasets

GenerativeRL supports benchmarking on new or customized datasets that have not yet been integrated into the framework.
You can follow the steps below to run experiments on your dataset with the GMPO and GMPG algorithms.

### Step 1: Prepare the dataset

Prepare your dataset in the following format:

```python
import torch
import numpy as np

# Placeholder PyTorch tensors; replace [...] with your collected transitions
obs = torch.tensor([...])
action = torch.tensor([...])
next_obs = torch.tensor([...])
reward = torch.tensor([...])
done = torch.tensor([...])

# Convert tensors to numpy arrays
obs_np = obs.numpy()
action_np = action.numpy()
next_obs_np = next_obs.numpy()
reward_np = reward.numpy()
done_np = done.numpy()

# Save to a .npz file
np.savez('data.npz', obs=obs_np, action=action_np, next_obs=next_obs_np, reward=reward_np, done=done_np)
```

An example of a dataset for the LunarLanderContinuous-v2 environment is provided [here](https://drive.google.com/file/d/1YnT-Oeu9LPKuS_ZqNc5kol_pMlJ1DwyG/view?usp=drive_link).
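
Before training, it can help to sanity-check the saved file. The snippet below is a minimal sketch that assumes the keys written by the code above and transitions stacked along the first axis; it loads `data.npz` and prints each array's shape and dtype:

```python
import numpy as np

# Load the dataset saved in Step 1 and inspect its contents.
data = np.load("data.npz")
for key in ["obs", "action", "next_obs", "reward", "done"]:
    array = data[key]
    print(f"{key}: shape={array.shape}, dtype={array.dtype}")

# All arrays are expected to share the same first dimension (the number of transitions).
num_transitions = data["obs"].shape[0]
assert all(data[key].shape[0] == num_transitions for key in data.files)
```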

### Step 2: Run the benchmark experiment

Run the following script to start the benchmark experiment:

```python
import torch
from easydict import EasyDict

env_id = "LunarLanderContinuous-v2" #TODO: Specify the environment ID
action_size = 2 #TODO: Specify the action size
state_size = 8 #TODO: Specify the state size
algorithm_type = "GMPO" #TODO: Specify the algorithm type
solver_type = "ODESolver" #TODO: Specify the solver type
model_type = "DiffusionModel" #TODO: Specify the model type
generative_model_type = "GVP" #TODO: Specify the generative model type
path = dict(type="gvp") #TODO: Specify the path
model_loss_type = "flow_matching" #TODO: Specify the model loss type
data_path = "./data.npz" #TODO: Specify the data path
project_name = f"{env_id}-{algorithm_type}-{generative_model_type}"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
t_embedding_dim = 32
t_encoder = dict(
    type="GaussianFourierProjectionTimeEncoder",
    args=dict(
        embed_dim=t_embedding_dim,
        scale=30.0,
    ),
)
model = dict(
    device=device,
    x_size=action_size,
    solver=dict(
        type="ODESolver",
        args=dict(
            library="torchdiffeq",
        ),
    ),
    path=path,
    reverse_path=path,
    model=dict(
        type="velocity_function",
        args=dict(
            t_encoder=t_encoder,
            backbone=dict(
                type="TemporalSpatialResidualNet",
                args=dict(
                    hidden_sizes=[512, 256, 128],
                    output_dim=action_size,
                    t_dim=t_embedding_dim,
                    condition_dim=state_size,
                    condition_hidden_dim=32,
                    t_condition_hidden_dim=128,
                ),
            ),
        ),
    ),
)

config = EasyDict(
    train=dict(
        project=project_name,
        device=device,
        wandb=dict(project=f"IQL-{env_id}-{algorithm_type}-{generative_model_type}"),
        simulator=dict(
            type="GymEnvSimulator",
            args=dict(
                env_id=env_id,
            ),
        ),
        dataset=dict(
            type="GPCustomizedDataset",
            args=dict(
                env_id=env_id,
                numpy_data_path=data_path,
            ),
        ),
        model=dict(
            GPPolicy=dict(
                device=device,
                model_type=model_type,
                model_loss_type=model_loss_type,
                model=model,
                critic=dict(
                    device=device,
                    q_alpha=1.0,
                    DoubleQNetwork=dict(
                        backbone=dict(
                            type="ConcatenateMLP",
                            args=dict(
                                hidden_sizes=[action_size + state_size, 256, 256],
                                output_size=1,
                                activation="relu",
                            ),
                        ),
                    ),
                    VNetwork=dict(
                        backbone=dict(
                            type="MultiLayerPerceptron",
                            args=dict(
                                hidden_sizes=[state_size, 256, 256],
                                output_size=1,
                                activation="relu",
                            ),
                        ),
                    ),
                ),
            ),
            GuidedPolicy=dict(
                model_type=model_type,
                model=model,
            ),
        ),
        parameter=dict(
            algorithm_type=algorithm_type,
            behaviour_policy=dict(
                batch_size=4096,
                learning_rate=1e-4,
                epochs=0,
            ),
            t_span=32,
            critic=dict(
                batch_size=4096,
                epochs=2000,
                learning_rate=3e-4,
                discount_factor=0.99,
                update_momentum=0.005,
                tau=0.7,
                method="iql",
            ),
            guided_policy=dict(
                batch_size=4096,
                epochs=10000,
                learning_rate=1e-4,
                beta=1.0,
                weight_clamp=100,
            ),
            evaluation=dict(
                eval=True,
                repeat=5,
                interval=100,
            ),
            checkpoint_path=f"./{project_name}/checkpoint",
            checkpoint_freq=10,
        ),
    ),
    deploy=dict(
        device=device,
        env=dict(
            env_id=env_id,
            seed=0,
        ),
        t_span=32,
    ),
)

if __name__ == "__main__":

    import gym
    import d4rl
    import numpy as np

    from grl.algorithms.gmpg import GMPGAlgorithm
    from grl.utils.log import log

    def gp_pipeline(config):

        gp = GMPGAlgorithm(config)

        # ---------------------------------------
        # Customized train code ↓
        # ---------------------------------------
        gp.train()
        # ---------------------------------------
        # Customized train code ↑
        # ---------------------------------------

        # ---------------------------------------
        # Customized deploy code ↓
        # ---------------------------------------

        agent = gp.deploy()
        env = gym.make(config.deploy.env.env_id)
        total_reward_list = []
        for i in range(100):
            observation = env.reset()
            total_reward = 0
            while True:
                # env.render()
                observation, reward, done, _ = env.step(agent.act(observation))
                total_reward += reward
                if done:
                    observation = env.reset()
                    print(f"Episode {i}, total_reward: {total_reward}")
                    total_reward_list.append(total_reward)
                    break

        print(
            f"Average total reward: {np.mean(total_reward_list)}, std: {np.std(total_reward_list)}"
        )

        # ---------------------------------------
        # Customized deploy code ↑
        # ---------------------------------------

    log.info("config: \n{}".format(config))
    gp_pipeline(config)
```

For performance evaluation, the environment has to be registered in the `gym` library so that `gym.make(env_id)` can construct it during deployment. You can refer to the [gym documentation](https://www.gymlibrary.dev/) for more information.
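
As an illustration only, the sketch below registers a hypothetical custom environment with `gym`; the id `MyCustomEnv-v0`, the `my_envs` module, and the episode length are placeholders to adapt to your own environment:

```python
import gym
from gym.envs.registration import register

# Register a custom environment so that gym.make() can construct it by id.
# "my_envs:MyCustomEnv" is a placeholder entry point; point it at your own module and class.
register(
    id="MyCustomEnv-v0",
    entry_point="my_envs:MyCustomEnv",
    max_episode_steps=1000,
)

env = gym.make("MyCustomEnv-v0")
```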