# 3_FrozenLake_Q_Learn.py

import random

import gym
import matplotlib.pyplot as plt
import numpy as np

# Create the FrozenLake environment and seed all sources of randomness for
# reproducibility.  The script uses the older gym API (env.seed(), reset()
# returning a state, step() returning a 4-tuple).
env = gym.make("FrozenLake-v1")
random.seed(0)
np.random.seed(0)
env.seed(0)

print("## Frozen Lake ##")
print("Start state:")
env.render()

# Tabular Q-values: one row per state, one column per action.
no_states = env.observation_space.n
no_actions = env.action_space.n
q_values = np.zeros((no_states, no_actions))
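
# play_episode() below performs the standard tabular Q-learning update after
# every environment step:
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# with learning rate alpha and discount factor gamma.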


def play_episode(q, eps):
    """Play one episode with an eps-greedy policy, updating q in place."""
    alpha = 0.5   # learning rate
    gamma = 1.0   # discount factor
    state = env.reset()
    done = False
    r_s = []   # rewards collected this episode
    s_a = []   # (state, action) pairs visited this episode
    while not done:
        if random.random() > eps:
            # exploit: greedy action, breaking ties uniformly at random
            action = np.random.choice(np.flatnonzero(q[state] == q[state].max()))
        else:
            # explore: uniformly random action over the full action space
            action = np.random.randint(0, no_actions)
        s_a.append((state, action))
        new_state, reward, done, _ = env.step(action)
        r_s.append(reward)
        # Q-learning update towards the bootstrapped target r + gamma * max_a' Q(s', a')
        q[state, action] += alpha * (reward + gamma * q[new_state].max() - q[state, action])
        state = new_state
    return s_a, r_s
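

# A minimal evaluation sketch (not part of the original script): run episodes
# that always act greedily with respect to the learned Q-table and report the
# average return.  The name evaluate_greedy and the episode count are
# illustrative choices; it reuses only the env and Q-table defined above.
def evaluate_greedy(q, n_episodes=100):
    returns = []
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        total = 0.0
        while not done:
            action = int(np.argmax(q[state]))   # always exploit, no exploration
            state, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return np.mean(returns)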


def main():
    no_episodes = 1000
    plot_data = []
    # Q-learning with different exploration rates.  Note that q_values is a
    # module-level table shared across the runs, so each eps setting continues
    # learning from the values left by the previous one.
    for eps in [0.01, 0.1, 0.5, 1.0]:
        rewards = []
        for _ in range(no_episodes):
            s_a, r_s = play_episode(q_values, eps)
            rewards.append(sum(r_s))
        plot_data.append((eps, np.cumsum(rewards), "Q-learn (eps={})".format(eps)))

    # plot the cumulative reward curves for each exploration rate
    plt.figure()
    plt.xlabel("No. of episodes")
    plt.ylabel("Sum of rewards")
    for eps, data, label in plot_data:
        plt.plot(range(no_episodes), data, label=label)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()