
Commit 63b672ab authored by Mathieu Reymond

pdqn on both DST and Minecart

parent e221dc95
import numpy as np
import math
EPS_SPEED = 0.001 # Minimum speed to be considered in motion
HOME_X = .0
HOME_Y = .0
HOME_POS = (HOME_X, HOME_Y)
ROTATION = 10
MAX_SPEED = 1.
FUEL_MINE = -.05
FUEL_ACC = -.025
FUEL_IDLE = -0.005
CAPACITY = 1
ACT_MINE = 0
ACT_LEFT = 1
ACT_RIGHT = 2
ACT_ACCEL = 3
ACT_BRAKE = 4
ACT_NONE = 5
ACTIONS = ["Mine", "Left", "Right", "Accelerate", "Brake", "None"]
ACTION_COUNT = len(ACTIONS)
MINE_RADIUS = 0.14
BASE_RADIUS = 0.15
WIDTH = 480
HEIGHT = 480
# Color definitions
WHITE = (255, 255, 255)
GRAY = (150, 150, 150)
C_GRAY = (150 / 255., 150 / 255., 150 / 255.)
DARK_GRAY = (100, 100, 100)
BLACK = (0, 0, 0)
RED = (255, 70, 70)
C_RED = (1., 70 / 255., 70 / 255.)
FPS = 24
MINE_LOCATION_TRIES = 100
MINE_SCALE = 1.
BASE_SCALE = 1.
CART_SCALE = 1.
MARGIN = 0.16 * CART_SCALE
ACCELERATION = 0.0075 * CART_SCALE
DECELERATION = 1
def clip(val, lo, hi):
    return lo if val <= lo else hi if val >= hi else val


def mag(vector2d):
    return np.sqrt(np.dot(vector2d, vector2d))

class Cart():
    """Class representing the actual minecart
    """

    def __init__(self, ore_cnt):
        self.ore_cnt = ore_cnt
        self.pos = np.array([HOME_X, HOME_Y])
        self.speed = 0
        self.angle = 45
        self.content = np.zeros(self.ore_cnt)
        self.departed = False  # Keep track of whether the agent has left the base

    def accelerate(self, acceleration):
        self.speed = clip(self.speed + acceleration, 0, MAX_SPEED)

    def rotate(self, rotation):
        self.angle = (self.angle + rotation) % 360

    def step(self):
        """
        Update the cart's position, taking the current speed into account.
        Colliding with a border at anything but a straight angle will cause
        the cart to "slide" along the wall.
        """
        pre = np.copy(self.pos)
        if self.speed < EPS_SPEED:
            return False
        x_velocity = self.speed * math.cos(self.angle * math.pi / 180)
        y_velocity = self.speed * math.sin(self.angle * math.pi / 180)
        x, y = self.pos
        if y != 0 and y != 1 and (y_velocity > 0 + EPS_SPEED or
                                  y_velocity < 0 - EPS_SPEED):
            if x == 1 and x_velocity > 0:
                self.angle += math.copysign(ROTATION, y_velocity)
            if x == 0 and x_velocity < 0:
                self.angle -= math.copysign(ROTATION, y_velocity)
        if x != 0 and x != 1 and (x_velocity > 0 + EPS_SPEED or
                                  x_velocity < 0 - EPS_SPEED):
            if y == 1 and y_velocity > 0:
                self.angle -= math.copysign(ROTATION, x_velocity)
            if y == 0 and y_velocity < 0:
                self.angle += math.copysign(ROTATION, x_velocity)
        self.pos[0] = clip(x + x_velocity, 0, 1)
        self.pos[1] = clip(y + y_velocity, 0, 1)
        self.speed = mag(pre - self.pos)
        return True
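
# Illustrative sketch (not part of the commit): driving a Cart directly.
# The cart starts at the base heading 45 degrees; after accelerating, step()
# moves it along that heading, clips the position to the unit square, and
# re-derives the speed from the actual displacement.
#
#   cart = Cart(ore_cnt=2)
#   cart.accelerate(ACCELERATION)   # speed becomes 0.0075
#   cart.step()                     # pos moves ~0.0053 along x and y
#   cart.rotate(ROTATION)           # heading becomes 55 degrees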

class MinecartReward(object):
    """Hand-coded reward model for the Minecart environment: it replays the
    cart physics for `frame_skip` frames and estimates the fuel cost plus any
    ore delivered when the cart re-enters the home base."""

    def __init__(self):
        self.frame_skip = 4
        self.nO = 2
        self.fuel_mine = -.05
        self.fuel_acc = -.025
        self.fuel_idle = -0.005
        self.base_radius = .15
        self.base_scale = 1.
        self.rotation = 10
        self.acc = 0.0075
        self.dec = 1

    def __call__(self, state, action, use_target_network=False):
        r = np.zeros((len(state), self.nO))
        actions = np.zeros(6)
        actions[3] = self.fuel_acc * self.frame_skip
        actions[0] = self.fuel_mine * self.frame_skip
        r[:, -1] = self.fuel_idle * self.frame_skip
        r[:, -1] += actions[action]
        next_pos = np.zeros((len(state), 2))
        for i in range(len(next_pos)):
            c = Cart(1)
            # state layout: [*s['position'], s['speed'], s['orientation'], *s['content']]
            c.pos = state[i, :2]
            c.speed = state[i, 2]
            c.angle = state[i, 3]
            a = action[i]
            for _ in range(self.frame_skip):
                if a == 1:
                    c.rotate(-self.rotation)
                elif a == 2:
                    c.rotate(self.rotation)
                elif a == 3:
                    c.accelerate(self.acc)
                elif a == 4:
                    c.accelerate(-self.dec)
                c.step()
                next_pos[i] = c.pos
                if np.sqrt(np.dot(c.pos, c.pos)) < self.base_scale*self.base_radius:
                    r[i, :(self.nO-1)] += state[i, 4:]
                    break
        return r

    def update(self, *args, **kwargs):
        return 0
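
# Illustrative call (not part of the commit; the state values are arbitrary).
# The state layout follows the comment above: [x, y, speed, orientation, ore].
# The result is a (batch, nO) array: column 0 is the ore delivered if the cart
# re-enters the base within frame_skip frames, column -1 the fuel penalty.
#
#   reward_fn = MinecartReward()
#   states = np.array([[0.10, 0.10, 0.02, 225., 0.7]])
#   acts = np.array([ACT_ACCEL])
#   reward_fn(states, acts)   # -> [[0.7, -0.12]]: ore dropped off, accelerate fuel cost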
@@ -155,7 +155,7 @@ class PrioritizedMemory(Memory):
class Estimator(object):
-    def __init__(self, model, lr=1e-3, tau=1., copy_every=0, clamp=None, device='cpu'):
+    def __init__(self, model, lr=1e-3, tau=1., copy_every=0, clamp=None, scheduler_steps=1000, device='cpu'):
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.device = device
@@ -164,6 +164,10 @@ class Estimator(object):
        self.tau = tau
        self.clamp = clamp
        self.opt = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=0)
+        if scheduler_steps is None:
+            self.lr_scheduler = None
+        else:
+            self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.opt, lambda e: np.amax([(scheduler_steps-e)/scheduler_steps, 0.]))
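        # Descriptive note: LambdaLR multiplies the base learning rate by
        # max((scheduler_steps - e) / scheduler_steps, 0), so the rate decays
        # linearly to 0 over the first scheduler_steps calls to scheduler.step()
        # and then stays at 0.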
        self.loss = nn.MSELoss(reduction='none')

    def should_copy(self, step):
@@ -205,6 +209,8 @@ class Estimator(object):
        self.total_norm = total_norm
        self.opt.step()
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
        if self.should_copy(step):
            self.update_target(self.tau)
@@ -688,12 +694,13 @@ class ParetoApproximator2(nn.Module):
        self.nO = nO
        self.device = device
-        hidden = 64
+        hidden = 32
        self.fc1 = nn.Linear(nS, hidden)
        self.fco = nn.Linear(nO-1, hidden)
        self.fca = nn.Linear(nA, hidden)
        self.fc2 = nn.Linear(3*hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
+        self.fc4 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, 1)

    def forward(self, state, point, action):
@@ -716,8 +723,9 @@ class ParetoApproximator2(nn.Module):
        fc2 = F.relu(fc2)
        fc3 = F.relu(self.fc3(fc2))
+        fc4 = F.relu(self.fc4(fc3))
-        out = self.out(fc3)
+        out = self.out(fc4)
        # ensure length is always positive
        out = F.softplus(out)
        return out
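    # Shape note (inferred from the layer definitions above, not part of the
    # commit): state (B, nS), point (B, nO-1) and action (B, nA) are each
    # embedded to `hidden` units by fc1, fco and fca, concatenated for fc2
    # (which takes 3*hidden inputs), and mapped through fc3 and fc4 to a
    # single softplus-positive "length" per sample.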
@@ -858,6 +866,9 @@ def evaluate_agent(agent, env, save_dir, true_non_dominated=None):
if __name__ == '__main__':
    import deep_sea_treasure
+    import minecart
+    from cart import MinecartReward
+    from pdqn_minecart import MinecartPDQN
    import gym
    import argparse
@@ -869,62 +880,89 @@ if __name__ == '__main__':
    parser.add_argument('--copy-reward', default=1, type=int)
    parser.add_argument('--copy-pareto', default=100, type=int)
    parser.add_argument('--mem-size', default=250000, type=int)
+    parser.add_argument('--gamma', default=1., type=float)
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--epsilon-decrease', default=0.999, type=float)
    parser.add_argument('--rew-estimator', type=str)
    parser.add_argument('--par-estimator', type=str)
    parser.add_argument('--n-samples', type=int, default=10)
    parser.add_argument('--per', action='store_true')
+    parser.add_argument('--env', default='deep-sea-treasure-v0', type=str)
+    parser.add_argument('--scheduler-steps', default=None, type=float)
    parser.set_defaults(normalize=False, per=False)
    args = parser.parse_args()
    print(args)
    device = 'cpu'
-    env = gym.make('deep-sea-treasure-v0')
+    env = gym.make(args.env)
+    env_args = {'deep-sea-treasure-v0':
+                    {'nS': 110,
+                     'max_steps': np.inf,
+                     'nA': 4,
+                     'scale': np.array([124., 19]),
+                     'observe': lambda s: one_hot(env, s)},
+                'MinecartSimpleDeterministic-v0':
+                    {'nS': 5,
+                     'max_steps': np.inf,
+                     'nA': 6,
+                     'scale': np.array([2., 80]),
+                     'observe': lambda s: np.append(s['position'], [s['speed'], s['orientation'], *s['content']])},}
+    nS = env_args[args.env]['nS']
+    nA = env_args[args.env]['nA']
+    env.nA = nA
+    max_steps = env_args[args.env]['max_steps']
    nO = 2
+    ref_point = np.array([-2, -2])
+    normalize = {'min': np.array([0,0]), 'scale': env_args[args.env]['scale']} if args.normalize else None
    def make_q_estimator():
        if args.par_estimator is not None:
            par_model = torch.load(args.par_estimator)
            par_model.device = 'cpu'
        else:
-            par_model = ParetoApproximator2(env.nS, env.nA, nO, device=device).to(device)
-        par_est = Estimator(par_model, lr=args.lr_pareto, copy_every=args.copy_pareto)
+            par_model = ParetoApproximator2(nS, nA, nO, device=device).to(device)
+        par_est = Estimator(par_model, lr=args.lr_pareto, copy_every=args.copy_pareto, scheduler_steps=args.scheduler_steps)
        return par_est
    if args.rew_estimator is not None:
        rew_model = torch.load(args.rew_estimator)
    else:
-        rew_model = RewardApproximator(env.nS, env.nA, nO, device=device).to(device)
+        rew_model = RewardApproximator(nS, nA, nO, device=device).to(device)
    rew_est = Estimator(rew_model, lr=args.lr_reward, copy_every=args.copy_reward)
-    rew_est = DSTReward(env)
+    if args.env == 'deep-sea-treasure-v0':
+        rew_est = DSTReward(env)
+        true_non_dominated = dst_non_dominated(env, normalize)
+        ALG = PDQN
+    else:
+        rew_est = MinecartReward()
+        ALG = MinecartPDQN
    if not args.per:
        memory = Memory(size=args.mem_size)
    else:
        memory = PrioritizedMemory(n_steps=1e5, size=args.mem_size)
-    ref_point = np.array([-2, -2])
-    normalize = {'min': np.array([0,0]), 'scale': np.array([124, 19])} if args.normalize else None
    epsilon_decrease = args.epsilon_decrease
-    true_non_dominated = dst_non_dominated(env, normalize)
-    agent = PDQN(env, policy=lambda s, q, e: action_selection(s, q, e, ref_point),
+    agent = ALG(env, policy=lambda s, q, e: action_selection(s, q, e, ref_point),
                memory=memory,
-                observe=lambda s: one_hot(env, s),
+                observe=env_args[args.env]['observe'],
                estimate_reward=rew_est,
                make_q_estimator=make_q_estimator,
                normalize_reward=normalize,
                nO=nO,
                learn_start=args.learn_start,
                batch_size=args.batch_size,
-                gamma=1.,
+                gamma=args.gamma,
                n_samples=args.n_samples)
-    logdir = '/tmp/runs/pdqn/per_{}/lr_reward_{:.2E}/copy_reward_{}/lr_pareto_{:.2E}/copy_pareto_{}/epsilon_dec_{}/samples_{}/'.format(
-        int(args.per), args.lr_reward, args.copy_reward, args.lr_pareto, args.copy_pareto, args.epsilon_decrease, args.n_samples
-    )
+    logdir = (f'/tmp/runs/pdqn/per_{int(args.per)}'
+              f'/lr_reward_{args.lr_reward:.2E}'
+              f'/copy_reward_{args.copy_reward}'
+              f'/lr_pareto_{args.lr_pareto:.2E}'
+              f'/copy_pareto_{args.copy_pareto}'
+              f'/epsilon_dec_{args.epsilon_decrease}'
+              f'/samples_{args.n_samples}/')
    # evaluate_agent(agent, env, logdir, true_non_dominated)
    agent.train(100000, logdir=logdir)
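    # Illustrative invocation (the script name is hypothetical and the values
    # are arbitrary; only flags defined by the parser above are used):
    #
    #   python pdqn.py --env MinecartSimpleDeterministic-v0 --gamma 0.98 \
    #       --scheduler-steps 10000 --n-samples 10 --normalize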