Commit 63b672ab authored by Mathieu Reymond

pdqn on both DST and Minecart

parent e221dc95
import numpy as np
import math
EPS_SPEED = 0.001 # Minimum speed to be considered in motion
HOME_X = .0
HOME_Y = .0
HOME_POS = (HOME_X, HOME_Y)
ROTATION = 10
MAX_SPEED = 1.
FUEL_MINE = -.05
FUEL_ACC = -.025
FUEL_IDLE = -0.005
CAPACITY = 1
ACT_MINE = 0
ACT_LEFT = 1
ACT_RIGHT = 2
ACT_ACCEL = 3
ACT_BRAKE = 4
ACT_NONE = 5
ACTIONS = ["Mine", "Left", "Right", "Accelerate", "Brake", "None"]
ACTION_COUNT = len(ACTIONS)
MINE_RADIUS = 0.14
BASE_RADIUS = 0.15
WIDTH = 480
HEIGHT = 480
# Color definitions
WHITE = (255, 255, 255)
GRAY = (150, 150, 150)
C_GRAY = (150 / 255., 150 / 255., 150 / 255.)
DARK_GRAY = (100, 100, 100)
BLACK = (0, 0, 0)
RED = (255, 70, 70)
C_RED = (1., 70 / 255., 70 / 255.)
FPS = 24
MINE_LOCATION_TRIES = 100
MINE_SCALE = 1.
BASE_SCALE = 1.
CART_SCALE = 1.
MARGIN = 0.16 * CART_SCALE
ACCELERATION = 0.0075 * CART_SCALE
DECELERATION = 1
def clip(val, lo, hi):
    return lo if val <= lo else hi if val >= hi else val


def mag(vector2d):
    return np.sqrt(np.dot(vector2d, vector2d))
class Cart():
    """Class representing the actual minecart
    """

    def __init__(self, ore_cnt):
        self.ore_cnt = ore_cnt
        self.pos = np.array([HOME_X, HOME_Y])
        self.speed = 0
        self.angle = 45
        self.content = np.zeros(self.ore_cnt)
        self.departed = False  # Keep track of whether the agent has left the base

    def accelerate(self, acceleration):
        self.speed = clip(self.speed + acceleration, 0, MAX_SPEED)

    def rotate(self, rotation):
        self.angle = (self.angle + rotation) % 360

    def step(self):
        """
        Update the cart's position, taking the current speed into account.
        Colliding with a border at anything but a straight angle will cause
        the cart to "slide" along the wall.
        """
        pre = np.copy(self.pos)
        if self.speed < EPS_SPEED:
            return False
        x_velocity = self.speed * math.cos(self.angle * math.pi / 180)
        y_velocity = self.speed * math.sin(self.angle * math.pi / 180)
        x, y = self.pos
        # Sliding along the left/right borders: nudge the heading by ROTATION degrees
        if y != 0 and y != 1 and (y_velocity > 0 + EPS_SPEED or
                                  y_velocity < 0 - EPS_SPEED):
            if x == 1 and x_velocity > 0:
                self.angle += math.copysign(ROTATION, y_velocity)
            if x == 0 and x_velocity < 0:
                self.angle -= math.copysign(ROTATION, y_velocity)
        # Sliding along the top/bottom borders
        if x != 0 and x != 1 and (x_velocity > 0 + EPS_SPEED or
                                  x_velocity < 0 - EPS_SPEED):
            if y == 1 and y_velocity > 0:
                self.angle -= math.copysign(ROTATION, x_velocity)
            if y == 0 and y_velocity < 0:
                self.angle += math.copysign(ROTATION, x_velocity)
        self.pos[0] = clip(x + x_velocity, 0, 1)
        self.pos[1] = clip(y + y_velocity, 0, 1)
        self.speed = mag(pre - self.pos)
        return True
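
A quick illustration of the border "sliding" described in Cart.step's docstring (this snippet is not part of the file, and the starting values are made up): a cart pressed against the right wall keeps creeping along it while its heading is turned by ROTATION degrees per step.

# Illustrative only, not part of cart.py.
cart = Cart(1)
cart.pos = np.array([1., 0.3])  # already touching the right border
cart.angle = 20                 # heading mostly into the wall
cart.speed = 0.05
for _ in range(5):
    cart.step()
    print(cart.pos, cart.angle)  # x stays clipped at 1.0, y creeps upward, angle grows by 10 per step
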
class MinecartReward(object):
    """Hand-coded reward model for Minecart: predicts [ore delivered, fuel used]
    for one frame-skipped step, instead of a learned RewardApproximator."""

    def __init__(self):
        self.frame_skip = 4
        self.nO = 2
        self.fuel_mine = -.05
        self.fuel_acc = -.025
        self.fuel_idle = -0.005
        self.base_radius = .15
        self.base_scale = 1.
        self.rotation = 10
        self.acc = 0.0075
        self.dec = 1

    def __call__(self, state, action, use_target_network=False):
        r = np.zeros((len(state), self.nO))
        # Per-action fuel penalties, applied on top of the idle consumption
        actions = np.zeros(6)
        actions[3] = self.fuel_acc * self.frame_skip   # ACT_ACCEL
        actions[0] = self.fuel_mine * self.frame_skip  # ACT_MINE
        r[:, -1] = self.fuel_idle * self.frame_skip
        r[:, -1] += actions[action]
        next_pos = np.zeros((len(state), 2))
        for i in range(len(next_pos)):
            c = Cart(1)
            # state layout: [*s['position'], s['speed'], s['orientation'], *s['content']]
            c.pos = state[i, :2]
            c.speed = state[i, 2]
            c.angle = state[i, 3]
            a = action[i]
            for _ in range(self.frame_skip):
                if a == 1:
                    c.rotate(-self.rotation)
                elif a == 2:
                    c.rotate(self.rotation)
                elif a == 3:
                    c.accelerate(self.acc)
                elif a == 4:
                    c.accelerate(-self.dec)
                c.step()
                next_pos[i] = c.pos
                # Back at the home base: the carried ore is delivered as reward
                if np.sqrt(np.dot(c.pos, c.pos)) < self.base_scale * self.base_radius:
                    r[i, :(self.nO - 1)] += state[i, 4:]
                    break
        return r

    def update(self, *args, **kwargs):
        # The reward model is fixed, so there is nothing to train
        return 0
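
A minimal usage sketch of this hand-coded reward model (not part of the commit; the observation layout [x, y, speed, orientation, content] and the action indices come from the code above, the numeric values are invented):

# Hypothetical example: two observations and their chosen actions.
states = np.array([[0.50, 0.50, 0.02, 45., 0.0],    # empty cart, mid-field
                   [0.05, 0.05, 0.01, 225., 0.7]])  # loaded cart, next to the home base
actions = np.array([ACT_ACCEL, ACT_NONE])
rewards = MinecartReward()(states, actions)
# rewards has shape (2, 2): column 0 is ore delivered at the base, column 1 is fuel consumed
print(rewards)
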
@@ -155,7 +155,7 @@ class PrioritizedMemory(Memory):
class Estimator(object):
def __init__(self, model, lr=1e-3, tau=1., copy_every=0, clamp=None, device='cpu'):
def __init__(self, model, lr=1e-3, tau=1., copy_every=0, clamp=None, scheduler_steps=1000, device='cpu'):
self.model = model
self.target_model = copy.deepcopy(model)
self.device = device
@@ -164,6 +164,10 @@ class Estimator(object):
self.tau = tau
self.clamp = clamp
self.opt = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=0)
if scheduler_steps is None:
self.lr_scheduler = None
else:
self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.opt, lambda e: np.amax([(scheduler_steps-e)/scheduler_steps, 0.]))
self.loss = nn.MSELoss(reduction='none')
def should_copy(self, step):
@@ -205,6 +209,8 @@ class Estimator(object):
self.total_norm = total_norm
self.opt.step()
if self.lr_scheduler is not None:
self.lr_scheduler.step()
if self.should_copy(step):
self.update_target(self.tau)
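
The new scheduler_steps option makes the Estimator decay its learning rate linearly to zero over scheduler_steps optimizer updates and then hold it at zero. A self-contained sketch of that schedule (the model and optimizer below are placeholders, not objects from this repo):

import numpy as np
import torch
import torch.nn as nn

scheduler_steps = 1000
model = nn.Linear(4, 1)  # dummy model, for illustration only
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.LambdaLR(
    opt, lambda e: np.amax([(scheduler_steps - e) / scheduler_steps, 0.]))
for step in range(3000):
    opt.step()    # multiplier after `step` updates: max((1000 - step) / 1000, 0)
    sched.step()
print(opt.param_groups[0]['lr'])  # 0.0 once scheduler_steps updates have passed
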
@@ -688,12 +694,13 @@ class ParetoApproximator2(nn.Module):
self.nO = nO
self.device = device
hidden = 64
hidden = 32
self.fc1 = nn.Linear(nS, hidden)
self.fco = nn.Linear(nO-1, hidden)
self.fca = nn.Linear(nA, hidden)
self.fc2 = nn.Linear(3*hidden, hidden)
self.fc3 = nn.Linear(hidden, hidden)
self.fc4 = nn.Linear(hidden, hidden)
self.out = nn.Linear(hidden, 1)
def forward(self, state, point, action):
@@ -716,8 +723,9 @@ class ParetoApproximator2(nn.Module):
fc2 = F.relu(fc2)
fc3 = F.relu(self.fc3(fc2))
fc4 = F.relu(self.fc4(fc3))
out = self.out(fc3)
out = self.out(fc4)
# ensure length is always positive
out = F.softplus(out)
return out
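
For reference, softplus(x) = log(1 + exp(x)) maps any raw network output to a strictly positive value, so the predicted length can shrink toward zero but never turn negative. A tiny standalone check (illustrative only):

import torch
import torch.nn.functional as F

raw = torch.tensor([-3.0, 0.0, 3.0])
print(F.softplus(raw))  # ~ tensor([0.0486, 0.6931, 3.0486]); always > 0
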
@@ -858,6 +866,9 @@ def evaluate_agent(agent, env, save_dir, true_non_dominated=None):
if __name__ == '__main__':
import deep_sea_treasure
import minecart
from cart import MinecartReward
from pdqn_minecart import MinecartPDQN
import gym
import argparse
@@ -869,62 +880,89 @@ if __name__ == '__main__':
parser.add_argument('--copy-reward', default=1, type=int)
parser.add_argument('--copy-pareto', default=100, type=int)
parser.add_argument('--mem-size', default=250000, type=int)
parser.add_argument('--gamma', default=1., type=float)
parser.add_argument('--normalize', action='store_true')
parser.add_argument('--epsilon-decrease', default=0.999, type=float)
parser.add_argument('--rew-estimator', type=str)
parser.add_argument('--par-estimator', type=str)
parser.add_argument('--n-samples', type=int, default=10)
parser.add_argument('--per', action='store_true')
parser.add_argument('--env', default='deep-sea-treasure-v0', type=str)
parser.add_argument('--scheduler-steps', default=None, type=float)
parser.set_defaults(normalize=False, per=False)
args = parser.parse_args()
print(args)
device = 'cpu'
env = gym.make('deep-sea-treasure-v0')
env = gym.make(args.env)
env_args = {'deep-sea-treasure-v0':
{'nS': 110,
'max_steps': np.inf,
'nA': 4,
'scale': np.array([124., 19]),
'observe': lambda s: one_hot(env, s)},
'MinecartSimpleDeterministic-v0':
{'nS': 5,
'max_steps': np.inf,
'nA': 6,
'scale': np.array([2., 80]),
'observe': lambda s: np.append(s['position'], [s['speed'], s['orientation'], *s['content']])},}
nS = env_args[args.env]['nS']
nA = env_args[args.env]['nA']
env.nA = nA
max_steps = env_args[args.env]['max_steps']
nO = 2
ref_point = np.array([-2, -2])
normalize = {'min': np.array([0,0]), 'scale': env_args[args.env]['scale']} if args.normalize else None
def make_q_estimator():
if args.par_estimator is not None:
par_model = torch.load(args.par_estimator)
par_model.device = 'cpu'
else:
par_model = ParetoApproximator2(env.nS, env.nA, nO, device=device).to(device)
par_est = Estimator(par_model, lr=args.lr_pareto, copy_every=args.copy_pareto)
par_model = ParetoApproximator2(nS, nA, nO, device=device).to(device)
par_est = Estimator(par_model, lr=args.lr_pareto, copy_every=args.copy_pareto, scheduler_steps=args.scheduler_steps)
return par_est
if args.rew_estimator is not None:
rew_model = torch.load(args.rew_estimator)
else:
rew_model = RewardApproximator(env.nS, env.nA, nO, device=device).to(device)
rew_model = RewardApproximator(nS, nA, nO, device=device).to(device)
rew_est = Estimator(rew_model, lr=args.lr_reward, copy_every=args.copy_reward)
rew_est = DSTReward(env)
if args.env == 'deep-sea-treasure-v0':
rew_est = DSTReward(env)
true_non_dominated = dst_non_dominated(env, normalize)
ALG = PDQN
else:
rew_est = MinecartReward()
ALG = MinecartPDQN
if not args.per:
memory = Memory(size=args.mem_size)
else:
memory = PrioritizedMemory(n_steps=1e5, size=args.mem_size)
ref_point = np.array([-2, -2])
normalize = {'min': np.array([0,0]), 'scale': np.array([124, 19])} if args.normalize else None
epsilon_decrease = args.epsilon_decrease
true_non_dominated = dst_non_dominated(env, normalize)
agent = PDQN(env, policy=lambda s, q, e: action_selection(s, q, e, ref_point),
agent = ALG(env, policy=lambda s, q, e: action_selection(s, q, e, ref_point),
memory=memory,
observe=lambda s: one_hot(env, s),
observe=env_args[args.env]['observe'],
estimate_reward=rew_est,
make_q_estimator=make_q_estimator,
normalize_reward=normalize,
nO=nO,
learn_start=args.learn_start,
batch_size=args.batch_size,
gamma=1.,
gamma=args.gamma,
n_samples=args.n_samples)
logdir = '/tmp/runs/pdqn/per_{}/lr_reward_{:.2E}/copy_reward_{}/lr_pareto_{:.2E}/copy_pareto_{}/epsilon_dec_{}/samples_{}/'.format(
int(args.per), args.lr_reward, args.copy_reward, args.lr_pareto, args.copy_pareto, args.epsilon_decrease, args.n_samples
)
logdir = (f'/tmp/runs/pdqn/per_{int(args.per)}'
          f'/lr_reward_{args.lr_reward:.2E}'
          f'/copy_reward_{args.copy_reward}'
          f'/lr_pareto_{args.lr_pareto:.2E}'
          f'/copy_pareto_{args.copy_pareto}'
          f'/epsilon_dec_{args.epsilon_decrease}'
          f'/samples_{args.n_samples}/')
# evaluate_agent(agent, env, logdir, true_non_dominated)
agent.train(100000, logdir=logdir)
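
With the new flags, a Minecart run could be launched roughly as `python pdqn.py --env MinecartSimpleDeterministic-v0 --gamma 0.98 --scheduler-steps 100000 --normalize`; the script name and the flag values are only illustrative and are not shown on this page. Omitting --env and --gamma keeps the previous behaviour (deep-sea-treasure-v0 with gamma = 1).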