Commit 42a77ab8 authored by Denis Steckelmacher

One ProgramOptimizer per action dimension, to help with convergence

parent 3d659075
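This commit splits the single multi-dimensional ProgramOptimizer into one independent optimizer per action dimension, each evolving a scalar program. Roughly, the new layout used by the training script is the following (a sketch assembled from the hunks below; the policy helper is illustrative only):

    # One independent program optimizer per action dimension
    program_optimizers = [ProgramOptimizer(args) for _ in range(env.action_space.shape[0])]

    def policy(obs):
        # Each optimizer contributes one scalar; stacking them yields the full action vector
        return np.array([opt.get_action(obs) for opt in program_optimizers], dtype=np.float32)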
@@ -75,12 +75,12 @@ class Args:
     """noise clip parameter of the Target Policy Smoothing Regularization"""

     # Parameters for the program optimizer
-    num_individuals: int = 100
+    num_individuals: int = 50
     num_genes: int = 4
     num_eval_runs: int = 10
     num_generations: int = 20
-    num_parents_mating: int = 50
+    num_parents_mating: int = 20
     keep_parents: int = 5
     mutation_percent_genes: int = 10
@@ -115,24 +115,19 @@ class QNetwork(nn.Module):
         return x


-def get_state_actions(program_optimizer, obs, env, args, grad_required=False):
+def get_state_actions(program_optimizers, obs, env, args):
     program_actions = []
-    obs = obs.detach().numpy()

     for i, o in enumerate(obs):
         action = np.zeros(env.action_space.shape, dtype=np.float32)

-        for eval_run in range(1):
-            action += program_optimizer.get_actions_from_solution(
-                program_optimizer.best_solution,
-                o
-            )
+        for eval_run in range(args.num_eval_runs):
+            for action_index in range(env.action_space.shape[0]):
+                action[action_index] += program_optimizers[action_index].get_action(o)

         program_actions.append(action / args.num_eval_runs)

-    program_actions = torch.tensor(np.array(program_actions), requires_grad=grad_required)
-    return program_actions
+    return np.array(program_actions)


 @pyrallis.wrap()
 def run_synthesis(args: Args):
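Because each program evaluation is stochastic, the rewritten get_state_actions averages args.num_eval_runs evaluations of every dimension's best program and now returns a plain NumPy array, leaving any torch conversion to the caller. A self-contained sketch of the same averaging logic, with a stub optimizer so it runs on its own (only get_action matches the real interface; everything else is illustrative):

    import numpy as np

    class StubOptimizer:
        """Stands in for ProgramOptimizer; get_action is the only method used here."""
        def get_action(self, state):
            # Noisy stand-in for a stochastic program evaluation
            return float(np.sum(state)) + np.random.normal(0.0, 0.01)

    def average_actions(optimizers, obs_batch, num_eval_runs):
        program_actions = []
        for o in obs_batch:
            action = np.zeros(len(optimizers), dtype=np.float32)
            for _ in range(num_eval_runs):
                for i, opt in enumerate(optimizers):
                    action[i] += opt.get_action(o)
            program_actions.append(action / num_eval_runs)  # average over the stochastic evaluations
        return np.array(program_actions)

    # Example: 2-dimensional action space, batch of 3 observations
    print(average_actions([StubOptimizer(), StubOptimizer()], np.random.rand(3, 4), 10).shape)  # (3, 2)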
@@ -168,7 +163,7 @@ def run_synthesis(args: Args):
     assert isinstance(env.action_space, gym.spaces.Box), "only continuous action space is supported"

     # Actor is a learnable program
-    program_optimizer = ProgramOptimizer(args, env.action_space.shape)
+    program_optimizers = [ProgramOptimizer(args) for i in range(env.action_space.shape[0])]

     qf1 = QNetwork(env).to(device)
     qf2 = QNetwork(env).to(device)
@@ -198,10 +193,8 @@ def run_synthesis(args: Args):
             action = env.action_space.sample()
         else:
            with torch.no_grad():
-                action = program_optimizer.get_actions_from_solution(
-                    program_optimizer.best_solution,
-                    obs
-                )
+                action = get_state_actions(program_optimizers, obs[None, :], env, args)[0]
+                print('ACTION', action)

         # TRY NOT TO MODIFY: execute the game and log data.
         next_obs, reward, termination, truncation, info = env.step(action)
@@ -227,7 +220,8 @@ def run_synthesis(args: Args):
                 )

                 # Go over all observations the buffer provides
-                next_state_actions = get_state_actions(program_optimizer, data.next_observations, env, args)
+                next_state_actions = get_state_actions(program_optimizers, data.next_observations.detach().numpy(), env, args)
+                next_state_actions = torch.tensor(next_state_actions)
                 next_state_actions = (next_state_actions + clipped_noise).clamp(
                     env.action_space.low[0], env.action_space.high[0]).float()
@@ -251,7 +245,8 @@ def run_synthesis(args: Args):
             # Optimize the program
             if global_step % args.policy_frequency == 0:
-                program_actions = get_state_actions(program_optimizer, data.observations, env, args, grad_required=True)
+                program_actions = get_state_actions(program_optimizers, data.observations.detach().numpy(), env, args)
+                program_actions = torch.tensor(program_actions, requires_grad=True)
                 program_objective = qf1(data.observations, program_actions).mean()
                 program_objective.backward()
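The program outputs are produced outside the computation graph, so the updated code rebuilds them as a tensor with requires_grad=True; the critic's gradient with respect to those actions is then used in the next hunk to nudge them uphill (improved_actions = program_actions + 0.1 * program_actions.grad) before the programs are refit on them. A toy illustration of that gradient-ascent step with a stand-in critic (all names and values here are illustrative):

    import torch

    program_actions = torch.tensor([[0.2], [-0.4]], requires_grad=True)  # program outputs as a leaf tensor
    q_values = -(program_actions - 0.5) ** 2                             # stand-in critic, maximal at a = 0.5
    q_values.mean().backward()                                           # gradient of the mean Q w.r.t. the actions
    improved_actions = program_actions + 0.1 * program_actions.grad      # small step uphill on Q
    print(improved_actions.detach().numpy())                             # both actions move toward 0.5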
@@ -259,11 +254,16 @@ def run_synthesis(args: Args):
                 improved_actions = program_actions + 0.1 * program_actions.grad
                 RES.append(improved_actions[0].detach().numpy())

-                program_optimizer.fit(states=data.observations.detach().numpy(),
-                    actions=improved_actions.detach().numpy())
+                # Fit the program optimizers on all the action dimensions
+                states = data.observations.detach().numpy()
+                actions = improved_actions.detach().numpy()
+
+                for action_index in range(env.action_space.shape[0]):
+                    program_optimizers[action_index].fit(states, actions[:, action_index])

                 # Print program
-                program_optimizer.print_best_solution()
+                program_optimizers.print_best_solution()

             # update the target network
             for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
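Note that print_best_solution no longer exists on the refactored class (it is removed below in favour of get_action), and program_optimizers is now a plain list, so the per-program printout has to loop over the optimizers. A sketch that reuses only the Program API visible in the removed method:

    # Print the best program of each action dimension (illustrative loop)
    for i, optimizer in enumerate(program_optimizers):
        p = Program(genome=optimizer.best_solution)
        print(f'a[{i}] =', p.run_program([0.0], do_print=True))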
@@ -8,11 +8,10 @@ from dataclasses import dataclass
 from postfix_program import Program, NUM_OPERATORS


 class ProgramOptimizer:
-    def __init__(self, config, action_shape):
+    def __init__(self, config):

         # Create the initial population
-        self.action_shape = action_shape
-        self.initial_program = [0.0] * (config.num_genes * 2 * action_shape[0])    # Mean and log_std for each gene, for each action dimension
+        self.initial_program = [0.0] * (config.num_genes * 2)    # Mean and log_std for each gene
         self.best_solution = self.initial_program
         self.best_fitness = None
@@ -20,31 +19,20 @@ class ProgramOptimizer:
         self.config = config
         self.initial_population = [np.array(self.initial_program) for i in range(config.num_individuals)]

-    def get_actions_from_solution(self, solution, state):
-        # One program per action dimension
-        program_length = self.config.num_genes * 2
-        programs = [
-            Program(genome=solution[i*program_length : (i+1)*program_length])
-            for i in range(self.action_shape[0])
-        ]
-
-        return np.array([p(state) for p in programs], dtype=np.float32)
-
-    def print_best_solution(self):
-        program_length = self.config.num_genes * 2
-
-        for i in range(self.action_shape[0]):
-            p = Program(genome=self.best_solution[i*program_length : (i+1)*program_length])
-            print(f'a[{i}] =', p.run_program([0.0], do_print=True))
+    def get_action(self, state):
+        program = Program(genome=self.best_solution)
+        return program(state)

     def _fitness_func(self, ga_instance, solution, solution_idx):
         batch_size = self.states.shape[0]
         sum_error = 0.0

+        program = Program(genome=solution)

         # Evaluate the program several times, because evaluations are stochastic
         for eval_run in range(self.config.num_eval_runs):
             for index in range(batch_size):
-                action = self.get_actions_from_solution(solution, self.states[index])
+                action = program(self.states[index])
                 desired_action = self.actions[index]

                 sum_error += np.mean((action - desired_action) ** 2)
@@ -55,7 +43,9 @@ class ProgramOptimizer:
     def fit(self, states, actions):
         """ states is a batch of states, shape (N, state_shape)
-            actions is a batch of actions, shape (N, action_shape), we assume continuous actions
+            actions is a batch of actions, shape (N,), we assume continuous actions
+
+            NOTE: One ProgramOptimizer has to be used for each action dimension
         """
         self.states = states    # picklable self._fitness_func needs these instance variables
         self.actions = actions
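The remainder of fit is outside this hunk; judging from the config fields and the ga_instance attribute used below, it drives a PyGAD run over the population. Purely as an assumed illustration of that wiring (not necessarily the author's exact call), a toy version with a linear stand-in for the Program could look like this:

    import numpy as np
    import pygad

    # Toy targets in the new (N,) convention: one scalar action per state
    states = np.random.random_sample((10, 2))
    actions = np.sum(states, axis=1)

    def fitness_func(ga_instance, solution, solution_idx):
        # The real code evaluates a Program(genome=solution); here a linear readout stands in
        predictions = states @ solution[:2] + solution[2]
        return -float(np.mean((predictions - actions) ** 2))  # negated MSE, higher is better

    ga_instance = pygad.GA(
        num_generations=20,
        num_parents_mating=10,
        fitness_func=fitness_func,
        initial_population=[np.zeros(3) for _ in range(50)],
        mutation_percent_genes=10,
        keep_parents=5,
    )
    ga_instance.run()
    best_solution = ga_instance.best_solution()[0]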
@@ -88,47 +78,3 @@ class ProgramOptimizer:

         # Best solution for now
         self.best_solution = self.ga_instance.best_solution()[0]
-
-
-@dataclass
-class Config:
-    num_individuals: int = 1000
-    num_genes: int = 10
-    num_generations: int = 20
-    num_parents_mating: int = 10
-    keep_parents: int = 5
-    mutation_percent_genes: int = 10
-    keep_elites: int = 5
-
-
-@pyrallis.wrap()
-def main(config: Config):
-    optim = ProgramOptimizer(config)
-
-    # Sample states and actions
-    #states = np.array([
-    #    [1.0],
-    #    [2.0],
-    #    [-5.0],
-    #    [10.0],
-    #])
-    #states = np.array([[1.0, 2.0], [2.0, 4.0]])
-    #actions = np.array([[3.0], [6.0]])
-
-    states = np.random.random_sample((10, 2))
-    actions = np.sum(states, axis=1)
-    actions = np.reshape(actions, (10, 1))
-
-    #states = np.load('runs/InvertedPendulum-v4__TD3__1__1720706887/TD3.cleanrl_model_OBSERVATIONS.npy')
-    #actions = np.load('runs/InvertedPendulum-v4__TD3__1__1720706887/TD3.cleanrl_model_ACTIONS.npy')
-
-    # Fit
-    optim.fit(states, actions)
-
-    # Plot
-    optim.ga_instance.plot_fitness()
-
-
-if __name__ == '__main__':
-    main()
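The standalone Config/main test harness above is removed by this commit. A comparable sanity check against the new one-optimizer-per-dimension interface could look like the following sketch (it assumes the module defining ProgramOptimizer is importable and that a config object with the fields from Args, or the removed Config, is available):

    import numpy as np

    # Toy data as in the removed harness: 10 two-dimensional states, one scalar target per state
    states = np.random.random_sample((10, 2))
    actions = np.sum(states, axis=1)          # shape (N,), matching the new fit() convention

    optim = ProgramOptimizer(config)          # one optimizer handles exactly one action dimension
    optim.fit(states, actions)
    print(optim.get_action(states[0]))        # stochastic output of the best program for the first state
    optim.ga_instance.plot_fitness()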