From 4e48bfcab12e7033c158e2389f4d36cd33c7f90e Mon Sep 17 00:00:00 2001 From: Denis Steckelmacher <steckdenis@yahoo.fr> Date: Fri, 23 Aug 2024 12:58:53 +0200 Subject: [PATCH] Learns on SimpleGoal :D --- TD3_program_synthesis.py | 9 +++++++-- optim.py | 14 +++++++------- postfix_program.py | 11 ++++++----- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/TD3_program_synthesis.py b/TD3_program_synthesis.py index 4a1f830..245b161 100644 --- a/TD3_program_synthesis.py +++ b/TD3_program_synthesis.py @@ -152,7 +152,12 @@ def run_synthesis(args: Args): assert isinstance(env.action_space, gym.spaces.Box), "only continuous action space is supported" # Actor is a learnable program - program_optimizers = [ProgramOptimizer(args, env.observation_space, env.action_space) for i in range(env.action_space.shape[0])] + program_optimizers = [ProgramOptimizer( + args, + env.observation_space, + env.action_space.low[i], + env.action_space.high[i] + ) for i in range(env.action_space.shape[0])] for action_index in range(env.action_space.shape[0]): print(f"a[{action_index}] = {program_optimizers[action_index].get_best_solution_str()}") @@ -247,7 +252,7 @@ def run_synthesis(args: Args): cur_program_actions = np.copy(orig_program_actions) print('BEFORE ACTIONS', orig_program_actions[0]) - for i in range(10): + for i in range(50): program_actions = torch.tensor(cur_program_actions, requires_grad=True) program_objective_1 = qf1(data.observations, program_actions).mean() diff --git a/optim.py b/optim.py index 7948542..8b0424c 100644 --- a/optim.py +++ b/optim.py @@ -11,9 +11,11 @@ def print_fitness(ga, fitnesses): print('F', fitnesses.mean(), file=sys.stderr) class ProgramOptimizer: - def __init__(self, config, state_space, action_space): + def __init__(self, config, state_space, low, high): + self.config = config self.state_dim = state_space.shape[0] - self.action_space = action_space + self.low = low + self.high = high # Create the initial population # We create it so these random programs try all the operators and read all the state variables @@ -23,10 +25,8 @@ class ProgramOptimizer: self.best_solution = self.initial_population[0] self.best_fitness = None - self.config = config - def get_action(self, state): - program = Program(self.best_solution, self.state_dim, self.action_space) + program = Program(self.best_solution, self.state_dim, self.low, self.high) try: return program(state) @@ -34,7 +34,7 @@ class ProgramOptimizer: return np.random.normal() def get_best_solution_str(self): - program = Program(self.best_solution, self.state_dim, self.action_space) + program = Program(self.best_solution, self.state_dim, self.low, self.high) try: return program.to_string() @@ -42,7 +42,7 @@ class ProgramOptimizer: return '<invalid program>' def _fitness_func(self, ga_instance, solution, solution_idx): - program = Program(solution, self.state_dim, self.action_space) + program = Program(solution, self.state_dim, self.low, self.high) try: # Num input variables looked at diff --git a/postfix_program.py b/postfix_program.py index 1aff651..ad73769 100644 --- a/postfix_program.py +++ b/postfix_program.py @@ -41,10 +41,11 @@ class InvalidProgramException(Exception): pass class Program: - def __init__(self, genome, state_dim, action_space): + def __init__(self, genome, state_dim, low, high): self.tokens = genome self.state_dim = state_dim - self.action_space = action_space + self.low = low + self.high = high def to_string(self): def on_literal_func(stack, token): @@ -96,7 +97,7 @@ class Program: x /= AVG # Clip action - x = np.clip(x, self.action_space.low, self.action_space.high) + x = np.clip(x, self.low, self.high) return x def num_inputs_looked_at(self): @@ -157,7 +158,7 @@ def dbg_average(): for i in range(100000): dna = np.random.random((l,)) dna *= -(NUM_OPERATORS + 1) # Tokens between -NUM_OPERATORS - state_dim and 0 - p = Program(dna, 1) + p = Program(dna, 1, -1.0, 1.0) try: values.append(p([0.0])) @@ -177,7 +178,7 @@ def dbg_random_functions(): dna = np.random.random((5,)) dna *= -(NUM_OPERATORS + 1) # Tokens between -NUM_OPERATORS - state_dim and 0 - p = Program(dna, 2, gym.spaces.Box(low=0.0, high=1.0, shape=(1,))) + p = Program(dna, 2, 0.0, 1.0) print(p.to_string()) -- GitLab