From 4e48bfcab12e7033c158e2389f4d36cd33c7f90e Mon Sep 17 00:00:00 2001
From: Denis Steckelmacher <steckdenis@yahoo.fr>
Date: Fri, 23 Aug 2024 12:58:53 +0200
Subject: [PATCH] Use scalar per-dimension action bounds; learns on SimpleGoal :D
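
ProgramOptimizer and Program now receive the scalar low/high bounds of the
single action dimension they handle instead of the whole gym action space,
and the clipping in Program uses those scalars directly. The action
refinement loop in run_synthesis also runs 50 iterations instead of 10.
With these changes the synthesized programs learn on SimpleGoal.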

---
 TD3_program_synthesis.py |  9 +++++++--
 optim.py                 | 14 +++++++-------
 postfix_program.py       | 11 ++++++-----
 3 files changed, 20 insertions(+), 14 deletions(-)
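
Notes: ProgramOptimizer and Program now take the scalar bounds of a single
action dimension, so callers build one optimizer per output dimension. A
minimal sketch of the new call pattern, assuming optim.py is importable;
the Args construction, the gymnasium import, and the Pendulum-v1
environment are illustrative stand-ins, not part of this patch:

    import gymnasium as gym
    import numpy as np
    from optim import ProgramOptimizer

    # assumption: Args is the config dataclass from TD3_program_synthesis.py
    # and its defaults are enough to construct a ProgramOptimizer
    from TD3_program_synthesis import Args
    args = Args()

    env = gym.make("Pendulum-v1")   # any env with a Box action space
    # one ProgramOptimizer per action dimension, each with scalar bounds
    optimizers = [
        ProgramOptimizer(args, env.observation_space,
                         env.action_space.low[i], env.action_space.high[i])
        for i in range(env.action_space.shape[0])
    ]

    obs, _ = env.reset()
    action = np.array([opt.get_action(obs) for opt in optimizers])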

diff --git a/TD3_program_synthesis.py b/TD3_program_synthesis.py
index 4a1f830..245b161 100644
--- a/TD3_program_synthesis.py
+++ b/TD3_program_synthesis.py
@@ -152,7 +152,12 @@ def run_synthesis(args: Args):
     assert isinstance(env.action_space, gym.spaces.Box), "only continuous action space is supported"
 
     # Actor is a learnable program
-    program_optimizers = [ProgramOptimizer(args, env.observation_space, env.action_space) for i in range(env.action_space.shape[0])]
+    program_optimizers = [ProgramOptimizer(  # one optimizer per action dimension
+        args,
+        env.observation_space,
+        env.action_space.low[i],
+        env.action_space.high[i]
+    ) for i in range(env.action_space.shape[0])]
 
     for action_index in range(env.action_space.shape[0]):
         print(f"a[{action_index}] = {program_optimizers[action_index].get_best_solution_str()}")
@@ -247,7 +252,7 @@ def run_synthesis(args: Args):
                 cur_program_actions = np.copy(orig_program_actions)
                 print('BEFORE ACTIONS', orig_program_actions[0])
 
-                for i in range(10):
+                for i in range(50):  # refine the actions for 50 steps (was 10)
                     program_actions = torch.tensor(cur_program_actions, requires_grad=True)
 
                     program_objective_1 = qf1(data.observations, program_actions).mean()
diff --git a/optim.py b/optim.py
index 7948542..8b0424c 100644
--- a/optim.py
+++ b/optim.py
@@ -11,9 +11,11 @@ def print_fitness(ga, fitnesses):
     print('F', fitnesses.mean(), file=sys.stderr)
 
 class ProgramOptimizer:
-    def __init__(self, config, state_space, action_space):
+    def __init__(self, config, state_space, low, high):
+        self.config = config
         self.state_dim = state_space.shape[0]
-        self.action_space = action_space
+        self.low = low      # scalar action bounds for this dimension,
+        self.high = high    # forwarded to each Program this optimizer builds
 
         # Create the initial population
         # We create it so these random programs try all the operators and read all the state variables
@@ -23,10 +25,8 @@ class ProgramOptimizer:
         self.best_solution = self.initial_population[0]
         self.best_fitness = None
 
-        self.config = config
-
     def get_action(self, state):
-        program = Program(self.best_solution, self.state_dim, self.action_space)
+        program = Program(self.best_solution, self.state_dim, self.low, self.high)
 
         try:
             return program(state)
@@ -34,7 +34,7 @@ class ProgramOptimizer:
             return np.random.normal()
 
     def get_best_solution_str(self):
-        program = Program(self.best_solution, self.state_dim, self.action_space)
+        program = Program(self.best_solution, self.state_dim, self.low, self.high)
 
         try:
             return program.to_string()
@@ -42,7 +42,7 @@ class ProgramOptimizer:
             return '<invalid program>'
 
     def _fitness_func(self, ga_instance, solution, solution_idx):
-        program = Program(solution, self.state_dim, self.action_space)
+        program = Program(solution, self.state_dim, self.low, self.high)
 
         try:
             # Num input variables looked at
diff --git a/postfix_program.py b/postfix_program.py
index 1aff651..ad73769 100644
--- a/postfix_program.py
+++ b/postfix_program.py
@@ -41,10 +41,11 @@ class InvalidProgramException(Exception):
     pass
 
 class Program:
-    def __init__(self, genome, state_dim, action_space):
+    def __init__(self, genome, state_dim, low, high):
         self.tokens = genome
         self.state_dim = state_dim
-        self.action_space = action_space
+        self.low = low      # scalar bounds used to clip the program's output
+        self.high = high
 
     def to_string(self):
         def on_literal_func(stack, token):
@@ -96,7 +97,7 @@ class Program:
         x /= AVG
 
         # Clip action
-        x = np.clip(x, self.action_space.low, self.action_space.high)
+        x = np.clip(x, self.low, self.high)
         return x
 
     def num_inputs_looked_at(self):
@@ -157,7 +158,7 @@ def dbg_average():
         for i in range(100000):
             dna = np.random.random((l,))
             dna *= -(NUM_OPERATORS + 1)                 # Tokens between -NUM_OPERATORS - state_dim and 0
-            p = Program(dna, 1)
+            p = Program(dna, 1, -1.0, 1.0)
 
             try:
                 values.append(p([0.0]))
@@ -177,7 +178,7 @@ def dbg_random_functions():
 
         dna = np.random.random((5,))
         dna *= -(NUM_OPERATORS + 1)                 # Tokens between -NUM_OPERATORS - state_dim and 0
-        p = Program(dna, 2, gym.spaces.Box(low=0.0, high=1.0, shape=(1,)))
+        p = Program(dna, 2, 0.0, 1.0)
 
         print(p.to_string())
 
-- 
GitLab