Commit 42a77ab8 authored by Denis Steckelmacher

One ProgramOptimizer per action dimension, to help with convergence

parent 3d659075
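This commit splits the single multi-dimensional ProgramOptimizer into one independent optimizer per action dimension, each evolving a scalar program. Roughly, the new layout used by the training script is the following (a sketch assembled from the hunks below; the policy helper is illustrative only):

    # One independent program optimizer per action dimension
    program_optimizers = [ProgramOptimizer(args) for _ in range(env.action_space.shape[0])]

    def policy(obs):
        # Each optimizer contributes one scalar; stacking them yields the full action vector
        return np.array([opt.get_action(obs) for opt in program_optimizers], dtype=np.float32)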
@@ -75,12 +75,12 @@ class Args:
     """noise clip parameter of the Target Policy Smoothing Regularization"""

     # Parameters for the program optimizer
-    num_individuals: int = 100
+    num_individuals: int = 50
     num_genes: int = 4
     num_eval_runs: int = 10
     num_generations: int = 20
-    num_parents_mating: int = 50
+    num_parents_mating: int = 20
     keep_parents: int = 5
     mutation_percent_genes: int = 10
@@ -115,24 +115,19 @@ class QNetwork(nn.Module):
         return x


-def get_state_actions(program_optimizer, obs, env, args, grad_required=False):
+def get_state_actions(program_optimizers, obs, env, args):
     program_actions = []
-    obs = obs.detach().numpy()

     for i, o in enumerate(obs):
         action = np.zeros(env.action_space.shape, dtype=np.float32)

-        for eval_run in range(1):
-            action += program_optimizer.get_actions_from_solution(
-                program_optimizer.best_solution,
-                o
-            )
+        for eval_run in range(args.num_eval_runs):
+            for action_index in range(env.action_space.shape[0]):
+                action[action_index] += program_optimizers[action_index].get_action(o)

         program_actions.append(action / args.num_eval_runs)

-    program_actions = torch.tensor(np.array(program_actions), requires_grad=grad_required)
-    return program_actions
+    return np.array(program_actions)


 @pyrallis.wrap()
 def run_synthesis(args: Args):
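Because each program evaluation is stochastic, the rewritten get_state_actions averages args.num_eval_runs evaluations of every dimension's best program and now returns a plain NumPy array, leaving any torch conversion to the caller. A self-contained sketch of the same averaging logic, with a stub optimizer so it runs on its own (only get_action matches the real interface; everything else is illustrative):

    import numpy as np

    class StubOptimizer:
        """Stands in for ProgramOptimizer; get_action is the only method used here."""
        def get_action(self, state):
            # Noisy stand-in for a stochastic program evaluation
            return float(np.sum(state)) + np.random.normal(0.0, 0.01)

    def average_actions(optimizers, obs_batch, num_eval_runs):
        program_actions = []
        for o in obs_batch:
            action = np.zeros(len(optimizers), dtype=np.float32)
            for _ in range(num_eval_runs):
                for i, opt in enumerate(optimizers):
                    action[i] += opt.get_action(o)
            program_actions.append(action / num_eval_runs)  # average over the stochastic evaluations
        return np.array(program_actions)

    # Example: 2-dimensional action space, batch of 3 observations
    print(average_actions([StubOptimizer(), StubOptimizer()], np.random.rand(3, 4), 10).shape)  # (3, 2)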
@@ -168,7 +163,7 @@ def run_synthesis(args: Args):
     assert isinstance(env.action_space, gym.spaces.Box), "only continuous action space is supported"

     # Actor is a learnable program
-    program_optimizer = ProgramOptimizer(args, env.action_space.shape)
+    program_optimizers = [ProgramOptimizer(args) for i in range(env.action_space.shape[0])]

     qf1 = QNetwork(env).to(device)
     qf2 = QNetwork(env).to(device)
@@ -198,10 +193,8 @@ def run_synthesis(args: Args):
             action = env.action_space.sample()
         else:
            with torch.no_grad():
-                action = program_optimizer.get_actions_from_solution(
-                    program_optimizer.best_solution,
-                    obs
-                )
+                action = get_state_actions(program_optimizers, obs[None, :], env, args)[0]
+                print('ACTION', action)

         # TRY NOT TO MODIFY: execute the game and log data.
         next_obs, reward, termination, truncation, info = env.step(action)
@@ -227,7 +220,8 @@ def run_synthesis(args: Args):
                 )

                 # Go over all observations the buffer provides
-                next_state_actions = get_state_actions(program_optimizer, data.next_observations, env, args)
+                next_state_actions = get_state_actions(program_optimizers, data.next_observations.detach().numpy(), env, args)
+                next_state_actions = torch.tensor(next_state_actions)
                 next_state_actions = (next_state_actions + clipped_noise).clamp(
                     env.action_space.low[0], env.action_space.high[0]).float()
@@ -251,7 +245,8 @@ def run_synthesis(args: Args):
             # Optimize the program
             if global_step % args.policy_frequency == 0:
-                program_actions = get_state_actions(program_optimizer, data.observations, env, args, grad_required=True)
+                program_actions = get_state_actions(program_optimizers, data.observations.detach().numpy(), env, args)
+                program_actions = torch.tensor(program_actions, requires_grad=True)
                 program_objective = qf1(data.observations, program_actions).mean()
                 program_objective.backward()
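The program outputs are produced outside the computation graph, so the updated code rebuilds them as a tensor with requires_grad=True; the critic's gradient with respect to those actions is then used in the next hunk to nudge them uphill (improved_actions = program_actions + 0.1 * program_actions.grad) before the programs are refit on them. A toy illustration of that gradient-ascent step with a stand-in critic (all names and values here are illustrative):

    import torch

    program_actions = torch.tensor([[0.2], [-0.4]], requires_grad=True)  # program outputs as a leaf tensor
    q_values = -(program_actions - 0.5) ** 2                             # stand-in critic, maximal at a = 0.5
    q_values.mean().backward()                                           # gradient of the mean Q w.r.t. the actions
    improved_actions = program_actions + 0.1 * program_actions.grad      # small step uphill on Q
    print(improved_actions.detach().numpy())                             # both actions move toward 0.5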
@@ -259,11 +254,16 @@ def run_synthesis(args: Args):
                 improved_actions = program_actions + 0.1 * program_actions.grad
                 RES.append(improved_actions[0].detach().numpy())

-                program_optimizer.fit(states=data.observations.detach().numpy(),
-                    actions=improved_actions.detach().numpy())
+                # Fit the program optimizers on all the action dimensions
+                states = data.observations.detach().numpy()
+                actions = improved_actions.detach().numpy()
+
+                for action_index in range(env.action_space.shape[0]):
+                    program_optimizers[action_index].fit(states, actions[:, action_index])

                 # Print program
-                program_optimizer.print_best_solution()
+                program_optimizers.print_best_solution()

             # update the target network
             for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
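Note that print_best_solution no longer exists on the refactored class (it is removed below in favour of get_action), and program_optimizers is now a plain list, so the per-program printout has to loop over the optimizers. A sketch that reuses only the Program API visible in the removed method:

    # Print the best program of each action dimension (illustrative loop)
    for i, optimizer in enumerate(program_optimizers):
        p = Program(genome=optimizer.best_solution)
        print(f'a[{i}] =', p.run_program([0.0], do_print=True))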
@@ -8,11 +8,10 @@ from dataclasses import dataclass
 from postfix_program import Program, NUM_OPERATORS


 class ProgramOptimizer:
-    def __init__(self, config, action_shape):
+    def __init__(self, config):

         # Create the initial population
-        self.action_shape = action_shape
-        self.initial_program = [0.0] * (config.num_genes * 2 * action_shape[0])    # Mean and log_std for each gene, for each action dimension
+        self.initial_program = [0.0] * (config.num_genes * 2)    # Mean and log_std for each gene
         self.best_solution = self.initial_program
         self.best_fitness = None
@@ -20,31 +19,20 @@ class ProgramOptimizer:
         self.config = config
         self.initial_population = [np.array(self.initial_program) for i in range(config.num_individuals)]

-    def get_actions_from_solution(self, solution, state):
-        # One program per action dimension
-        program_length = self.config.num_genes * 2
-        programs = [
-            Program(genome=solution[i*program_length : (i+1)*program_length])
-            for i in range(self.action_shape[0])
-        ]
-
-        return np.array([p(state) for p in programs], dtype=np.float32)
-
-    def print_best_solution(self):
-        program_length = self.config.num_genes * 2
-
-        for i in range(self.action_shape[0]):
-            p = Program(genome=self.best_solution[i*program_length : (i+1)*program_length])
-            print(f'a[{i}] =', p.run_program([0.0], do_print=True))
+    def get_action(self, state):
+        program = Program(genome=self.best_solution)
+        return program(state)

     def _fitness_func(self, ga_instance, solution, solution_idx):
         batch_size = self.states.shape[0]
         sum_error = 0.0

+        program = Program(genome=solution)

         # Evaluate the program several times, because evaluations are stochastic
         for eval_run in range(self.config.num_eval_runs):
             for index in range(batch_size):
-                action = self.get_actions_from_solution(solution, self.states[index])
+                action = program(self.states[index])
                 desired_action = self.actions[index]

                 sum_error += np.mean((action - desired_action) ** 2)
@@ -55,7 +43,9 @@ class ProgramOptimizer:
     def fit(self, states, actions):
         """ states is a batch of states, shape (N, state_shape)
-            actions is a batch of actions, shape (N, action_shape), we assume continuous actions
+            actions is a batch of actions, shape (N,), we assume continuous actions
+
+            NOTE: One ProgramOptimizer has to be used for each action dimension
         """
         self.states = states    # picklable self._fitness_func needs these instance variables
         self.actions = actions
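The remainder of fit is outside this hunk; judging from the config fields and the ga_instance attribute used below, it drives a PyGAD run over the population. Purely as an assumed illustration of that wiring (not necessarily the author's exact call), a toy version with a linear stand-in for the Program could look like this:

    import numpy as np
    import pygad

    # Toy targets in the new (N,) convention: one scalar action per state
    states = np.random.random_sample((10, 2))
    actions = np.sum(states, axis=1)

    def fitness_func(ga_instance, solution, solution_idx):
        # The real code evaluates a Program(genome=solution); here a linear readout stands in
        predictions = states @ solution[:2] + solution[2]
        return -float(np.mean((predictions - actions) ** 2))  # negated MSE, higher is better

    ga_instance = pygad.GA(
        num_generations=20,
        num_parents_mating=10,
        fitness_func=fitness_func,
        initial_population=[np.zeros(3) for _ in range(50)],
        mutation_percent_genes=10,
        keep_parents=5,
    )
    ga_instance.run()
    best_solution = ga_instance.best_solution()[0]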
@@ -88,47 +78,3 @@ class ProgramOptimizer:

         # Best solution for now
         self.best_solution = self.ga_instance.best_solution()[0]
-
-
-@dataclass
-class Config:
-    num_individuals: int = 1000
-    num_genes: int = 10
-    num_generations: int = 20
-    num_parents_mating: int = 10
-    keep_parents: int = 5
-    mutation_percent_genes: int = 10
-    keep_elites: int = 5
-
-
-@pyrallis.wrap()
-def main(config: Config):
-    optim = ProgramOptimizer(config)
-
-    # Sample states and actions
-    #states = np.array([
-    #    [1.0],
-    #    [2.0],
-    #    [-5.0],
-    #    [10.0],
-    #])
-    #states = np.array([[1.0, 2.0], [2.0, 4.0]])
-    #actions = np.array([[3.0], [6.0]])
-
-    states = np.random.random_sample((10, 2))
-    actions = np.sum(states, axis=1)
-    actions = np.reshape(actions, (10, 1))
-
-    #states = np.load('runs/InvertedPendulum-v4__TD3__1__1720706887/TD3.cleanrl_model_OBSERVATIONS.npy')
-    #actions = np.load('runs/InvertedPendulum-v4__TD3__1__1720706887/TD3.cleanrl_model_ACTIONS.npy')
-
-    # Fit
-    optim.fit(states, actions)
-
-    # Plot
-    optim.ga_instance.plot_fitness()
-
-
-if __name__ == '__main__':
-    main()
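The standalone Config/main test harness above is removed by this commit. A comparable sanity check against the new one-optimizer-per-dimension interface could look like the following sketch (it assumes the module defining ProgramOptimizer is importable and that a config object with the fields from Args, or the removed Config, is available):

    import numpy as np

    # Toy data as in the removed harness: 10 two-dimensional states, one scalar target per state
    states = np.random.random_sample((10, 2))
    actions = np.sum(states, axis=1)          # shape (N,), matching the new fit() convention

    optim = ProgramOptimizer(config)          # one optimizer handles exactly one action dimension
    optim.fit(states, actions)
    print(optim.get_action(states[0]))        # stochastic output of the best program for the first state
    optim.ga_instance.plot_fitness()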