diff --git a/TD3_program_synthesis.py b/TD3_program_synthesis.py
index 2627fade1fd5804a40031eeb8d7b08bef7877f14..ef42ced83e5ebcd74ee9faf48d2f8cd6d5e8bc1c 100644
--- a/TD3_program_synthesis.py
+++ b/TD3_program_synthesis.py
@@ -105,6 +105,11 @@ class QNetwork(nn.Module):
         return x
 
 
+# [!1] Helper function that retrieves the proposed action for each action variable, given a list of observations.
+# In our implementation, a program is responsible for exactly one action variable. So, to optimize the actor for an
+# action space of size two, two separate optimizers are used, each holding its own population of candidate programs
+# for its respective variable.
+
 def get_state_actions(program_optimizers, obs, env, args):
     program_actions = []
 
@@ -205,6 +210,8 @@ def run_synthesis(args: Args):
 
         # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation`
         real_next_obs = next_obs.copy()
+
+        # [!1] The replay buffer is filled with experiences; it also supplies the states used by the genetic algorithm
         rb.add(obs, real_next_obs, action, reward, termination, info)
 
         # RESET
@@ -248,10 +255,12 @@ def run_synthesis(args: Args):
 
             # Optimize the program
             if global_step % args.policy_frequency == 0:
+                # [!1] Use the helper function to retrieve the actions currently proposed by the program optimizers
                 orig_program_actions = get_state_actions(program_optimizers, data.observations.detach().numpy(), env, args)
                 cur_program_actions = np.copy(orig_program_actions)
                 print('BEFORE ACTIONS', orig_program_actions[0])
 
+                # [!1][!2] Compute more optimal actions by refining the proposed actions with the critic's gradient
                 for i in range(500):
                     program_actions = torch.tensor(cur_program_actions, requires_grad=True)
 
@@ -261,7 +270,7 @@ def run_synthesis(args: Args):
                     program_objective.backward()
 
                     with torch.no_grad():
-                        cur_program_actions += program_actions.grad.numpy()
+                        cur_program_actions += program_actions.grad.numpy()  # [!2] Gradient ascent step on the actions
 
                     if np.abs(cur_program_actions - orig_program_actions).mean() > 0.5:
                         break
@@ -276,7 +285,12 @@ def run_synthesis(args: Args):
                 print('Best program:')
                 writer.add_scalar("losses/program_objective", program_objective.item(), global_step)
 
+                # [!1] Each action variable has its own optimizer that evolves a program controlling that variable
                 for action_index in range(env.action_space.shape[0]):
+                    # [!2] fit() is the actual optimization step: the genetic algorithm iterates on the population of
+                    # candidate programs inside each optimizer (see ./optim.py).
+                    # The actions passed to fit() are the ones refined with the critic's gradients above. Inside the
+                    # optimizer, the program-proposed actions are recomputed for every state in the states argument.
                     program_optimizers[action_index].fit(states, actions[:, action_index])
                     print(f"a[{action_index}] = {program_optimizers[action_index].get_best_solution_str()}")
 
diff --git a/optim.py b/optim.py
index 7da82c1112c24c5438f90a3c6836fc544baa4f1e..db0d4b0649850e86cd367cd75f52ce36add0b3ce 100644
--- a/optim.py
+++ b/optim.py
@@ -58,6 +58,7 @@ class ProgramOptimizer:
 
             for index in range(batch_size):
                 # MSE for the loss
+                # [!2] Retrieve the program's proposed action and measure its distance to the improved (target) action
                 action = program(self.states[index])
                 desired_action = self.actions[index]
 
@@ -88,6 +89,9 @@ class ProgramOptimizer:
             self.fitness_pop[solution_idx].append(fitness)
         return fitness
 
+    # [!2] The optimization minimizes the distance between the program-proposed actions and the improved actions,
+    # which are computed with the critic's gradients. The actions parameter holds these improved (target) actions;
+    # the program-proposed actions are recomputed in the fitness function (line 61 above).
    def fit(self, states, actions):
        """ states is a batch of states, shape (N, state_shape)
            actions is a batch of actions, shape (N,), we assume continuous actions
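
Note on the [!2] refinement loop: for readers who want the idea in isolation, below is a minimal, self-contained sketch (not the repository's code) of gradient-based action refinement against a critic. The names critic, obs, proposed and refined are hypothetical stand-ins; the actual loop uses the TD3 Q-network and observations sampled from the replay buffer.

import torch
import torch.nn as nn

obs_dim, act_dim, batch = 4, 2, 32
critic = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))  # Q(s, a) stand-in
obs = torch.randn(batch, obs_dim)       # batch of observations
proposed = torch.randn(batch, act_dim)  # actions proposed by the programs

refined = proposed.detach().clone().requires_grad_(True)
for _ in range(500):
    # Objective to maximize: the critic's value of the current actions
    objective = critic(torch.cat([obs, refined], dim=1)).mean()
    objective.backward()
    with torch.no_grad():
        refined += refined.grad        # gradient ascent step on the actions
        refined.grad.zero_()
        # Stop once the refined actions drift too far from the original proposal
        if (refined - proposed).abs().mean() > 0.5:
            break

In the training loop above, these refined actions then become the regression targets that the program optimizers try to imitate.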
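
Likewise, the [!2] fitness in optim.py boils down to a (negative) mean squared error between the actions a candidate program proposes and the improved target actions. The sketch below is illustrative only: program is a hypothetical callable mapping a state to a scalar action, and the toy population stands in for the genetic algorithm's candidates.

import numpy as np

def fitness(program, states, target_actions):
    """Negative mean squared error between program-proposed actions and the targets."""
    proposed = np.array([program(s) for s in states])  # one proposed action per state
    return -float(np.mean((proposed - target_actions) ** 2))

# Toy usage: the candidate with the highest fitness imitates the refined actions best
rng = np.random.default_rng(0)
states = rng.normal(size=(8, 4))
target_actions = rng.normal(size=8)
population = [lambda s, w=w: float(np.tanh(s).sum() * w) for w in (0.0, 0.5, 1.0)]
best = max(population, key=lambda p: fitness(p, states, target_actions))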