网站建设好了怎么在百度可以搜到,网页设计岗位介绍
2026/5/19 0:54:35 网站建设 项目流程
# Page keywords carried by the hosting site: how to get a finished website
# indexed by Baidu, web design job description, clothing website construction
# analysis, Lujiang website production.
#
# Theory:
#   on-policy:  behavior policy == target policy
#   off-policy: behavior policy != target policy
#
# Note: the behavior policy should be initialised with strong randomness so
# that as many (s, a) pairs as possible get visited.  The reinforcement
# learning textbook the author refers to contains figures showing how
# different behavior policies lead to different exploration paths.
#
# The code below is runnable (it needs the project-local `env` and `utils`
# modules next to this file).
import numpy as np

from env import GridWorldEnv
from utils import drow_policy


class Q_Learning(object):
    """Tabular Q-learning on a grid world, in on-policy or off-policy mode."""

    def __init__(self, env: GridWorldEnv, gamma=0.9, alpha=0.001, epsilon=0.1,
                 samples=1, start_state=(0, 0), mode="on policy"):
        """Set up the policies and the Q-table.

        :param env: defines the basic configuration of the grid
        :param gamma: discount rate
        :param alpha: learning rate
        :param epsilon: exploration rate of the epsilon-greedy policy that is
            maintained in "on policy" mode
        :param samples: number of trajectories sampled from the start state
        :param start_state: start position, passed as two coordinates to
            ``env.state_id``
        :param mode: "on policy" or "off policy"; checked when ``solve`` runs
        """
        self.env = env
        # Per the original author's note: up, down, left, right, stay.
        self.action_space_size = self.env.num_actions
        self.state_space_size = self.env.num_states
        self.reward_list = self.env.reward_list
        self.gamma = gamma
        self.samples = samples
        self.alpha = alpha
        self.epsilon = epsilon
        self.mode = mode
        self.start_state = self.env.state_id(start_state[0], start_state[1])

        table_shape = (self.state_space_size, self.action_space_size)
        # Uniform over actions, i.e. highly exploratory.
        self.behavior_policy = np.ones(table_shape) / self.action_space_size
        self.target_policy = np.zeros(table_shape)
        self.qvalues = np.zeros(table_shape)

    def update_qvalues(self, s_t, a_t, s_next, r_next):
        """Apply one Q-learning TD update to ``qvalues[s_t][a_t]`` in place.

        q(s_t, a_t) <- q(s_t, a_t) + alpha * (r_next + gamma * max_a q(s_next, a) - q(s_t, a_t))

        :param s_t: index of the current state
        :param a_t: index of the action taken in ``s_t``
        :param s_next: index of the state reached
        :param r_next: reward received for the transition
        """
        max_q_next = np.max(self.qvalues[s_next])
        td_target = r_next + self.gamma * max_q_next
        # Sign factored out: same as q - alpha * (q - td_target).
        td_error = td_target - self.qvalues[s_t][a_t]
        self.qvalues[s_t][a_t] += self.alpha * td_error

    def solve(self):
        """Run ``self.samples`` rounds of Q-learning in the configured mode.

        In "off policy" mode, episodes are generated with the behavior policy
        and the target policy is made greedy w.r.t. the learned Q-values.
        In "on policy" mode, the behavior policy itself is improved
        epsilon-greedily after every step and also serves as target policy.

        :raises ValueError: if ``self.mode`` is neither "off policy" nor
            "on policy"
        """
        if self.mode == "off policy":
            for _ in range(self.samples):
                s = self.start_state
                a = np.random.choice(self.action_space_size, p=self.behavior_policy[s])
                episode = self.env.generate_episodes(self.behavior_policy, s, a)
                for s_t, a_t, r_next_t, s_next_t in episode:
                    self.update_qvalues(s_t, a_t, s_next_t, r_next_t)
                    # Greedy target policy: one-hot on the best action.
                    best_a = np.argmax(self.qvalues[s_t])
                    self.target_policy[s_t] = 0.0
                    self.target_policy[s_t, best_a] = 1.0
        elif self.mode == "on policy":
            # target_policy == behavior_policy: both names share one array.
            self.target_policy = self.behavior_policy
            for _ in range(self.samples):
                s = self.start_state
                while s not in self.env.terminal:
                    # Generate a_t following pi_t(s_t).
                    a = np.random.choice(self.action_space_size, p=self.behavior_policy[s])
                    # Generate r_{t+1}, s_{t+1} by interacting with the environment.
                    next_s, next_r, _ = self.env.step(s, a)
                    # Update the q-value for (s_t, a_t):
                    # q_{t+1}(s_t, a_t) = q_t(s_t, a_t)
                    #     - alpha_t(s_t, a_t) * [q_t(s_t, a_t)
                    #         - (r_{t+1} + gamma * max_a q_t(s_{t+1}, a))]
                    self.update_qvalues(s, a, next_s, next_r)
                    # Update the policy for s_t epsilon-greedily: this policy
                    # also generates the data, so it must keep exploring.
                    best_a = np.argmax(self.qvalues[s])
                    self.behavior_policy[s] = self.epsilon / self.action_space_size
                    # `+=` keeps the row summing to 1, which
                    # np.random.choice(p=...) requires.
                    self.behavior_policy[s, best_a] += 1 - self.epsilon
                    s = next_s
        else:
            raise ValueError("Invalid mode")


if __name__ == "__main__":
    env = GridWorldEnv(
        size=5,
        forbidden=[(1, 2), (3, 3)],
        terminal=[(4, 4)],
        r_boundary=-1,
        r_other=-0.04,
        r_terminal=1,
        r_forbidden=-1,
        r_stay=-0.1,
    )
    # Note: samples should be fairly large, otherwise each state has only a
    # small chance of being visited.
    vi = Q_Learning(env=env, gamma=0.8, alpha=0.01, samples=1000,
                    start_state=(0, 0), mode="off policy")
    vi.solve()
    # The label says "state value" but what is printed is the Q-table.
    print("\n state value: ")
    print(vi.qvalues)
    drow_policy(vi.target_policy, env)

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询