#!/usr/bin/env python
import random
# from chainer import cuda
import time, threading
import sys
import math
import copy

# set of states
states = [0, 1, 2]

# Q-function: mapping from state-action pairs to the expected sum of rewards
# (undiscounted here, i.e., discount factor 1) over the rollout horizon
Qfn = dict()


# create a hash key for a particular state-action pair
def getHashKey(s, a):
    return "s" + str(s) + "-a" + str(a)


# return the available actions for a particular state
def getActionSet(state):
    return [0, 1]


# greedy policy: maps a state to the action with the highest Q-value
def q_policy(s_t):
    actions = getActionSet(s_t)
    best_action = -1
    best_action_q = -1
    for a in actions:
        q_sa = Qfn[getHashKey(s_t, a)]
        if best_action == -1 or q_sa > best_action_q:
            best_action = a
            best_action_q = q_sa
    return best_action


# policy that acts uniformly at random
def random_policy(s_t):
    actions = getActionSet(s_t)
    action_index = random.randrange(len(actions))
    return actions[action_index]


# simulates taking action a_t in state s_t; returns [next_state, reward]
def sim_action(s_t, a_t):
    next_state = -1
    reward = -1
    r_number = random.uniform(0.0, 1.0)
    if s_t == 0:
        if a_t == 1:
            next_state = 1
            reward = 0
        elif a_t == 0:
            if r_number < 0.5:
                next_state = s_t  # stay in same state
            else:
                next_state = 1
            reward = 0
    elif s_t == 1:
        if a_t == 0:
            if r_number < 0.1:
                next_state = s_t  # stay in same state
                reward = 0
            elif r_number < 0.3:
                next_state = 2
                reward = 0
            else:
                next_state = 0
                reward = 5
        elif a_t == 1:
            if r_number < 0.05:
                next_state = 2
                reward = 0
            else:
                next_state = s_t  # stay in same state
                reward = 0
    elif s_t == 2:
        if a_t == 0:
            if r_number < 0.4:
                next_state = 0
                reward = 0
            else:
                next_state = s_t  # stay in current state
                reward = 0
        elif a_t == 1:
            if r_number < 0.3:
                next_state = 0
                reward = -1
            elif r_number < 0.7:
                next_state = s_t
                reward = 0
            else:
                next_state = 1
                reward = 0
    return [next_state, reward]


# SimQ: Monte-Carlo estimate of Q(s_t, a_t) from a single rollout of length h.
# Takes action a_t first, then follows policy_fn for the remaining h-1 steps.
def SimQ(s_t, a_t, policy_fn, h):
    total_reward = 0
    # simulate taking action a_t in the current state s_t
    [next_state, reward] = sim_action(s_t, a_t)
    total_reward += reward
    s_current = next_state
    # simulate the remaining h-1 steps using policy_fn
    for t in range(0, h - 1):
        action = policy_fn(s_current)
        [next_state, reward] = sim_action(s_current, action)
        s_current = next_state
        total_reward += reward
    return total_reward


def runsim():
    # horizon (episode length)
    h = 30
    print("Episode length = " + str(h))
    # number of rollouts per state-action pair (i.e., w)
    num_rollouts = 100
    print("Number of rollouts = " + str(num_rollouts))
    # estimate Q under the random policy: for each state...
    for s in states:
        A_s = getActionSet(s)
        # ... and for each action, average the total reward over the rollouts
        for a in A_s:
            q_sa = 0
            for rollout in range(0, num_rollouts):
                q_sa += SimQ(s, a, random_policy, h)
            q_sa = q_sa / num_rollouts
            print("Q for s = " + str(s) + " and a = " + str(a) + " is " + str(q_sa))
            Qfn[getHashKey(s, a)] = q_sa
    print("Q function with random policy:")
    print(str(Qfn))

    # estimate the Q function of the improved (greedy) policy
    Qfn_prime = dict()
    print("\n")
    for s in states:
        A_s = getActionSet(s)
        for a in A_s:
            q_sa = 0
            for rollout in range(0, num_rollouts):
                q_sa += SimQ(s, a, q_policy, h)
            q_sa = q_sa / num_rollouts
            print("Q for s = " + str(s) + " and a = " + str(a) + " is " + str(q_sa))
            Qfn_prime[getHashKey(s, a)] = q_sa
    print("\n\nQ function with improved policy:")
    print(str(Qfn_prime))


if __name__ == "__main__":
    runsim()