#!/usr/bin/env python
import random
# from chainer import cuda
import time, threading
import sys
import math
import copy

# set of states
states = [0, 1, 2]

# Q-function: mapping from state-action pairs to the expected sum of rewards
# (undiscounted here, i.e., discount factor 1) over the rollout horizon
Qfn = dict()


# create a hash key for a particular state-action pair
def getHashKey(s, a):
    return "s" + str(s) + "-a" + str(a)


# return the available actions for a particular state
def getActionSet(state):
    return [0, 1]


# greedy policy: maps a state to the action with the highest Q-value
def q_policy(s_t):
    actions = getActionSet(s_t)
    best_action = -1
    best_action_q = -1
    for a in actions:
        q_sa = Qfn[getHashKey(s_t, a)]
        if best_action == -1 or q_sa > best_action_q:
            best_action = a
            best_action_q = q_sa
    return best_action


# policy that acts uniformly at random
def random_policy(s_t):
    actions = getActionSet(s_t)
    action_index = random.randrange(len(actions))
    return actions[action_index]


# simulates taking action a_t in state s_t; returns [next_state, reward]
def sim_action(s_t, a_t):
    next_state = -1
    reward = -1
    r_number = random.uniform(0.0, 1.0)
    if s_t == 0:
        if a_t == 1:
            next_state = 1
            reward = 0
        elif a_t == 0:
            if r_number < 0.5:
                next_state = s_t  # stay in same state
            else:
                next_state = 1
            reward = 0
    elif s_t == 1:
        if a_t == 0:
            if r_number < 0.1:
                next_state = s_t  # stay in same state
                reward = 0
            elif r_number < 0.3:
                next_state = 2
                reward = 0
            else:
                next_state = 0
                reward = 5
        elif a_t == 1:
            if r_number < 0.05:
                next_state = 2
                reward = 0
            else:
                next_state = s_t  # stay in same state
                reward = 0
    elif s_t == 2:
        if a_t == 0:
            if r_number < 0.4:
                next_state = 0
                reward = 0
            else:
                next_state = s_t  # stay in current state
                reward = 0
        elif a_t == 1:
            if r_number < 0.3:
                next_state = 0
                reward = -1
            elif r_number < 0.7:
                next_state = s_t
                reward = 0
            else:
                next_state = 1
                reward = 0
    return [next_state, reward]


# SimQ: Monte-Carlo estimate of Q(s_t, a_t) from a single rollout of length h.
# Takes action a_t first, then follows policy_fn for the remaining h-1 steps.
def SimQ(s_t, a_t, policy_fn, h):
    total_reward = 0
    # simulate taking action a_t in the current state s_t
    [next_state, reward] = sim_action(s_t, a_t)
    total_reward += reward
    s_current = next_state
    # simulate the remaining h-1 steps using policy_fn
    for t in range(0, h - 1):
        action = policy_fn(s_current)
        [next_state, reward] = sim_action(s_current, action)
        s_current = next_state
        total_reward += reward
    return total_reward


def runsim():
    # horizon (episode length)
    h = 30
    print("Episode length = " + str(h))
    # number of rollouts per state-action pair (i.e., w)
    num_rollouts = 100
    print("Number of rollouts = " + str(num_rollouts))
    # estimate Q under the random policy: for each state...
    for s in states:
        A_s = getActionSet(s)
        # ... and for each action, average the total reward over the rollouts
        for a in A_s:
            q_sa = 0
            for rollout in range(0, num_rollouts):
                q_sa += SimQ(s, a, random_policy, h)
            q_sa = q_sa / num_rollouts
            print("Q for s = " + str(s) + " and a = " + str(a) + " is " + str(q_sa))
            Qfn[getHashKey(s, a)] = q_sa
    print("Q function with random policy:")
    print(str(Qfn))

    # estimate the Q function of the improved (greedy) policy
    Qfn_prime = dict()
    print("\n")
    for s in states:
        A_s = getActionSet(s)
        for a in A_s:
            q_sa = 0
            for rollout in range(0, num_rollouts):
                q_sa += SimQ(s, a, q_policy, h)
            q_sa = q_sa / num_rollouts
            print("Q for s = " + str(s) + " and a = " + str(a) + " is " + str(q_sa))
            Qfn_prime[getHashKey(s, a)] = q_sa
    print("\n\nQ function with improved policy:")
    print(str(Qfn_prime))


if __name__ == "__main__":
    runsim()