In [11]:
import numpy as np
import time

#from chainer import cuda

#import cupy as cp

#backend
#be = "gpu"
#device = 0


be = "cpu"

class SimplePG(object):
	
	
	# constructor
	def __init__(self, num_actions, input_size, hidden_layer_size, learning_rate,gamma,decay_rate,greedy_e_epsilon,random_seed):
		# store hyper-params
		self._A = num_actions
		self._D = input_size
		self._H = hidden_layer_size
		self._learning_rate = learning_rate
		self._decay_rate = decay_rate
		self._gamma = gamma
		
		# some temp variables
		self._xs,self._hs,self._dlogps,self._drs = [],[],[],[]

		# variables governing exploration
		self._exploration = True # should be set to false when evaluating
		self._explore_eps = greedy_e_epsilon
		
		#create model
		self.init_model(random_seed)
		
	
	def init_model(self,random_seed):
		# create model
		#with cp.cuda.Device(0):
		self._model = {}
		np.random.seed(random_seed)
	   
		# weights from input to hidden layer   
		self._model['W1'] = np.random.randn(self._D,self._H) / np.sqrt(self._D) # "Xavier" initialization
	   
		# weights from hidden to output (action) layer
		self._model['W2'] = np.random.randn(self._H,self._A) / np.sqrt(self._H)
			
				
		self._grad_buffer = { k : np.zeros_like(v) for k,v in self._model.items() } # update buffers that add up gradients over a batch
		self._rmsprop_cache = { k : np.zeros_like(v) for k,v in self._model.items() } # rmsprop memory

	
	# softmax function
	def softmax(self,x):
		probs = np.exp(x - np.max(x, axis=1, keepdims=True))
		probs /= np.sum(probs, axis=1, keepdims=True)
		return probs
		
	  
	def discount_rewards(self,r):
		""" take 1D float array of rewards and compute discounted reward """
		discounted_r = np.zeros_like(r)
		running_add = 0
		for t in reversed(range(0, r.size)):
			running_add = running_add * self._gamma + r[t]
			discounted_r[t] = float(running_add)
    
		return discounted_r
	
	# feed input to network and get result
	def policy_forward(self,x):
		if(len(x.shape)==1):
			x = x[np.newaxis,...]

		h = x.dot(self._model['W1'])
		
		if np.isnan(np.sum(self._model['W1'])):
			print("W1 sum is nan")
		
		if np.isnan(np.sum(self._model['W2'])):
			print("W2 sum is nan")
		
		if np.isnan(np.sum(h)):
			print("nan")
			
			h[np.isnan(h)] = np.random.random_sample()
			h[np.isinf(h)] = np.random.random_sample()
			

		if np.isnan(np.sum(h)):
			print("Still nan!")
		
		
		h[h<0] = 0 # ReLU nonlinearity
		logp = h.dot(self._model['W2'])

		p = self.softmax(logp)
  
		return p, h # return probability of taking actions, and hidden state
		
	
	def policy_backward(self,eph, epdlogp):
		""" backward pass. (eph is array of intermediate hidden states) """
		dW2 = eph.T.dot(epdlogp)  
		dh = epdlogp.dot(self._model['W2'].T)
		dh[eph <= 0] = 0 # backpro prelu
  
		t = time.time()
  
		if(be == "gpu"):
		  self._dh_gpu = cuda.to_gpu(dh, device=0)
		  self._epx_gpu = cuda.to_gpu(self._epx.T, device=0)
		  self._dW1 = cuda.to_cpu(self._epx_gpu.dot(self._dh_gpu) )
		else:
		  self._dW1 = self._epx.T.dot(dh) 
    

		#print((time.time()-t0)*1000, ' ms, @final bprop')

		return {'W1':self._dW1, 'W2':dW2}
	
	def set_explore_epsilon(self,e):
		self._explore_eps = e
	
	# input: current state/observation
	# output: action index
	def process_step(self, x, exploring):

		# feed input through network and get output action distribution and hidden layer
		aprob, h = self.policy_forward(x)
		
		#print(aprob)
		
		# if exploring
		if exploring == True:
			
			# greedy-e exploration
			rand_e = np.random.uniform()
			#print(rand_e)
			if rand_e < self._explore_eps:
				# set all actions to be equal probability
				aprob[0] = [ 1.0/len(aprob[0]) for i in range(len(aprob[0]))]
				#print("!")
		
		
		if np.isnan(np.sum(aprob)):
			print(aprob)
			aprob[0] = [ 1.0/len(aprob[0]) for i in range(len(aprob[0]))]
			print(aprob)
			#input()
		
		aprob_cum = np.cumsum(aprob)
		u = np.random.uniform()
		a = np.where(u <= aprob_cum)[0][0]	

		# record various intermediates (needed later for backprop)
		t = time.time()
		self._xs.append(x) # observation
		self._hs.append(h)

		#softmax loss gradient
		dlogsoftmax = aprob.copy()
		dlogsoftmax[0,a] -= 1 #-discounted reward 
		self._dlogps.append(dlogsoftmax)
		
		t  = time.time()

		return a
		
	# after process_step, this function needs to be called to set the reward
	def give_reward(self,reward):
		
		# store the reward in the list of rewards
		self._drs.append(reward)
		
	# reset to be used when evaluating
	def reset(self):
		self._xs,self._hs,self._dlogps,self._drs = [],[],[],[] # reset 
		self._grad_buffer = { k : np.zeros_like(v) for k,v in self._model.items() } # update buffers that add up gradients over a batch
		self._rmsprop_cache = { k : np.zeros_like(v) for k,v in self._model.items() } # rmsprop memory

		
	# this function should be called when an episode (i.e., a game) has finished
	def finish_episode(self):
		# stack together all inputs, hidden states, action gradients, and rewards for this episode
		
		# this needs to be stored to be used by policy_backward
		# self._xs is a list of vectors of size input dim and the number of vectors is equal to the number of time steps in the episode
		self._epx = np.vstack(self._xs)
		
		
		#for i in range(0,len(self._hs)):
		#	print(self._hs[i])
		
		# len(self._hs) = # time steps
		# stores hidden state activations
		eph = np.vstack(self._hs)
		
		#for i in range(0,len(self._dlogps)):
		#	print(self._dlogps[i])
		
		# self._dlogps stores a history of the probabilities over actions selected by the agent
		epdlogp = np.vstack(self._dlogps)
		
		# self._drs is the history of rewards
		#for i in range(0,len(self._drs)):
		#	print(self._drs[i])
		epr = np.vstack(self._drs)
		
		self._xs,self._hs,self._dlogps,self._drs = [],[],[],[] # reset array memory

		# compute the discounted reward backwards through time
		discounted_epr = (self.discount_rewards(epr))
		#for i in range(0,len(discounted_epr)):
		#	print(str(discounted_epr[i]) + "\t"+str(epr[i]))
		
		
		#print(discounted_epr)
		discounted_epr_mean = np.mean(discounted_epr)
		#print(discounted_epr_mean)
		
		# standardize the rewards to be unit normal (helps control the gradient estimator variance)
		
		#discounted_epr -= np.mean(discounted_epr)
		discounted_epr = np.subtract(discounted_epr,discounted_epr_mean)
		
		
		discounted_epr /= np.std(discounted_epr)
		
		epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
		
		start_time = time.time()
		grad = self.policy_backward(eph, epdlogp)
		#print("--- %s seconds for policy backward ---" % (time.time() - start_time))
		
		for k in self._model: self._grad_buffer[k] += grad[k] # accumulate grad over batch

	# called to update model parameters, generally every N episodes/games for some N
	def update_parameters(self):
		for k,v in self._model.items():
			g = self._grad_buffer[k] # gradient
			self._rmsprop_cache[k] = self._decay_rate * self._rmsprop_cache[k] + (1 - self._decay_rate) * g**2
			self._model[k] -= self._learning_rate * g / (np.sqrt(self._rmsprop_cache[k]) + 1e-5)
			self._grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
        
        
		
		


In [None]:
import numpy as np
import time
import matplotlib.pyplot as plt

# import our dqn agent

REWARD_STEP = -1
REWARD_DONE = 1000

"""
grid key:
0:	empty space
1: 	wall
2:	tree
3:	rock
4:	craft table

action keys:
W:	go forward
A:	turn left
D:	turn right
U:	use/break block (must be facing tree or rock)
C: 	craft (must be facing craft table and have at least 2 wood and 1 stone)

use action key:
tree -> add 1 wood
rock -> add 1 stone

"""



class LidarSensor(object):
	def __init__(self,increment):
		
		self.angle_increment = increment # e.g., 2pi/10 if we want 10 beams
		self.sense_range = 10
		
		
	def sense(self,mc_env):
		num_obj_types = len(mc_env.object_types) # this includes empty square
		
		turn_offset = 0
		
		if mc_env.agent_dir == 'W':
			turn_offset -= np.pi/2
		elif mc_env.agent_dir == 'S':
			turn_offset -= 2*np.pi/2
		elif mc_env.agent_dir == 'E':
			turn_offset -= 3*np.pi/2
		
		current_angle = 0 + turn_offset
		
		lidar_readings = []
		
		#print("Agent location: "+str(mc_env.agent_x)+","+str(mc_env.agent_y))
		
		while True:
			#print("Beam at angle = "+str(current_angle))
			
			beam_i = np.zeros(num_obj_types) # we get 1 value per object, excluding empty square

			# shoot beam
			for r in range(1,self.sense_range):
				# calc x and y position of beam at length r relative to agent
				x = mc_env.agent_x + np.round(r*np.cos(current_angle))
				y = mc_env.agent_y + np.round(r*np.sin(current_angle))
				obj_xy = mc_env.grid[int(x)][int(y)]
				
				if not obj_xy == 0: # if square is not empty
					
					sensor_value = float(self.sense_range - r)/float(self.sense_range)
					
					#print(str(int(x))+","+str(int(y))+","+str(int(obj_xy))+","+str(sensor_value))
					beam_i[obj_xy-1]=sensor_value
					#print beam_i
					
					break
			
			for k in range(0,len(beam_i)):
				lidar_readings.append(beam_i[k])
			#print lidar_readings	
			
			current_angle += self.angle_increment
			
			if current_angle >= 2*np.pi + turn_offset:
				break
		return lidar_readings


class microMC(object):
	
	# constructor
	def __init__(self, width, height,random_seed):
		
		#np.random.seed(random_seed)
		
		self.object_types = [1, 2, 3, 4] # we have 4 objects: wall, tree, rock, and craft table
		
		
		
		self.reset(width,height,random_seed)
		
	def reset(self, width, height,random_seed):
		rows, cols = (width, height) 
		
		self.width = width
		self.height = height
		self.grid = [[0 for i in range(cols)] for j in range(rows)] 
		
		# how many trees and rocks
		n_trees = width*height/20
		n_rocks = width*height/40
		
		# fill in walls with 1s
		for i in range(0,self.height):
			for j in range(0,self.width):
				if i == 0 or i == self.height-1 or j == 0 or j == self.width-1:
					self.grid[j][i] = 1
					
		# create random trees
		for k in range(0,int(n_trees)):
			x_k = np.random.randint(self.width-2)+1
			y_k = np.random.randint(self.height-2)+1
			self.grid[x_k][y_k]=2
			
			
		# create random rocks
		for k in range(0,int(n_rocks)):
			x_k = np.random.randint(self.width-2)+1
			y_k = np.random.randint(self.height-2)+1
			self.grid[x_k][y_k]=3
			
		# create crafting table
		while True:
			x_k = np.random.randint(self.width-4)+2
			y_k = np.random.randint(self.height-4)+2
			if self.grid[x_k][y_k] == 0:
				self.grid[x_k][y_k] = 4
				break
		
		# initialize agent position and inventory
		while True:
			x_k = np.random.randint(self.width-4)+2
			y_k = np.random.randint(self.height-4)+2
			if self.grid[x_k][y_k] == 0:
				self.agent_x = x_k
				self.agent_y = y_k
				self.agent_dir = 'N' # start facing north
				break
		
		self.inventory = dict([('wood', 0), ('stone', 0),('pogo',0)])
		
	def toString(self):
		out_str = ''
		for i in range(0,self.height):
			for j in range(0,self.width):
				if self.agent_x == j and self.agent_y == i:
					if self.agent_dir == 'N':
						out_str += '^'
					elif self.agent_dir == 'S':
						out_str += 'v'
					elif self.agent_dir == 'E':
						out_str += '>'
					elif self.agent_dir == 'W':
						out_str += '<'
				elif self.grid[j][i] == 1: # obstacle/wall
					out_str += '#'
				elif self.grid[j][i] == 2: # tree
					out_str += 'T'
				elif self.grid[j][i] == 3: # rocks
					out_str += 'R'
				elif self.grid[j][i] == 4: # craft table
					out_str += 'C'
				elif self.grid[j][i] == 0: # free space
					out_str += ' '
				
				#out_str += str(self.grid[j][i])
				out_str += ' '
			out_str += '\n'
		
		out_str += '\ninventory:\t' + str(self.inventory)
		
		return out_str
	
	def getFacingXY(self): # get the x y position in front of the agent
		# compute the target position in front of the agent
		target_x = self.agent_x
		target_y = self.agent_y
		
		if self.agent_dir == 'N':
			target_y -= 1
		elif self.agent_dir == 'W':
			target_x -= 1
		elif self.agent_dir == 'E':
			target_x += 1
		elif self.agent_dir == 'S':
			target_y += 1
		return [target_x,target_y]
	
	def execute_action(self, action):
		reward = REWARD_STEP
		done = False
		
		# first, process turn actions
		if action == 'A': # turn right
			if self.agent_dir == 'N':
				self.agent_dir = 'W'
			elif self.agent_dir == 'W':
				self.agent_dir = 'S'
			elif self.agent_dir == 'S':
				self.agent_dir = 'E'
			elif self.agent_dir == 'E':
				self.agent_dir = 'N'
		elif action == 'D': # turn left
			if self.agent_dir == 'N':
				self.agent_dir = 'E'
			elif self.agent_dir == 'W':
				self.agent_dir = 'N'
			elif self.agent_dir == 'S':
				self.agent_dir = 'W'
			elif self.agent_dir == 'E':
				self.agent_dir = 'S'
		elif action == 'W': # go forward
			
			# compute the target position in front of the agent
			[target_x, target_y] = self.getFacingXY()
			
			if self.grid[target_x][target_y] == 0: # if target position is empty, move
				self.agent_x = target_x
				self.agent_y = target_y
		elif action == 'U': # use / break block
			# compute the target position in front of the agent
			[target_x, target_y] = self.getFacingXY()
				
			if self.grid[target_x][target_y] == 2: # if tree in front
				self.grid[target_x][target_y] = 0 # we clear the tree
				self.inventory['wood'] += 1
				if self.inventory['wood'] <= 2:
					reward = 10 # learn to chop wood if needed
			elif self.grid[target_x][target_y] == 3: # if rock in front
				self.grid[target_x][target_y] = 0 # we clear the tree
				self.inventory['stone'] += 1
				if self.inventory['stone'] <= 1:
					reward = 10 # learn to chop stone if needed
		elif action == 'C': # craft -- need 2 wood and 1 rock
			[target_x, target_y] = self.getFacingXY()
			if self.grid[target_x][target_y] == 4: # if craft in front
				if self.inventory['wood'] >= 2 and self.inventory['stone'] >= 1:
					self.inventory['pogo'] += 1
					self.inventory['wood'] -= 2
					self.inventory['stone'] -= 1
					done = True
					reward = REWARD_DONE
		return [done, reward]
			
			
def main_dqn():
	# environment
	random_seed = 10
	env = microMC(20,10,random_seed)
	
	# sensor
	sensor = LidarSensor(np.pi/8)
	
	# policy network
	actionCnt = 5
	D = 8 * 2 * len(env.object_types) + 2 # how many input neurons
	NUM_HIDDEN = 30
	GAMMA = 0.95
	LEARNING_RATE = 1e-3
	DECAY_RATE = 0.99
	MAX_EPSILON = 0.1
	
	agent = SimplePG(actionCnt,D,NUM_HIDDEN,LEARNING_RATE,GAMMA,DECAY_RATE,MAX_EPSILON,random_seed)
	agent.set_explore_epsilon(MAX_EPSILON)
 
	rewards_history = []
	action_space = ['W','A','D','U','C']
	running_reward = None
  
  
	t_step = 0
	episode = 0
	t_limit = 100
	reward_sum = 0
  
  

	while True:
		#print env.toString()
		
		# get obseration from sensor
		obs = sensor.sense(env)
	
		# add inventory observation
		obs.append(env.inventory['wood'])
		obs.append(env.inventory['stone'])
	
		# construct input x 
		x = np.asarray(obs)
		
		# act 
		a = agent.process_step(x,True)
		#print("Action at t="+str(t_step)+" is "+action_space[a])
		
		[done,reward] = env.execute_action(action_space[a])
		#print("Reward = "+str(reward))
		# give reward
		agent.give_reward(reward)
		reward_sum += reward
		
		t_step += 1
		
		if t_step > t_limit:
			
			# compute running reward
			running_reward = reward_sum if running_reward is None else running_reward * 0.95 + reward_sum * 0.05
			rewards_history.append(running_reward)
			print('ep %f: resetting env. episode reward total was %f. running mean: %f' % (episode, reward_sum, running_reward))
			
			# finish agent
			
			#print("\n\nfinished episode = "+str(episode)+" with " +str(reward_sum)+"\n")

			
			done = True
			t_step = 0
			agent.finish_episode()
		
			# update after every episode
			agent.update_parameters()

			# update after every k episoddes
			#if episode % 5 == 0:
			#	agent.update_parameters()
		
			# reset environment
			episode += 1
			env = microMC(20,10,episode) # this is a bug causing memory leak, ideally env should have a reset function
			reward_sum = 0
	
			# quit after some number of episodes
			if episode > 5000:
				 break
			
	iterations = range(0, episode, 1)
	plt.plot(iterations, rewards_history)
	plt.ylabel('Average Return'), plt.xlabel('Iterations')
  








def main_teleop():
	
	env = microMC(20,10,5)
	print(env.toString())
	
	sensor = LidarSensor(np.pi/8)
	
	
	while True:
		
		obs = sensor.sense(env)
		
		action = input()
		[done,reward] = env.execute_action(action)
		
		print(str(done)+"\t"+str(reward))
		print(env.toString())
	
	print("Hello World!")

if __name__ == "__main__":
    main_dqn()


ep 0.000000: resetting env. episode reward total was -90.000000. running mean: -90.000000
ep 1.000000: resetting env. episode reward total was -101.000000. running mean: -90.550000
ep 2.000000: resetting env. episode reward total was -101.000000. running mean: -91.072500
ep 3.000000: resetting env. episode reward total was -79.000000. running mean: -90.468875
ep 4.000000: resetting env. episode reward total was -101.000000. running mean: -90.995431
ep 5.000000: resetting env. episode reward total was -90.000000. running mean: -90.945660
ep 6.000000: resetting env. episode reward total was -101.000000. running mean: -91.448377
ep 7.000000: resetting env. episode reward total was -101.000000. running mean: -91.925958
ep 8.000000: resetting env. episode reward total was -79.000000. running mean: -91.279660
ep 9.000000: resetting env. episode reward total was -101.000000. running mean: -91.765677
ep 10.000000: resetting env. episode reward total was -90.000000. running mean: -91.677393
ep 