import numpy as np
# Set of states
states = ["Rainy", "Sunny"]
# Set of actions
actions = ["Stay", "Go_out"]
# Transition probability matrix:
# transition_probabilities[i][j] = P(next = states[j] | current = states[i]);
# each row sums to 1.
transition_probabilities = [
[0.7, 0.3],
[0.4, 0.6]
]
# Reward table indexed as rewards[state_index][action_index]
# (this is how get_reward reads it).
# NOTE(review): only ("Sunny", "Stay") yields a nonzero reward, yet
# get_optimal_action picks "Go_out" in "Sunny", so the simulated policy
# always accumulates 0 -- confirm the table/policy are as intended.
rewards = [
[0, 0],
[5, 0]
]
# 選擇最優動作
def get_optimal_action(state):
    """Return the fixed (deterministic) policy's action for *state*.

    "Stay" when it is raining, "Go_out" for every other state.
    """
    return "Stay" if state == "Rainy" else "Go_out"
# 計算機率
def get_transition_probability(current_state, next_state, action):
    """Return P(next_state | current_state) from the transition matrix.

    Args:
        current_state: name of the current state (must be in ``states``).
        next_state: name of the candidate next state (must be in ``states``).
        action: accepted for API symmetry but currently IGNORED -- the
            transition matrix does not depend on the action taken.

    Returns:
        The transition probability as a float.
    """
    # Fixed: the original also computed actions.index(action) into an
    # unused local, which was dead work and misleadingly suggested the
    # action influenced the result.
    row = states.index(current_state)
    col = states.index(next_state)
    return transition_probabilities[row][col]
# 計算獎勵
def get_reward(current_state, action):
    """Look up the immediate reward for taking *action* in *current_state*.

    Indexes the module-level ``rewards`` table by state row and action
    column, both resolved by name.
    """
    state_row = states.index(current_state)
    action_col = actions.index(action)
    return rewards[state_row][action_col]
# --- Simulation: roll the process forward from "Rainy" for 5 steps ---
current_state = "Rainy"  # initial state
total_reward = 0         # accumulated reward over the episode

for step in range(5):
    # Follow the fixed policy, collect the immediate reward, then sample
    # the next state from the current state's transition row.
    chosen_action = get_optimal_action(current_state)
    total_reward += get_reward(current_state, chosen_action)
    transition_row = transition_probabilities[states.index(current_state)]
    current_state = np.random.choice(states, p=transition_row)

print("Total reward:", total_reward)