DQN


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames):
    """Render a list of frames as an animation, save it as an mp4, and display it."""
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
    anim.save('movie_cartpole_DQN.mp4')
    display(display_animation(anim, default_mode='loop'))
# Quick demo of namedtuple, which we use below to store transitions
from collections import namedtuple
Tr = namedtuple('tr', ('name_a', 'value_b'))
Tr_object = Tr('name_A', 100)
print(Tr_object)
print(Tr_object.value_b)
tr(name_a='name_A', value_b=100)
100
# Create the Transition namedtuple used for replay memory entries
from collections import namedtuple
Transition = namedtuple(
    'Transition', ('state', 'action', 'next_state', 'reward'))
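The replay() method further below converts a sampled list of Transition tuples into a single Transition of per-field tuples via Transition(*zip(*transitions)). A minimal sketch of that idiom, with made-up placeholder tensors (not values from the actual run):

import torch
demo = [
    Transition(torch.FloatTensor([[0.1, 0.0, 0.0, 0.0]]), torch.LongTensor([[0]]),
               torch.FloatTensor([[0.2, 0.0, 0.0, 0.0]]), torch.FloatTensor([1.0])),
    Transition(torch.FloatTensor([[0.2, 0.0, 0.0, 0.0]]), torch.LongTensor([[1]]),
               None, torch.FloatTensor([-1.0])),
]
batch = Transition(*zip(*demo))   # one Transition whose fields are tuples of length 2
print(batch.action)               # (tensor([[0]]), tensor([[1]]))
print(torch.cat(batch.action))    # tensor([[0], [1]]) -- a 2x1 mini-batch of actions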

ENV = 'CartPole-v0'  # task name
GAMMA = 0.99  # discount factor
MAX_STEPS = 200  # maximum number of steps per episode
NUM_EPISODES = 500  # maximum number of episodes

# Replay memory that stores experience for training

class ReplayMemory:
    def __init__(self, CAPACITY):
        self.capacity = CAPACITY
        self.memory = []  # the list that stores transitions
        self.index = 0  # position of the next write

    def push(self, state, action, state_next, reward):
        if len(self.memory) < self.capacity:  # grow the list until it reaches capacity
            self.memory.append(None)
        self.memory[self.index] = Transition(state, action, state_next, reward)
        self.index = (self.index + 1) % self.capacity  # advance the write position, wrapping around

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)  # draw a random batch of transitions

    def __len__(self):
        return len(self.memory)
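Before wiring ReplayMemory into the agent, a minimal, hypothetical usage sketch (the tiny capacity and dummy tensors are made up purely to illustrate the wrap-around behaviour):

import random
import torch

demo_memory = ReplayMemory(CAPACITY=3)  # tiny capacity so the wrap-around is easy to see
for i in range(5):
    s = torch.FloatTensor([[float(i), 0.0, 0.0, 0.0]])   # dummy 1x4 state
    a = torch.LongTensor([[i % 2]])                       # dummy action
    demo_memory.push(s, a, s, torch.FloatTensor([0.0]))   # oldest entries get overwritten
print(len(demo_memory))       # 3 -- the memory never grows past its capacity
print(demo_memory.sample(2))  # 2 transitions drawn at random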
import random 
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

BATCH_SIZE = 32  # mini-batch size sampled from replay memory
CAPACITY = 10000  # replay memory capacity

class Brain:
    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions  # number of actions available in CartPole (2)

        self.memory = ReplayMemory(CAPACITY)

        # Build the neural network: 4 state inputs -> 32 -> 32 -> 2 Q-values
        self.model = nn.Sequential()
        self.model.add_module('fc1', nn.Linear(num_states, 32))  # nn.Linear is a fully connected layer
        self.model.add_module('relu1', nn.ReLU())
        self.model.add_module('fc2', nn.Linear(32, 32))
        self.model.add_module('relu2', nn.ReLU())
        self.model.add_module('fc3', nn.Linear(32, num_actions))

        print(self.model)

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001)
    def replay(self):

        # Do nothing until the memory holds at least one mini-batch
        if len(self.memory) < BATCH_SIZE:
            return

        transitions = self.memory.sample(BATCH_SIZE)

        # Convert the list of Transition tuples into one Transition whose fields are tuples
        # (see the zip demo above): batch.state, batch.action, ... each hold BATCH_SIZE items
        batch = Transition(*zip(*transitions))

        # torch.cat((A, B), 0) stacks A and B along dim 0 (vertically, A on top of B);
        # torch.cat((A, B), 1) stacks them along dim 1 (horizontally, A left of B).
        # The remaining dimensions must agree. Each state here is a 1x4 tensor, so
        # concatenating along dim 0 produces a BATCH_SIZE x 4 tensor.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

        self.model.eval()  # inference mode
        # Q(s, a) for the actions that were actually taken
        state_action_values = self.model(state_batch).gather(1, action_batch)

        # Mask marking which entries have a next state (i.e. the episode did not end there);
        # newer PyTorch versions prefer a torch.bool mask, hence the deprecation warning below
        non_final_mask = torch.ByteTensor(tuple(map(lambda s: s is not None, batch.next_state)))

        next_state_values = torch.zeros(BATCH_SIZE)
        # tensor.detach() returns a tensor that shares the same storage but is cut off from
        # the computation graph, so no gradients flow back through it. max(1)[0] takes the
        # largest Q-value over actions for each non-final next state, and detach() makes
        # that target a constant during backpropagation.
        next_state_values[non_final_mask] = self.model(non_final_next_states).max(1)[0].detach()

        # Q-learning target: r + gamma * max_a Q(s', a)
        expected_state_action_values = reward_batch + GAMMA * next_state_values

        self.model.train()

        # Compute the loss.
        # torch.unsqueeze(x, dim) returns a view with a size-1 dimension inserted at `dim`
        # (the result shares memory with the input; a negative dim is interpreted as
        # dim + input.dim() + 1). For example, for x = torch.ones(4):
        #   torch.unsqueeze(x, 0) has shape (1, 4): [[1, 1, 1, 1]]
        #   torch.unsqueeze(x, 1) has shape (4, 1): [[1], [1], [1], [1]]
        # Here it reshapes the targets to [BATCH_SIZE, 1] to match state_action_values.
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Update the network parameters
        self.optimizer.zero_grad()  # reset gradients
        loss.backward()  # backpropagate
        self.optimizer.step()  # update the parameters

    def decide_action(self, state, episode):
        # epsilon-greedy: epsilon decays as 0.5 / (episode + 1)
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            # Exploit: pick the action with the largest predicted Q-value
            self.model.eval()
            with torch.no_grad():
                action = self.model(state).max(1)[1].view(1, 1)
                # torch.max(x, 1) returns (values, indices) per row:
                #   max(1)[0] gives the maximum values, max(1)[1] gives their indices,
                #   so max(1)[1] is the index of the best action.
                # (.data strips the old Variable wrapper, .numpy() converts to a NumPy
                #  array, .squeeze() drops size-1 dimensions; none of these are needed here.)
                # .view(1, 1) reshapes the result to a 1x1 tensor. In general, view(-1, n)
                # lets one dimension be inferred: for a length-16 tensor x,
                # x.view(-1, 4) equals x.view(4, 4) and x.view(-1, 2) equals x.view(8, 2).
        else:
            # Explore: pick a random action
            action = torch.LongTensor([[random.randrange(self.num_actions)]])

        return action
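To see decide_action in isolation, here is a minimal, hypothetical smoke test; the dummy state values are made up and it only checks output shapes, not learning quality:

brain = Brain(num_states=4, num_actions=2)                 # prints the Sequential summary
dummy_state = torch.FloatTensor([[0.0, 0.1, 0.0, -0.1]])   # made-up 1x4 CartPole state
greedy_action = brain.decide_action(dummy_state, episode=1000)  # epsilon ~ 0.0005, almost always greedy
random_action = brain.decide_action(dummy_state, episode=0)     # epsilon = 0.5, random half the time
print(greedy_action.shape, random_action.shape)            # both torch.Size([1, 1])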
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)

    def update_q_function(self):
        self.brain.replay()  # self.brain is a Brain instance, so we can call its replay() here

    def get_action(self, state, episode):
        action = self.brain.decide_action(state, episode)
        return action

    def memorize(self, state, action, state_next, reward):
        self.brain.memory.push(state, action, state_next, reward)
class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        # Number of state variables and actions for CartPole
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        # Create the Agent that acts in this environment
        self.agent = Agent(num_states, num_actions)

    def run(self):
        episode_10_list = np.zeros(10)  # step counts of the last 10 episodes
        complete_episodes = 0  # number of consecutive successful episodes
        episode_final = False  # True for the final (recorded) episode
        frames = []  # frames for the animation of the final episode

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()  # reset the environment
            state = observation  # use the observation directly as the state s
            # The observation is a NumPy array; convert it to a FloatTensor
            state = torch.from_numpy(state).type(torch.FloatTensor)
            # and reshape it from size 4 to size 1x4 so it forms a mini-batch of one
            state = torch.unsqueeze(state, 0)

            for step in range(MAX_STEPS):
                if episode_final is True:
                    frames.append(self.env.render(mode='rgb_array'))

                # The agent chooses the action for this step
                action = self.agent.get_action(state, episode)

                # reward and info from the environment are not used here, hence the underscores
                observation_next, _, done, _ = self.env.step(action.item())

                if done:
                    state_next = None
                    # Drop the oldest of the 10 stored step counts and append the latest one
                    # (np.hstack concatenates arrays along the horizontal axis)
                    episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                    if step < 195:
                        reward = torch.FloatTensor([-1.0])  # penalize falling over early
                        complete_episodes = 0
                    else:
                        reward = torch.FloatTensor([1.0])  # reward standing for the full episode
                        complete_episodes = complete_episodes + 1
                else:
                    reward = torch.FloatTensor([0.0])
                    state_next = observation_next
                    state_next = torch.from_numpy(state_next).type(torch.FloatTensor)
                    state_next = torch.unsqueeze(state_next, 0)

                # Store the experience in memory
                self.agent.memorize(state, action, state_next, reward)
                # Update the Q-function via replay()
                self.agent.update_q_function()

                state = state_next

                if done:
                    print('%d Episode: Finished after %d steps: mean steps over last 10 trials = %.1f'
                          % (episode, step + 1, episode_10_list.mean()))
                    break

            if episode_final is True:
                display_frames_as_gif(frames)
                break

            if complete_episodes >= 10:
                print('Succeeded 10 times in a row')
                episode_final = True
cartpole_env = Environment()
Sequential(
  (fc1): Linear(in_features=4, out_features=32, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)
cartpole_env.run()
0 Episode: Finished after 10 steps: mean steps over last 10 trials = 1.0
1 Episode: Finished after 13 steps: mean steps over last 10 trials = 2.3
2 Episode: Finished after 9 steps: mean steps over last 10 trials = 3.2
3 Episode: Finished after 10 steps: mean steps over last 10 trials = 4.2
4 Episode: Finished after 9 steps: mean steps over last 10 trials = 5.1
5 Episode: Finished after 9 steps: mean steps over last 10 trials = 6.0
6 Episode: Finished after 10 steps: mean steps over last 10 trials = 7.0
7 Episode: Finished after 9 steps: mean steps over last 10 trials = 7.9
8 Episode: Finished after 9 steps: mean steps over last 10 trials = 8.8
9 Episode: Finished after 10 steps: mean steps over last 10 trials = 9.8
10 Episode: Finished after 12 steps: mean steps over last 10 trials = 10.0
11 Episode: Finished after 11 steps: mean steps over last 10 trials = 9.8
12 Episode: Finished after 13 steps: mean steps over last 10 trials = 10.2
13 Episode: Finished after 10 steps: mean steps over last 10 trials = 10.2
14 Episode: Finished after 10 steps: mean steps over last 10 trials = 10.3
15 Episode: Finished after 10 steps: mean steps over last 10 trials = 10.4
16 Episode: Finished after 15 steps: mean steps over last 10 trials = 10.9
17 Episode: Finished after 16 steps: mean steps over last 10 trials = 11.6
18 Episode: Finished after 22 steps: mean steps over last 10 trials = 12.9
19 Episode: Finished after 21 steps: mean steps over last 10 trials = 14.0
<ipython-input-6-55bc7a6f3a6f>:43: UserWarning: indexing with dtype torch.uint8 is now deprecated, 
please use a dtype torch.bool instead. 
(Triggered internally at  ..\aten\src\ATen/native/IndexingUtils.h:30.)
  next_state_values[non_final_mask] = self.model(non_final_next_states).max(1)[0].detach() #?
20 Episode: Finished after 23 steps: mean steps over last 10 trials = 15.1
21 Episode: Finished after 36 steps: mean steps over last 10 trials = 17.6
22 Episode: Finished after 28 steps: mean steps over last 10 trials = 19.1
23 Episode: Finished after 35 steps: mean steps over last 10 trials = 21.6
24 Episode: Finished after 23 steps: mean steps over last 10 trials = 22.9
25 Episode: Finished after 39 steps: mean steps over last 10 trials = 25.8
26 Episode: Finished after 29 steps: mean steps over last 10 trials = 27.2
27 Episode: Finished after 29 steps: mean steps over last 10 trials = 28.5
28 Episode: Finished after 24 steps: mean steps over last 10 trials = 28.7
29 Episode: Finished after 80 steps: mean steps over last 10 trials = 34.6
30 Episode: Finished after 26 steps: mean steps over last 10 trials = 34.9
31 Episode: Finished after 25 steps: mean steps over last 10 trials = 33.8
32 Episode: Finished after 32 steps: mean steps over last 10 trials = 34.2
33 Episode: Finished after 25 steps: mean steps over last 10 trials = 33.2
34 Episode: Finished after 31 steps: mean steps over last 10 trials = 34.0
35 Episode: Finished after 34 steps: mean steps over last 10 trials = 33.5
36 Episode: Finished after 45 steps: mean steps over last 10 trials = 35.1
37 Episode: Finished after 40 steps: mean steps over last 10 trials = 36.2
38 Episode: Finished after 50 steps: mean steps over last 10 trials = 38.8
39 Episode: Finished after 11 steps: mean steps over last 10 trials = 31.9
40 Episode: Finished after 56 steps: mean steps over last 10 trials = 34.9
41 Episode: Finished after 23 steps: mean steps over last 10 trials = 34.7
42 Episode: Finished after 16 steps: mean steps over last 10 trials = 33.1
43 Episode: Finished after 19 steps: mean steps over last 10 trials = 32.5
44 Episode: Finished after 12 steps: mean steps over last 10 trials = 30.6
45 Episode: Finished after 29 steps: mean steps over last 10 trials = 30.1
46 Episode: Finished after 13 steps: mean steps over last 10 trials = 26.9
47 Episode: Finished after 20 steps: mean steps over last 10 trials = 24.9
48 Episode: Finished after 14 steps: mean steps over last 10 trials = 21.3
49 Episode: Finished after 14 steps: mean steps over last 10 trials = 21.6
50 Episode: Finished after 11 steps: mean steps over last 10 trials = 17.1
51 Episode: Finished after 13 steps: mean steps over last 10 trials = 16.1
52 Episode: Finished after 14 steps: mean steps over last 10 trials = 15.9
53 Episode: Finished after 31 steps: mean steps over last 10 trials = 17.1
54 Episode: Finished after 19 steps: mean steps over last 10 trials = 17.8
55 Episode: Finished after 29 steps: mean steps over last 10 trials = 17.8
56 Episode: Finished after 33 steps: mean steps over last 10 trials = 19.8
57 Episode: Finished after 58 steps: mean steps over last 10 trials = 23.6
58 Episode: Finished after 40 steps: mean steps over last 10 trials = 26.2
59 Episode: Finished after 38 steps: mean steps over last 10 trials = 28.6
60 Episode: Finished after 36 steps: mean steps over last 10 trials = 31.1
61 Episode: Finished after 47 steps: mean steps over last 10 trials = 34.5
62 Episode: Finished after 52 steps: mean steps over last 10 trials = 38.3
63 Episode: Finished after 36 steps: mean steps over last 10 trials = 38.8
64 Episode: Finished after 31 steps: mean steps over last 10 trials = 40.0
65 Episode: Finished after 76 steps: mean steps over last 10 trials = 44.7
66 Episode: Finished after 40 steps: mean steps over last 10 trials = 45.4
67 Episode: Finished after 24 steps: mean steps over last 10 trials = 42.0
68 Episode: Finished after 51 steps: mean steps over last 10 trials = 43.1
69 Episode: Finished after 53 steps: mean steps over last 10 trials = 44.6
70 Episode: Finished after 34 steps: mean steps over last 10 trials = 44.4
71 Episode: Finished after 31 steps: mean steps over last 10 trials = 42.8
72 Episode: Finished after 34 steps: mean steps over last 10 trials = 41.0
73 Episode: Finished after 51 steps: mean steps over last 10 trials = 42.5
74 Episode: Finished after 46 steps: mean steps over last 10 trials = 44.0
75 Episode: Finished after 42 steps: mean steps over last 10 trials = 40.6
76 Episode: Finished after 50 steps: mean steps over last 10 trials = 41.6
77 Episode: Finished after 32 steps: mean steps over last 10 trials = 42.4
78 Episode: Finished after 37 steps: mean steps over last 10 trials = 41.0
79 Episode: Finished after 45 steps: mean steps over last 10 trials = 40.2
80 Episode: Finished after 67 steps: mean steps over last 10 trials = 43.5
81 Episode: Finished after 41 steps: mean steps over last 10 trials = 44.5
82 Episode: Finished after 57 steps: mean steps over last 10 trials = 46.8
83 Episode: Finished after 77 steps: mean steps over last 10 trials = 49.4
84 Episode: Finished after 39 steps: mean steps over last 10 trials = 48.7
85 Episode: Finished after 51 steps: mean steps over last 10 trials = 49.6
86 Episode: Finished after 61 steps: mean steps over last 10 trials = 50.7
87 Episode: Finished after 81 steps: mean steps over last 10 trials = 55.6
88 Episode: Finished after 63 steps: mean steps over last 10 trials = 58.2
89 Episode: Finished after 84 steps: mean steps over last 10 trials = 62.1
90 Episode: Finished after 200 steps: mean steps over last 10 trials = 75.4
91 Episode: Finished after 58 steps: mean steps over last 10 trials = 77.1
92 Episode: Finished after 57 steps: mean steps over last 10 trials = 77.1
93 Episode: Finished after 53 steps: mean steps over last 10 trials = 74.7
94 Episode: Finished after 109 steps: mean steps over last 10 trials = 81.7
95 Episode: Finished after 82 steps: mean steps over last 10 trials = 84.8
96 Episode: Finished after 61 steps: mean steps over last 10 trials = 84.8
97 Episode: Finished after 50 steps: mean steps over last 10 trials = 81.7
98 Episode: Finished after 156 steps: mean steps over last 10 trials = 91.0
99 Episode: Finished after 162 steps: mean steps over last 10 trials = 98.8
100 Episode: Finished after 200 steps: mean steps over last 10 trials = 98.8
101 Episode: Finished after 92 steps: mean steps over last 10 trials = 102.2
102 Episode: Finished after 90 steps: mean steps over last 10 trials = 105.5
103 Episode: Finished after 130 steps: mean steps over last 10 trials = 113.2
104 Episode: Finished after 147 steps: mean steps over last 10 trials = 117.0
105 Episode: Finished after 119 steps: mean steps over last 10 trials = 120.7
106 Episode: Finished after 186 steps: mean steps over last 10 trials = 133.2
107 Episode: Finished after 200 steps: mean steps over last 10 trials = 148.2
108 Episode: Finished after 110 steps: mean steps over last 10 trials = 143.6
109 Episode: Finished after 111 steps: mean steps over last 10 trials = 138.5
110 Episode: Finished after 159 steps: mean steps over last 10 trials = 134.4
111 Episode: Finished after 200 steps: mean steps over last 10 trials = 145.2
112 Episode: Finished after 200 steps: mean steps over last 10 trials = 156.2
113 Episode: Finished after 200 steps: mean steps over last 10 trials = 163.2
114 Episode: Finished after 200 steps: mean steps over last 10 trials = 168.5
115 Episode: Finished after 200 steps: mean steps over last 10 trials = 176.6
116 Episode: Finished after 200 steps: mean steps over last 10 trials = 178.0
117 Episode: Finished after 200 steps: mean steps over last 10 trials = 178.0
118 Episode: Finished after 200 steps: mean steps over last 10 trials = 187.0
119 Episode: Finished after 200 steps: mean steps over last 10 trials = 195.9
120 Episode: Finished after 200 steps: mean steps over last 10 trials = 200.0
Succeeded 10 times in a row
121 Episode: Finished after 200 steps: mean steps over last 10 trials = 200.0

(Rendered animation of the trained CartPole agent; also saved as movie_cartpole_DQN.mp4)

