Removed deprecated usage of tf.math.reduce_sum() #62

Open · wants to merge 9 commits into base: master
README.md: 2 changes (1 addition & 1 deletion)
@@ -110,6 +110,6 @@ INITIAL_EPSILON = 0.1
## Disclaimer
This work is highly based on the following repos:

- 1. [sourabhv/FlapPyBird] (https://github.com/sourabhv/FlapPyBird)
+ 1. [sourabhv/FlapPyBird](https://github.com/sourabhv/FlapPyBird)
2. [asrivat1/DeepLearningVideoGames](https://github.com/asrivat1/DeepLearningVideoGames)

deep_q_network.py: 4 changes (2 additions & 2 deletions)
@@ -79,7 +79,7 @@ def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
-   readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
+   readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

@@ -150,7 +150,7 @@ def trainNetwork(s, readout, h_fc1, sess):
        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
-           minibatch = random.sample(D, BATCH)
+           minibatch = random.sample(list(D), BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
deep_q_network_opt_v1.py: 293 changes (293 additions & 0 deletions, new file)
@@ -0,0 +1,293 @@
#!/usr/bin/env python
#*

# distinction between dqn (vanilla) and this:
# we want to formulate an addition to the state, s, called "delta s" (ds), which will cause the Q value of s+ds and a
# target action, a^t, to be higher than the Q value of s+ds and any other action. The implication is that adding ds
# to our input will cause the target action to be drawn. In cases where ds is not added / nothing is changed, the
# controller will behave normally.
# In order to retain the natural behaviour of the DQN controller, we will not be changing the weights (or the
# controller, for that matter); we will instead minimize the loss between Q(s+ds, a^t) and Q(s+ds, a), where a^t is
# the target action and a is any non-target action. The loss function will be (some) hinge loss,
# l(a, b) = max(b - a + eps, 0), which essentially enforces the condition a >= b + eps.
# (A short sketch of this objective follows the constants below.)
# An informal argument for why this can work: the controller only learns to behave well on the set of input states
# it has been trained on, called (here) the game-possible pixel space. This is the set of frames the game can
# generate under any scenario within the game. The game-possible pixel space is small relative to the pixel space,
# i.e. the space of all possible combinations of pixel intensities that a screen can generate. Given that ds belongs
# to the pixel space and s belongs to the game-possible pixel space, s+ds belongs to the pixel space, which the
# controller may not know how to handle; if abused properly, this may enable a simple adversarial attack.

from __future__ import print_function
import tensorflow as tf
import cv2
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque

# constants for adversarial optimization:
# flap = [0, 1]
# noaction = [1, 0]
action_target = [0, 1]
LR = 0.01 # learning rate for optimizing ds
# number of time steps in which to calculate ds, first 10 frames of game
INTERVAL = 10

GAME = 'bird' # the name of the game being played for log files
ACTIONS = 2 # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVE = 100000. # timesteps to observe before training
EXPLORE = 2000000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.0001 # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
FRAME_PER_ACTION = 1
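
# A minimal sketch of the hinge objective described at the top of this file, assuming the
# two-action setup above (ACTIONS == 2) and that `q_values` is the [batch, ACTIONS] readout
# of the frozen network evaluated at s + ds. The names hinge_attack_loss, q_values and
# target_index are illustrative only; the optimization code below builds its loss inline.
def hinge_attack_loss(q_values, target_index, eps=1.0):
    q_target = q_values[:, target_index]       # a = Q(s+ds, a^t), the target action
    q_other = q_values[:, 1 - target_index]    # b = Q(s+ds, a), the non-target action
    # l(a, b) = max(b - a + eps, 0): zero exactly when a >= b + eps, i.e. the target wins by a margin
    return tf.reduce_mean(tf.nn.relu(q_other - q_target + eps))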

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev = 0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.01, shape = shape)
    return tf.Variable(initial)

def conv2d(x, W, stride):
    return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")

def createNetwork():
    # network weights
    W_conv1 = weight_variable([8, 8, 4, 32])
    b_conv1 = bias_variable([32])

    W_conv2 = weight_variable([4, 4, 32, 64])
    b_conv2 = bias_variable([64])

    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])

    W_fc1 = weight_variable([1600, 512])
    b_fc1 = bias_variable([512])

    W_fc2 = weight_variable([512, ACTIONS])
    b_fc2 = bias_variable([ACTIONS])

    # input layer
    s = tf.placeholder("float", [None, 80, 80, 4])

    # hidden layers
    h_conv1 = tf.nn.relu(conv2d(s, W_conv1, 4) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, 2) + b_conv2)
    #h_pool2 = max_pool_2x2(h_conv2)

    h_conv3 = tf.nn.relu(conv2d(h_conv2, W_conv3, 1) + b_conv3)
    #h_pool3 = max_pool_2x2(h_conv3)

    #h_pool3_flat = tf.reshape(h_pool3, [-1, 256])
    h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])

    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

    # readout layer
    readout = tf.matmul(h_fc1, W_fc2) + b_fc2

    return s, readout, h_fc1

def trainNetwork(s, readout, h_fc1, delta_s, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
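    # `a` is fed a one-hot action mask, so readout_action is Q(s, a) for the action actually
    # taken; `y` is the bootstrapped target r + GAMMA * max_a' Q(s', a') in vanilla DQN
    # training (the corresponding train_step.run call is commented out further down).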

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
while "flappy bird" != "angry bird":
# choose an action epsilon greedily
readout_t = readout.eval(feed_dict={s : [s_t]})[0]
a_t = np.zeros([ACTIONS])
action_index = 0
if t % FRAME_PER_ACTION == 0:
if random.random() <= epsilon:
print("----------Random Action----------")
action_index = random.randrange(ACTIONS)
a_t[random.randrange(ACTIONS)] = 1
else:
action_index = np.argmax(readout_t)
a_t[action_index] = 1
else:
a_t[0] = 1 # do nothing

# scale down epsilon
if epsilon > FINAL_EPSILON and t > OBSERVE:
epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

# run the selected action and observe next state and reward
x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
x_t1 = np.reshape(x_t1, (80, 80, 1))
#s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

# store the transition in D
D.append((s_t, a_t, r_t, s_t1, terminal))
if len(D) > REPLAY_MEMORY:
D.popleft()

        # only start optimizing ds once INTERVAL frames have been collected
        if t > INTERVAL:
            # sample a minibatch to optimize on
            opt_batch = random.sample(list(D), INTERVAL)

            # get the batch variables
            s_opt_batch = [d[0] for d in opt_batch]

            s_ds = np.ndarray((INTERVAL, 80, 80, 4), dtype=float) # forward init.

            # a_batch = [d[1] for d in minibatch]
            # r_batch = [d[2] for d in minibatch]
            # s_j1_batch = [d[3] for d in minibatch]

            # y_batch = []
            # readout_j_batch = readout.eval(feed_dict = {s : s_j_batch})
            # readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            # for i in range(0, len(minibatch)):
            #     terminal = minibatch[i][4]
            #     # if terminal, only equals reward
            #     if terminal:
            #         y_batch.append(r_batch[i])
            #     else:
            #         y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # IN ATTACK FORMULATION NOT NECESSARY, WE WANT TO RETAIN WEIGHTS, BUT TAKE THE PARAMS INTO ATTACK OPERATION
            # perform gradient step
            # train_step.run(feed_dict = {
            #     y : y_batch,
            #     a : a_batch,
            #     s : s_j_batch}
            # )

            # taking params into attack operation:
            # take the batch of (s_t, a_t, r_t, s_t1) produced by the normal controller and, by modulating ds across
            # the batch, reduce the summed (expected) loss between Q(s+ds, a^t) and Q(s+ds, a), where a^t != a.
            # subjects: batch of s_t, a_t, r_t, s_t1, of size BATCH.
            # Note: training of the controller is complete, so in theory we shouldn't be using this as an SGD batch anymore.
            #
            #
            # theory/idea:
            # RL is less susceptible to a stationary attack since reproducing a setting is challenging. We can go
            # through an interaction, record it, and then produce an optimization which abuses that original
            # interaction, but when will that come in handy? In CV, repeated inputs are usually easy to produce, but
            # in an RL setting there isn't an opportunity for repeated input.
            # if we find that one adversarial input transfers well to other similar images, maybe we can make a case here.

            # input and noise which should result in the target action being drawn
            s_ds = np.reshape(s_opt_batch, [INTERVAL, 80, 80, 4]) + delta_s

            # Q values for both actions at state s + ds for the entire batch
            # you just need to feed into s
            # ds is generated automatically
            # then you use s+ds as your new input
            # talk after meeting have Q's
            readout_s_ds = readout.eval(feed_dict={s : s_opt_batch})
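            # note: this eval feeds the unperturbed s_opt_batch into `s`; s_ds built above is a
            # graph tensor (it depends on the tf.Variable delta_s), so it is not what is being
            # evaluated here.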

            # readout(s) = [Q(no flap), Q(flap)]
            # a = readout[target_action]
            a = readout_s_ds[:, 0]

            # b = readout[!target_action]
            b = readout_s_ds[:, 1]

            eps = 1 # Q values are typically 10-30
            loss = tf.nn.relu(b - a + eps)
            opt = tf.train.AdamOptimizer(LR).minimize(loss, var_list=[delta_s])
            # opt.run(feed_dict={})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''

def playGame():
    sess = tf.InteractiveSession()
    s, readout, h_fc1 = createNetwork()

    # subject of optimization
    # gaussian noise, one set of perturbations which will be added to 4 frames
    # same noise across every frame within a stack within the data set
    # image data is not normalized, so the gaussian distribution must be scaled to [0, 255]
    # tensor = np.random.normal(loc=(255/2), scale=(255*(0.01**0.5)), size=(INTERVAL, 80, 80, 4))
    tensor = tf.random.normal([INTERVAL, 80, 80, 4], mean=(255.0 / 2), stddev=(255 * (0.01 ** 0.5)))
    delta_s = tf.Variable(initial_value=tensor, trainable=True)
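    # delta_s is the trainable perturbation: trainNetwork minimizes the hinge loss with respect
    # to it (var_list=[delta_s]) and never runs the DQN train_step, so the controller's own
    # weights stay fixed.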

    trainNetwork(s, readout, h_fc1, delta_s, sess)

def main():
    playGame()

if __name__ == "__main__":
    main()