# 004_blackjack.py
import collections, util, math, random
############################################################
class ValueIteration(util.MDPAlgorithm):
'''
Solve the MDP using value iteration. Your solve() method must set
- self.V to the dictionary mapping states to optimal values
- self.pi to the dictionary mapping states to an optimal action
Note: epsilon is the error tolerance: you should stop value iteration when
all of the values change by less than epsilon.
The ValueIteration class is a subclass of util.MDPAlgorithm (see util.py).
'''
def solve(self, mdp, epsilon=0.001):
mdp.computeStates()
def computeQ(mdp, V, state, action):
# Return Q(state, action) based on V(state).
return sum(prob * (reward + mdp.discount() * V[newState]) \
for newState, prob, reward in mdp.succAndProbReward(state, action))
def computeOptimalPolicy(mdp, V):
# Return the optimal policy given the values V.
pi = {}
for state in mdp.states:
pi[state] = max((computeQ(mdp, V, state, action), action) for action in mdp.actions(state))[1]
return pi
V = collections.Counter() # state -> value of state
numIters = 0
while True:
newV = {}
for state in mdp.states:
newV[state] = max(computeQ(mdp, V, state, action) for action in mdp.actions(state))
numIters += 1
if max(abs(V[state] - newV[state]) for state in mdp.states) < epsilon:
V = newV
break
V = newV
# Compute the optimal policy now
pi = computeOptimalPolicy(mdp, V)
print "ValueIteration: %d iterations" % numIters
self.pi = pi
self.V = V
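# A minimal usage sketch for the solver above (commented out; it assumes the
# util.MDP / util.MDPAlgorithm interfaces from util.py). After solve() returns,
# self.V maps each state to its optimal value and self.pi maps each state to an
# optimal action:
#
#   vi = ValueIteration()
#   vi.solve(CounterexampleMDP())   # any util.MDP subclass works here
#   print vi.V[CounterexampleMDP().startState()], vi.pi[CounterexampleMDP().startState()]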
############################################################
# Problem 2a
# If you decide 2a is true, prove it in writeup.pdf and put "return None" for
# the code blocks below. If you decide that 2a is false, construct a counterexample.
class CounterexampleMDP(util.MDP):
def startState(self):
return 0
# Return set of actions possible from |state|.
def actions(self, state):
if state == 0:
return ['action']
else:
return ['terminate']
#def isEnd(self, state):
# return (state == 1 or state == -1)
# Return a list of (newState, prob, reward) tuples corresponding to edges
# coming out of |state|.
def succAndProbReward(self, state, action):
results = [] # state, probability, reward
if not (state == 1 or state == -1):
results.append( ( -1, .9, 10 ) )
results.append( ( 1, .1, 1000 ) )
# test:
#results.append( ( -1, .7, 10 ) )
#results.append( ( 1, .3, 1000 ) )
return results
def discount(self):
return 1
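# Arithmetic check for the MDP above (follows directly from succAndProbReward):
# from state 0 the single action leads to -1 with probability 0.9 (reward 10)
# or to 1 with probability 0.1 (reward 1000), and both successors are terminal,
# so with discount 1 the optimal value of the start state is
#
#   V(0) = 0.9 * 10 + 0.1 * 1000 = 109
#
# With the commented "test" probabilities (0.7 / 0.3) the same computation
# gives 0.7 * 10 + 0.3 * 1000 = 307.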
############################################################
# Problem 3a
class BlackjackMDP(util.MDP):
def __init__(self, cardValues, multiplicity, threshold, peekCost):
"""
cardValues: array of card values for each card type
multiplicity: number of each card type
threshold: maximum total before going bust
peekCost: how much it costs to peek at the next card
"""
self.cardValues = cardValues
self.multiplicity = multiplicity
self.threshold = threshold
self.peekCost = peekCost
# Return the start state.
# Look at this function to learn about the state representation.
# The first element of the tuple is the sum of the cards in the player's
# hand.
    # The second element is the index (not the value) of the next card, if the
    # player peeked in the last action. If they didn't peek, this will be None.
# The final element is the current deck.
def startState(self):
# total, next card (if any), multiplicity for each card
return (0, None, (self.multiplicity,) * len(self.cardValues))
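    # Concrete example of this representation: for
    # BlackjackMDP(cardValues=[1, 5], multiplicity=2, ...) the start state is
    # (0, None, (2, 2)) -- total 0, no peeked card, two copies of each card
    # type. A terminal state is marked by a deck of None, e.g. (6, None, None).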
# Return set of actions possible from |state|.
# You do not need to modify this function.
# All logic for dealing with end states should be done in succAndProbReward
def actions(self, state):
return ['Take', 'Peek', 'Quit']
# Return a list of (newState, prob, reward) tuples corresponding to edges
# coming out of |state|. Indicate a terminal state (after quitting or
# busting) by setting the deck to None.
# When the probability is 0 for a particular transition, don't include that
# in the list returned by succAndProbReward.
def succAndProbReward(self, state, action):
results = [] # successor-state, probability, reward
cardCounts = state[2]
        if cardCounts is None:  # a deck of None marks a terminal state (quit, bust, or deck exhausted)
return []
totalCardsLeft = float(sum( cardCounts ))
        didPeek = (state[1] is not None)
        if action == 'Take':
if( didPeek ): #peeked on last round...
newCardCounts = list(cardCounts)
newCardCounts[ state[1] ] = newCardCounts[ state[1] ] - 1
if( newCardCounts[ state[1] ] < 0 ): newCardCounts[ state[1] ] = 0.0
newHoldingValue = state[0] + self.cardValues[ state[1] ]
cardsLeftNow = float(sum( newCardCounts ))
busted = (newHoldingValue > self.threshold)
if( cardsLeftNow == 0.0 and not busted): #no more cards left!
return [ ( (newHoldingValue, None, None ), 1.0, newHoldingValue ) ]
elif( cardsLeftNow == 0.0 and busted): #no more cards but busted ...what a silly silly robot.
#print "BUSTED AT LAST CARD AFTER PEEKING!"
return [ ( ( newHoldingValue, None, None ), 1.0, 0.0 ) ]
elif not busted and cardsLeftNow > 0: #not busted, and still more cards
return [ ( ( newHoldingValue, None, tuple(newCardCounts) ), 1.0, 0.0) ]
elif busted and cardsLeftNow > 0: #bust after peeking? really not a very smart robot.
#print "BUSTED AFTER PEEKING!"
return [ ( ( newHoldingValue, None, None ), 1.0, 0.0 ) ]
else: #didn't peek last round...
for i, cardValue in enumerate(self.cardValues):
if( cardCounts[i] > 0 ):
newCardCounts = list(cardCounts)
newCardCounts[ i ] = newCardCounts[ i ] - 1
newHoldingValue = state[0] + cardValue
cardsLeftNow = float(sum( newCardCounts ))
busted = (newHoldingValue > self.threshold)
if busted: #bust
results.append( ( ( newHoldingValue, None, None ), cardCounts[i]/totalCardsLeft, 0.0 ) )
elif( cardsLeftNow <= 0.0 ): #no more cards left!
results.append( ( (newHoldingValue, None, None ), 1.0, newHoldingValue ) )
elif not busted and cardsLeftNow > 0: #not busted, still more cards:
results.append( ((newHoldingValue, None, tuple(newCardCounts)), cardCounts[i]/totalCardsLeft, 0.0 ) )
        if action == 'Peek':
if( didPeek ): #peeked last time, trying to peek twice? bad robot.
return []
else:
for i, cardValue in enumerate(self.cardValues):
if( cardCounts[i] > 0 ):
busted = (state[0] > self.threshold)
if not busted: #not busted
results.append( ((state[0], i, cardCounts), cardCounts[i]/totalCardsLeft, -self.peekCost ) )
else: #busted
results.append( ( (state[0], None, None ), cardCounts[i]/totalCardsLeft, -self.peekCost ) )
        if action == 'Quit':
results = [ ( (state[0], None, None ), 1.0, state[0] ) ]
return results
# END_YOUR_CODE
def discount(self):
return 1
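# Hand trace of succAndProbReward for a small instance (commented out; the
# tuples below are read off the code above and are worth re-checking by
# running). With mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2,
# threshold=10, peekCost=1) and the start state (0, None, (2, 2)):
#
#   mdp.succAndProbReward((0, None, (2, 2)), 'Take')
#   # -> [((1, None, (1, 2)), 0.5, 0.0), ((5, None, (2, 1)), 0.5, 0.0)]
#   mdp.succAndProbReward((0, None, (2, 2)), 'Peek')
#   # -> [((0, 0, (2, 2)), 0.5, -1), ((0, 1, (2, 2)), 0.5, -1)]
#   mdp.succAndProbReward((0, None, (2, 2)), 'Quit')
#   # -> [((0, None, None), 1.0, 0)]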
############################################################
# Problem 3b
def peekingMDP():
"""
Return an instance of BlackjackMDP where peeking is the optimal action at
least 10% of the time.
"""
return BlackjackMDP([2,3,19], 7, 20, 1)
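# Why these numbers make peeking attractive: with card values [2, 3, 19] and
# threshold 20, drawing the 19 busts any non-empty hand, so paying the peek
# cost of 1 to learn whether the next card is the 19 is often worth it. A
# commented sketch for checking the "optimal at least 10% of the time" claim
# (assuming ValueIteration and util.MDP.computeStates as above):
#
#   mdp = peekingMDP()
#   vi = ValueIteration()
#   vi.solve(mdp)
#   peekStates = sum(1 for a in vi.pi.values() if a == 'Peek')
#   print "fraction of states where peeking is optimal:", float(peekStates) / len(vi.pi)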
############################################################
# Problem 4a: Q learning
# Performs Q-learning. Read util.RLAlgorithm for more information.
# actions: a function that takes a state and returns a list of actions.
# discount: a number between 0 and 1, which determines the discount factor
# featureExtractor: a function that takes a state and action and returns
# a list of (feature name, feature value) pairs.
# explorationProb: the epsilon value indicating how frequently the policy
# returns a random action
class QLearningAlgorithm(util.RLAlgorithm):
def __init__(self, actions, discount, featureExtractor, explorationProb=0.2):
self.actions = actions
self.discount = discount
self.featureExtractor = featureExtractor
self.explorationProb = explorationProb
self.weights = collections.Counter()
self.numIters = 0
self.lastETA = 0
# Return the Q function associated with the weights and features
def getQ(self, state, action):
score = 0.0
for f, v in self.featureExtractor(state, action):
score += self.weights[f] * v
return score
# This algorithm will produce an action given a state.
# Here we use the epsilon-greedy algorithm: with probability
# |explorationProb|, take a random action.
def getAction(self, state):
self.numIters += 1
if random.random() < self.explorationProb:
return random.choice(self.actions(state))
else:
return max((self.getQ(state, action), action) for action in self.actions(state))[1]
# Call this function to get the step size to update the weights.
def getStepSize(self):
return 1.0 / math.sqrt(self.numIters)
    # Helper: sum of value * featureValue over the feature list phi.
    def multF(self, value, phi):
score = 0.0
for f, v in phi:
score += value * v
return score
# We will call this function with (s, a, r, s'), which you should use to update |weights|.
# Note that if s is a terminal state, then s' will be None. Remember to check for this.
# You should update the weights using self.getStepSize(); use
# self.getQ() to compute the current estimate of the parameters.
#
def incorporateFeedback(self, state, action, reward, newState):
eta = self.getStepSize()
dsc = self.discount
V_opt = 0.0
        if newState is not None:
V_opt = max(self.getQ(newState, actn) for actn in self.actions(newState))
predict = self.getQ(state, action )
target = reward + dsc * V_opt
features = self.featureExtractor(state, action)
for phi in features:
self.weights[phi[0]] += -eta*(predict-target) * phi[1]
# for f, v in features:
# #self.weights[f] = (1.0-eta) * Q_opt + eta * (reward + dsc * V_opt)
# self.weights[f] = self.weights[f] - eta *( predict - target ) * v
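# The loop above implements the standard Q-learning update with linear function
# approximation: for every feature (f, v) returned by featureExtractor(s, a),
#
#   w[f] <- w[f] - eta * (Q_hat(s, a) - (r + discount * V_opt(s'))) * v
#
# where V_opt(s') = max over a' of Q_hat(s', a'), taken to be 0 when s' is None
# (terminal). Tiny numeric illustration with made-up numbers: a single feature
# of value 1, w[f] = 0, eta = 0.5, reward 4, discount 1 and V_opt(s') = 0 give
# prediction 0 and target 4, so the new weight is 0 - 0.5 * (0 - 4) * 1 = 2.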
# Return a singleton list containing indicator feature for the (state, action)
# pair. Provides no generalization.
def identityFeatureExtractor(state, action):
featureKey = (state, action)
featureValue = 1
return [(featureKey, featureValue)]
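# Example of the identity features (read directly off the function above):
#
#   identityFeatureExtractor((0, None, (2, 2)), 'Take')
#   # -> [(((0, None, (2, 2)), 'Take'), 1)]
#
# One indicator per distinct (state, action) pair, which is why this extractor
# gives no generalization across states.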
# ############################################################
# # Problem 4b: convergence of Q-learning
# #
# # "Small" test case ;)
#
# smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
#
# mdp = smallMDP
#
# print "\n\n\n######################################"
# print "######## s m a l l * m d p ###########"
# print "######################################\n"
#
# rl = QLearningAlgorithm(mdp.actions, mdp.discount(), identityFeatureExtractor, .2 )
#
# # simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False):
#
# trials = util.simulate(mdp, rl, 30000, 1000, False, True )
#
# rl.explorationProb = 0
#
# print "Average rewards of Simulate: "
# print sum(trials) / float(len(trials))
#
# mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
# vi = ValueIteration()
# vi.solve(mdp)
#
# print "Average rewards of Value Iteration: "
# print sum(vi.V.values() ) / float(len(vi.V.values()))
# #
# # if( list(mdp.states) == vi.pi.keys() ):
# # print "mdp states and value iteration states ARE THE SAME!\n"
# #
# # if( vi.V.keys() == vi.pi.keys() ):
# # print "V.keys and vi.pi.keys ARE THE SAME!\n"
# #
# #print "vi.V.keys() == ", vi.V.keys()
#
# print "Differences between Simulate (and) Value Iteration: "
# count = 0
# for state, action in vi.pi.iteritems():
# rlaction = rl.getAction( state )
# if( action != rlaction):
# count += 1
# #print "\n"
# print "**Value Iteration Policy: \t"+action +": ", state
# print "**Simulate Policy: \t\t" +rlaction+": " , state
# #print "\n"
# #else:
# # print "Same Value Iteration Result: \t"+action +": ", state
# # print "Same Simulate Result: \t\t" +rlaction+": " , state
#
# print "\nsmall mdp: TOTAL NUM DIFFERENCES BTWN SIM AND VI: " + str( count )
# print "percent error: ", float(count) / float(len(vi.V.keys()))
#
# #print sum( vi.pi.values() ) / float(len( vi.pi.values() ))
#
#
# # Large test case
# largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
#
# mdp = largeMDP
#
# print "\n######################################"
# print "######## L A R G E * M D P ###########"
# print "######################################\n"
#
# rl = QLearningAlgorithm(mdp.actions, mdp.discount(), identityFeatureExtractor, .2 )
#
# # simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False):
#
# trials = util.simulate(mdp, rl, 30000, 1000, False, True )
#
#
# rl.explorationProb = 0
#
# print "Average rewards of Simulate: "
# print sum(trials) / float(len(trials))
#
# mdp = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
# vi = ValueIteration()
# vi.solve(mdp)
#
# print "Average rewards of Value Iteration: "
# print sum(vi.V.values() ) / float(len(vi.V.values()))
# #
# # if( list(mdp.states) == list(vi.pi.keys()) ):
# # print "mdp states and value iteration states ARE THE SAME!\n"
# # else:
# # print "MDP STATES AND VI STATES NOT SAME? MPD # states = " + str(len(list(mdp.states))) +" Vi num states = " +str(len(vi.pi.keys()))
# #
# # if( vi.V.keys() == vi.pi.keys() ):
# # print "V.keys and vi.pi.keys ARE THE SAME!\n"
#
# # print "vi.V.keys() == ", vi.V.keys()
#
# print "Differences between Simulate (and) Value Iteration: "
# limitCount = 0
# for state, action in vi.pi.iteritems():
# rlaction = rl.getAction( state )
# if( action != rlaction):
# limitCount += 1
# if limitCount <= 10:
# #print "\n"
# print "**Value Iteration Policy: \t"+action +": ", state
# print "**Simulate Policy: \t\t" +rlaction+": " , state
# #print "\n"
#
# print "\nLarge Mdp: TOTAL NUM DIFFERENCES BTWN SIM AND VI: " + str( limitCount )
# print "percent error: ", float(limitCount) / float(len(vi.V.keys()))
#
# #else:
# # print "Same Value Iteration Result: \t"+action +": ", state
# # print "Same Simulate Result: \t\t" +rlaction+": " , state
#
#
############################################################
# Problem 4c: features for Q-learning.
# You should return a list of (feature key, feature value) pairs (see
# identityFeatureExtractor()).
# Implement the following features:
# - indicator on the total and the action (1 feature).
# - indicator on the presence/absence of each card and the action (1 feature).
# Example: if the deck is (3, 4, 0 , 2), then your indicator on the presence
# of each card is (1,1,0,1)
# Only add this feature if the deck != None
# - indicator on the number of cards for each card type and the action (len(counts) features).
# Only add these features if the deck != None
def blackjackFeatureExtractor(state, action):
total, nextCard, counts = state
features = []
featureValue = 1.0
f = float(total)
features.append( ( ( 'total_f', f, action), featureValue ) )
    if counts is not None:
        binary = []
        for index, value in enumerate(counts):
            binary.append( float(value > 0) )
            f = (float(index), float(value))
            features.append( ( ('counting_f', f, action), featureValue ) )
f = tuple(binary)
features.append( ( ( "binary_f", f, action), featureValue ) )
return features
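# Hand trace of the extractor above (commented out; derived by reading the
# code, worth re-checking by running). For state (5, None, (1, 0)) and action
# 'Take':
#
#   blackjackFeatureExtractor((5, None, (1, 0)), 'Take')
#   # -> [(('total_f', 5.0, 'Take'), 1.0),
#   #     (('counting_f', (0.0, 1.0), 'Take'), 1.0),
#   #     (('counting_f', (1.0, 0.0), 'Take'), 1.0),
#   #     (('binary_f', (1.0, 0.0), 'Take'), 1.0)]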
#
#
# smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
#
# mdp = smallMDP
#
# print "\n\n\n######################################"
# print "######## s m a l l * m d p ###########"
# print "######################################\n"
#
# rl = QLearningAlgorithm(mdp.actions, mdp.discount(), blackjackFeatureExtractor, .2 )
#
# # simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False):
#
# trials = util.simulate(mdp, rl, 30000, 1000, False, True )
#
# rl.explorationProb = 0
#
# print "Average rewards of Simulate: "
# print sum(trials) / float(len(trials))
#
# mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
# vi = ValueIteration()
# vi.solve(mdp)
#
# print "Average rewards of Value Iteration: "
# print sum(vi.V.values() ) / float(len(vi.V.values()))
# #
# # if( list(mdp.states) == vi.pi.keys() ):
# # print "mdp states and value iteration states ARE THE SAME!\n"
# #
# # if( vi.V.keys() == vi.pi.keys() ):
# # print "V.keys and vi.pi.keys ARE THE SAME!\n"
# #
# #print "vi.V.keys() == ", vi.V.keys()
#
# print "Differences between Simulate (and) Value Iteration: "
# count = 0
# for state, action in vi.pi.iteritems():
# rlaction = rl.getAction( state )
# if( action != rlaction):
# count += 1
# #print "\n"
# print "**Value Iteration Policy: \t"+action +": ", state
# print "**Simulate Policy: \t\t" +rlaction+": " , state
# #print "\n"
# #else:
# # print "Same Value Iteration Result: \t"+action +": ", state
# # print "Same Simulate Result: \t\t" +rlaction+": " , state
#
# print "\nsmall mdp: TOTAL NUM DIFFERENCES BTWN SIM AND VI: " + str( count )
# print "percent error: ", float(count) / float(len(vi.V.keys()))
#
# #print sum( vi.pi.values() ) / float(len( vi.pi.values() ))
#
#
# # Large test case
# largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
#
# mdp = largeMDP
#
# print "\n######################################"
# print "######## L A R G E * M D P ###########"
# print "######################################\n"
#
# rl = QLearningAlgorithm(mdp.actions, mdp.discount(), blackjackFeatureExtractor, .2 )
#
# # simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False):
#
# trials = util.simulate(mdp, rl, 30000, 1000, False, True )
#
#
# rl.explorationProb = 0
#
# print "Average rewards of Simulate: "
# print sum(trials) / float(len(trials))
#
# mdp = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
# vi = ValueIteration()
# vi.solve(mdp)
#
# print "Average rewards of Value Iteration: "
# print sum(vi.V.values() ) / float(len(vi.V.values()))
# #
# # if( list(mdp.states) == list(vi.pi.keys()) ):
# # print "mdp states and value iteration states ARE THE SAME!\n"
# # else:
# # print "MDP STATES AND VI STATES NOT SAME? MPD # states = " + str(len(list(mdp.states))) +" Vi num states = " +str(len(vi.pi.keys()))
# #
# # if( vi.V.keys() == vi.pi.keys() ):
# # print "V.keys and vi.pi.keys ARE THE SAME!\n"
#
# # print "vi.V.keys() == ", vi.V.keys()
#
# print "Differences between Simulate (and) Value Iteration: "
# limitCount = 0
# for state, action in vi.pi.iteritems():
# rlaction = rl.getAction( state )
# if( action != rlaction):
# limitCount += 1
# if limitCount <= 10:
# #print "\n"
# print "**Value Iteration Policy: \t"+action +": ", state
# print "**Simulate Policy: \t\t" +rlaction+": " , state
# #print "\n"
#
# print "\nLarge Mdp: TOTAL NUM DIFFERENCES BTWN SIM AND VI: " + str( limitCount )
# print "percent error: ", float(limitCount) / float(len(vi.V.keys()))
#
# #else:
# # print "Same Value Iteration Result: \t"+action +": ", state
# # print "Same Simulate Result: \t\t" +rlaction+": " , state
#
#
#
#
#
############################################################
# Problem 4d: What happens when the MDP changes underneath you?!
#
# # Original mdp
# originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
#
#
# mdp = originalMDP
# vi = ValueIteration()
# vi.solve(mdp)
#
#
# # New threshold
# newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)
#
# mdp = newThresholdMDP
#
# rl = util.FixedRLAlgorithm( vi.pi )
# #rl = QLearningAlgorithm(mdp.actions, mdp.discount(), blackjackFeatureExtractor )
#
#
# trials = util.simulate(mdp, rl, 300, 1000, True, False )
#