bib-lecture.bib

@incollection{scarf1959optimality,
  author    = {Scarf, Herbert},
  title     = {The Optimality of (s, {S}) Policies in the Dynamic Inventory Problem},
  editor    = {Arrow, Kenneth J. and Karlin, Samuel and Suppes, Patrick},
  booktitle = {Mathematical Methods in the Social Sciences},
  chapter   = {13},
  pages     = {196--202},
  publisher = {Stanford University Press},
  address   = {Stanford, CA},
  year      = {1959},
}

@article{chen2021decision,
  author  = {Chen, Lili and Lu, Kevin and Rajeswaran, Aravind and Lee, Kimin and Grover, Aditya and Laskin, Misha and Abbeel, Pieter and Srinivas, Aravind and Mordatch, Igor},
  title   = {{Decision Transformer}: Reinforcement learning via sequence modeling},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {15084--15097},
  year    = {2021},
}

@book{altman2021constrained,
  author    = {Altman, Eitan},
  title     = {Constrained {Markov} decision processes},
  publisher = {Routledge},
  year      = {2021},
}

@comment{ng2000algorithms: metadata corrected to agree with the NgR00 entry for the same paper; both keys are kept so existing citations keep working.}
@inproceedings{ng2000algorithms,
  author    = {Ng, Andrew Y. and Russell, Stuart J.},
  title     = {Algorithms for Inverse Reinforcement Learning},
  booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning ({ICML} 2000), Stanford University, Stanford, CA, USA, June 29 - July 2, 2000},
  pages     = {663--670},
  year      = {2000},
}

@comment{khalil1996robust: key kept for existing citations; the authors of this book are Zhou, Doyle and Glover.}
@book{khalil1996robust,
  author    = {Zhou, Kemin and Doyle, John C. and Glover, Keith},
  title     = {Robust and Optimal Control},
  publisher = {Prentice Hall},
  year      = {1996},
}

@article{mannor2013algorithmic,
  author    = {Mannor, Shie and Tsitsiklis, John N},
  title     = {Algorithmic aspects of mean--variance optimization in {Markov} decision processes},
  journal   = {European Journal of Operational Research},
  volume    = {231},
  number    = {3},
  pages     = {645--653},
  year      = {2013},
  publisher = {Elsevier},
}

@article{mannor2004geometric,
  author    = {Mannor, Shie and Shimkin, Nahum},
  title     = {A geometric approach to multi-criterion reinforcement learning},
  journal   = {Journal of Machine Learning Research},
  volume    = {5},
  pages     = {325--360},
  year      = {2004},
  publisher = {JMLR.org},
}

@article{kaufmann2023champion,
  author    = {Kaufmann, Elia and Bauersfeld, Leonard and Loquercio, Antonio and M{\"u}ller, Matthias and Koltun, Vladlen and Scaramuzza, Davide},
  title     = {Champion-level drone racing using deep reinforcement learning},
  journal   = {Nature},
  volume    = {620},
  number    = {7976},
  pages     = {982--987},
  year      = {2023},
  publisher = {Nature Publishing Group},
  address   = {London, UK},
}

@article{vinyals2019grandmaster,
  author    = {Vinyals, Oriol and Babuschkin, Igor and Czarnecki, Wojciech M and Mathieu, Micha{\"e}l and Dudzik, Andrew and Chung, Junyoung and Choi, David H and Powell, Richard and Ewalds, Timo and Georgiev, Petko and others},
  title     = {Grandmaster level in {StarCraft II} using multi-agent reinforcement learning},
  journal   = {Nature},
  volume    = {575},
  number    = {7782},
  pages     = {350--354},
  year      = {2019},
  publisher = {Nature Publishing Group},
}

@book{russell2016artificial,
  author    = {Russell, Stuart J and Norvig, Peter},
  title     = {Artificial intelligence: a modern approach},
  publisher = {Pearson},
  year      = {2016},
}

@book{powell2007approximate,
  author    = {Powell, Warren B},
  title     = {Approximate Dynamic Programming: Solving the curses of dimensionality},
  volume    = {703},
  publisher = {John Wiley \& Sons},
  year      = {2007},
}

@article{bemporad1999control,
  author    = {Bemporad, Alberto and Morari, Manfred},
  title     = {Control of systems integrating logic, dynamics, and constraints},
  journal   = {Automatica},
  volume    = {35},
  number    = {3},
  pages     = {407--427},
  year      = {1999},
  publisher = {Elsevier},
}

@book{kirk2004optimal,
  author    = {Kirk, Donald E},
  title     = {Optimal control theory: an introduction},
  publisher = {Courier Corporation},
  year      = {2004},
}

@inproceedings{zhang2023planning,
  author    = {Zhang, Shun and Chen, Zhenfang and Shen, Yikang and Ding, Mingyu and Tenenbaum, Joshua B. and Gan, Chuang},
  title     = {Planning with Large Language Models for Code Generation},
  booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
  year      = {2023},
}

@article{ouyang2022training,
  author  = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  title   = {Training language models to follow instructions with human feedback},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {27730--27744},
  year    = {2022},
}

@article{levine2016end,
  author  = {Levine, Sergey and Finn, Chelsea and Darrell, Trevor and Abbeel, Pieter},
  title   = {End-to-end training of deep visuomotor policies},
  journal = {Journal of Machine Learning Research},
  volume  = {17},
  number  = {39},
  pages   = {1--40},
  year    = {2016},
}

@article{taylor2009transfer,
  author  = {Taylor, Matthew E and Stone, Peter},
  title   = {Transfer learning for reinforcement learning domains: A survey},
  journal = {Journal of Machine Learning Research},
  volume  = {10},
  number  = {7},
  pages   = {1633--1685},
  year    = {2009},
}

@article{khetarpal2022towards,
  author  = {Khetarpal, Khimya and Riemer, Matthew and Rish, Irina and Precup, Doina},
  title   = {Towards continual reinforcement learning: A review and perspectives},
  journal = {Journal of Artificial Intelligence Research},
  volume  = {75},
  pages   = {1401--1476},
  year    = {2022},
}

@article{nilim2005robust,
  author    = {Nilim, Arnab and El Ghaoui, Laurent},
  title     = {Robust control of {Markov} decision processes with uncertain transition matrices},
  journal   = {Operations Research},
  volume    = {53},
  number    = {5},
  pages     = {780--798},
  year      = {2005},
  publisher = {INFORMS},
}

@inproceedings{tobin2017domain,
  author       = {Tobin, Josh and Fong, Rachel and Ray, Alex and Schneider, Jonas and Zaremba, Wojciech and Abbeel, Pieter},
  title        = {Domain randomization for transferring deep neural networks from simulation to the real world},
  booktitle    = {2017 {IEEE/RSJ} International Conference on Intelligent Robots and Systems ({IROS})},
  pages        = {23--30},
  year         = {2017},
  organization = {IEEE},
}

@article{kirk2023survey,
  author  = {Kirk, Robert and Zhang, Amy and Grefenstette, Edward and Rockt{\"a}schel, Tim},
  title   = {A survey of zero-shot generalisation in deep reinforcement learning},
  journal = {Journal of Artificial Intelligence Research},
  volume  = {76},
  pages   = {201--264},
  year    = {2023},
}

@article{ghavamzadeh2015bayesian,
  author    = {Ghavamzadeh, Mohammad and Mannor, Shie and Pineau, Joelle and Tamar, Aviv},
  title     = {{Bayesian} reinforcement learning: A survey},
  journal   = {Foundations and Trends{\textregistered} in Machine Learning},
  volume    = {8},
  number    = {5--6},
  pages     = {359--483},
  year      = {2015},
  publisher = {Now Publishers, Inc.},
}

@article{beck2023survey,
  author  = {Beck, Jacob and Vuorio, Risto and Liu, Evan Zheran and Xiong, Zheng and Zintgraf, Luisa and Finn, Chelsea and Whiteson, Shimon},
  title   = {A survey of meta-reinforcement learning},
  journal = {arXiv preprint arXiv:2301.08028},
  year    = {2023},
}

@article{hallak2015contextual,
  author  = {Hallak, Assaf and Di Castro, Dotan and Mannor, Shie},
  title   = {Contextual {Markov} decision processes},
  journal = {arXiv preprint arXiv:1502.02259},
  year    = {2015},
}

@inproceedings{tamar2022regularization,
  author    = {Tamar, Aviv and Soudry, Daniel and Zisselman, Ev},
  title     = {Regularization guarantees generalization in {Bayesian} reinforcement learning through algorithmic stability},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {36},
  number    = {8},
  pages     = {8423--8431},
  year      = {2022},
}

@article{srivastava2019training,
  author  = {Srivastava, Rupesh Kumar and Shyam, Pranav and Mutz, Filipe and Ja{\'s}kowski, Wojciech and Schmidhuber, J{\"u}rgen},
  title   = {Training agents using upside-down reinforcement learning},
  journal = {arXiv preprint arXiv:1912.02877},
  year    = {2019},
}

@inproceedings{kaelbling1993learning,
  author    = {Kaelbling, Leslie Pack},
  title     = {Learning to achieve goals},
  booktitle = {Proceedings of the Thirteenth International Joint Conference on Artificial Intelligence ({IJCAI})},
  volume    = {2},
  pages     = {1094--1098},
  year      = {1993},
}

@article{lake2017building,
  author    = {Lake, Brenden M and Ullman, Tomer D and Tenenbaum, Joshua B and Gershman, Samuel J},
  title     = {Building machines that learn and think like people},
  journal   = {Behavioral and Brain Sciences},
  volume    = {40},
  pages     = {e253},
  year      = {2017},
  publisher = {Cambridge University Press},
}

@book{astrom2008adaptive,
  author    = {{\AA}str{\"o}m, Karl J. and Wittenmark, Bj{\"o}rn},
  title     = {Adaptive Control},
  series    = {Dover Books on Electrical Engineering},
  publisher = {Dover Publications},
  year      = {2008},
  isbn      = {9780486462783},
  lccn      = {2008007690},
  url       = {https://books.google.co.il/books?id=L0m_CR-IK24C},
}


@article{sutton1992reinforcement,
  author    = {Sutton, Richard S and Barto, Andrew G and Williams, Ronald J},
  title     = {Reinforcement learning is direct adaptive optimal control},
  journal   = {{IEEE} Control Systems Magazine},
  volume    = {12},
  number    = {2},
  pages     = {19--22},
  year      = {1992},
  publisher = {IEEE},
}

@article{dijkstra1959note,
  author    = {Dijkstra, Edsger W.},
  title     = {A note on two problems in connexion with graphs},
  journal   = {Numerische Mathematik},
  volume    = {1},
  number    = {1},
  pages     = {269--271},
  year      = {1959},
  publisher = {Springer},
}

@book{hirsch2013differential,
  author    = {Hirsch, Morris W and Smale, Stephen and Devaney, Robert L},
  title     = {Differential equations, dynamical systems, and an introduction to chaos},
  publisher = {Academic Press},
  year      = {2013},
}

@book{khalil2002nonlinear,
  author    = {Khalil, Hassan K.},
  title     = {Nonlinear Systems},
  series    = {Pearson Education},
  publisher = {Prentice Hall},
  year      = {2002},
  isbn      = {9780130673893},
  lccn      = {95045804},
  url       = {https://books.google.co.il/books?id=t_d1QgAACAAJ},
}


@incollection{latif2014banach,
  author    = {Latif, Abdul},
  title     = {{Banach} contraction principle and its generalizations},
  booktitle = {Topics in Fixed Point Theory},
  pages     = {33--64},
  publisher = {Springer},
  year      = {2014},
}

@book{JacobsonMayne1970,
  author    = {Jacobson, David H. and Mayne, David Q.},
  title     = {Differential Dynamic Programming},
  publisher = {American Elsevier Publishing Company},
  address   = {New York},
  year      = {1970},
}

@inproceedings{todorov2005generalized,
  author       = {Todorov, Emanuel and Li, Weiwei},
  title        = {A generalized iterative {LQG} method for locally-optimal feedback control of constrained nonlinear stochastic systems},
  booktitle    = {Proceedings of the 2005 {American Control Conference}},
  pages        = {300--306},
  year         = {2005},
  organization = {IEEE},
}

@article{viterbi1967error,
  author    = {Viterbi, Andrew},
  title     = {Error bounds for convolutional codes and an asymptotically optimum decoding algorithm},
  journal   = {{IEEE} Transactions on Information Theory},
  volume    = {13},
  number    = {2},
  pages     = {260--269},
  year      = {1967},
  publisher = {IEEE},
}

@article{hart1968formal,
  author    = {Hart, Peter E and Nilsson, Nils J and Raphael, Bertram},
  title     = {A formal basis for the heuristic determination of minimum cost paths},
  journal   = {{IEEE} Transactions on Systems Science and Cybernetics},
  volume    = {4},
  number    = {2},
  pages     = {100--107},
  year      = {1968},
  publisher = {IEEE},
}

@inproceedings{ng1999policy,
  author    = {Ng, Andrew Y and Harada, Daishi and Russell, Stuart},
  title     = {Policy invariance under reward transformations: Theory and application to reward shaping},
  booktitle = {International Conference on Machine Learning},
  volume    = {99},
  pages     = {278--287},
  year      = {1999},
}

@article{greensmith2004variance,
  author  = {Greensmith, Evan and Bartlett, Peter L and Baxter, Jonathan},
  title   = {Variance Reduction Techniques for Gradient Estimates in Reinforcement Learning},
  journal = {Journal of Machine Learning Research},
  volume  = {5},
  number  = {9},
  pages   = {1471--1530},
  year    = {2004},
}

@inproceedings{scherrer2014local,
  author    = {Scherrer, Bruno and Geist, Matthieu},
  title     = {Local policy search in a convex space and conservative policy iteration as boosted policy search},
  booktitle = {Machine Learning and Knowledge Discovery in Databases: European Conference, {ECML PKDD} 2014, Nancy, France, September 15--19, 2014, Proceedings, Part {III}},
  pages     = {35--50},
  year      = {2014},
  publisher = {Springer},
}

@article{schulman2017proximal,
  author  = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title   = {Proximal policy optimization algorithms},
  journal = {arXiv preprint arXiv:1707.06347},
  year    = {2017},
}

@article{KearnsS02,
  author    = {Michael J. Kearns and Satinder P. Singh},
  title     = {Near-Optimal Reinforcement Learning in Polynomial Time},
  journal   = {Machine Learning},
  volume    = {49},
  number    = {2-3},
  pages     = {209--232},
  year      = {2002},
  doi       = {10.1023/A:1017984413808},
  url       = {https://doi.org/10.1023/A:1017984413808},
  biburl    = {https://dblp.org/rec/bib/journals/ml/KearnsS02},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sun, 28 May 2017 13:18:23 +0200},
}

@book{kushner2003stochastic,
  author    = {Kushner, Harold J. and Yin, G. George},
  title     = {Stochastic Approximation and Recursive Algorithms and Applications},
  publisher = {Springer-Verlag},
  year      = {2003},
}

@book{borkar2009stochastic,
  author    = {Borkar, Vivek S},
  title     = {Stochastic approximation: a dynamical systems viewpoint},
  volume    = {48},
  publisher = {Springer},
  year      = {2009},
}

@article{munos2007performance,
  author    = {Munos, R{\'e}mi},
  title     = {Performance bounds in {$L_p$}-norm for approximate value iteration},
  journal   = {{SIAM} Journal on Control and Optimization},
  volume    = {46},
  number    = {2},
  pages     = {541--561},
  year      = {2007},
  publisher = {SIAM},
}

@inproceedings{kearns2000bias,
  author    = {Kearns, Michael J and Singh, Satinder},
  title     = {Bias-Variance Error Bounds for Temporal Difference Updates},
  booktitle = {Proceedings of the Thirteenth Annual Conference on Computational Learning Theory ({COLT})},
  pages     = {142--147},
  year      = {2000},
}

@inproceedings{KearnsS98a,
  author    = {Michael J. Kearns and Satinder P. Singh},
  title     = {Finite-Sample Convergence Rates for {Q-Learning} and Indirect Algorithms},
  booktitle = {Advances in Neural Information Processing Systems 11, {[NIPS} Conference, Denver, Colorado, USA, November 30 - December 5, 1998]},
  pages     = {996--1002},
  year      = {1998},
}

@article{BrafmanT02,
  author    = {Ronen I. Brafman and Moshe Tennenholtz},
  title     = {{R-MAX} - {A} General Polynomial Time Algorithm for Near-Optimal Reinforcement Learning},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {213--231},
  year      = {2002},
  url       = {http://www.jmlr.org/papers/v3/brafman02a.html},
  biburl    = {https://dblp.org/rec/bib/journals/jmlr/BrafmanT02},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Thu, 05 Feb 2004 13:43:02 +0100},
}

@article{StrehlLL09,
  author    = {Alexander L. Strehl and Lihong Li and Michael L. Littman},
  title     = {Reinforcement Learning in Finite {MDPs}: {PAC} Analysis},
  journal   = {Journal of Machine Learning Research},
  volume    = {10},
  pages     = {2413--2444},
  year      = {2009},
  doi       = {10.1145/1577069.1755867},
  url       = {http://doi.acm.org/10.1145/1577069.1755867},
  biburl    = {https://dblp.org/rec/bib/journals/jmlr/StrehlLL09},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 13 Nov 2017 02:31:07 +0100},
}

@incollection{Li2012,
  author    = {Li, Lihong},
  title     = {Sample Complexity Bounds of Exploration},
  editor    = {Wiering, Marco and van Otterlo, Martijn},
  booktitle = {Reinforcement Learning: State-of-the-Art},
  pages     = {175--204},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2012},
  isbn      = {978-3-642-27645-3},
  doi       = {10.1007/978-3-642-27645-3_6},
  url       = {https://doi.org/10.1007/978-3-642-27645-3_6},
  abstract  = {Efficient exploration is widely recognized as a fundamental challenge inherent in reinforcement learning. Algorithms that explore efficiently converge faster to near-optimal policies. While heuristics techniques are popular in practice, they lack formal guarantees and may not work well in general. This chapter studies algorithms with polynomial sample complexity of exploration, both model-based and model-free ones, in a unified manner. These so-called PAC-MDP algorithms behave near-optimally except in a ``small'' number of steps with high probability. A new learning model known as KWIK is used to unify most existing model-based PAC-MDP algorithms for various subclasses of Markov decision processes. We also compare the sample-complexity framework to alternatives for formalizing exploration efficiency such as regret minimization and Bayes optimal solutions.},
}


@article{SinghS96,
  author    = {Satinder P. Singh and Richard S. Sutton},
  title     = {Reinforcement Learning with Replacing Eligibility Traces},
  journal   = {Machine Learning},
  volume    = {22},
  number    = {1-3},
  pages     = {123--158},
  year      = {1996},
  doi       = {10.1023/A:1018012322525},
  url       = {https://doi.org/10.1023/A:1018012322525},
  biburl    = {https://dblp.org/rec/bib/journals/ml/SinghS96},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sun, 28 May 2017 13:18:24 +0200},
}

@article{Even-DarM03,
  author    = {Eyal Even{-}Dar and Yishay Mansour},
  title     = {Learning Rates for {Q-learning}},
  journal   = {Journal of Machine Learning Research},
  volume    = {5},
  pages     = {1--25},
  year      = {2003},
  url       = {http://www.jmlr.org/papers/v5/evendar03a.html},
  biburl    = {https://dblp.org/rec/bib/journals/jmlr/Even-DarM03},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Thu, 05 Feb 2004 13:53:36 +0100},
}

@inproceedings{SeijenHWW09,
  author    = {Harm van Seijen and Hado van Hasselt and Shimon Whiteson and Marco A. Wiering},
  title     = {A theoretical and empirical analysis of {Expected Sarsa}},
  booktitle = {{IEEE} Symposium on Adaptive Dynamic Programming and Reinforcement Learning, {ADPRL} 2009, Nashville, TN, USA, March 31 - April 1, 2009},
  pages     = {177--184},
  year      = {2009},
}

@book{SuttonB98,
  author    = {Richard S. Sutton and Andrew G. Barto},
  title     = {Reinforcement learning - an introduction},
  series    = {Adaptive computation and machine learning},
  publisher = {{MIT} Press},
  year      = {1998},
  isbn      = {0262193981},
  url       = {http://www.worldcat.org/oclc/37293240},
  biburl    = {https://dblp.org/rec/bib/books/lib/SuttonB98},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Wed, 26 Apr 2017 17:48:08 +0200},
}

@article{Tesauro95,
  author    = {Gerald Tesauro},
  title     = {Temporal Difference Learning and {TD-Gammon}},
  journal   = {Communications of the {ACM}},
  volume    = {38},
  number    = {3},
  pages     = {58--68},
  year      = {1995},
  doi       = {10.1145/203330.203343},
  url       = {http://doi.acm.org/10.1145/203330.203343},
  biburl    = {https://dblp.org/rec/bib/journals/cacm/Tesauro95},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Tue, 07 Jun 2011 16:52:36 +0200},
}
@article{Tesauro02,
  author    = {Gerald Tesauro},
  title     = {Programming backgammon using self-teaching neural nets},
  journal   = {Artificial Intelligence},
  volume    = {134},
  number    = {1-2},
  pages     = {181--199},
  year      = {2002},
  doi       = {10.1016/S0004-3702(01)00110-2},
  url       = {https://doi.org/10.1016/S0004-3702(01)00110-2},
  biburl    = {https://dblp.org/rec/bib/journals/ai/Tesauro02},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sat, 27 May 2017 14:24:43 +0200},
}

@article{MnihKSRVBGRFOPB15,
  author    = {Volodymyr Mnih and Koray Kavukcuoglu and David Silver and
               Andrei A. Rusu and Joel Veness and Marc G. Bellemare and
               Alex Graves and Martin A. Riedmiller and Andreas Fidjeland and
               Georg Ostrovski and Stig Petersen and Charles Beattie and
               Amir Sadik and Ioannis Antonoglou and Helen King and
               Dharshan Kumaran and Daan Wierstra and Shane Legg and
               Demis Hassabis},
  title     = {Human-level control through deep reinforcement learning},
  journal   = {Nature},
  volume    = {518},
  number    = {7540},
  pages     = {529--533},
  year      = {2015},
  doi       = {10.1038/nature14236},
  url       = {https://doi.org/10.1038/nature14236},
  biburl    = {https://dblp.org/rec/bib/journals/nature/MnihKSRVBGRFOPB15},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sat, 20 May 2017 00:24:51 +0200},
}

@inproceedings{KohlS04,
  author    = {Nate Kohl and Peter Stone},
  title     = {Policy Gradient Reinforcement Learning for Fast Quadrupedal Locomotion},
  booktitle = {Proceedings of the 2004 {IEEE} International Conference on Robotics and Automation, {ICRA} 2004, April 26 - May 1, 2004, New Orleans, LA, {USA}},
  pages     = {2619--2624},
  year      = {2004},
}

@article{SilverHMGSDSAPL16,
  author    = {David Silver and Aja Huang and Chris J. Maddison and
               Arthur Guez and Laurent Sifre and George van den Driessche and
               Julian Schrittwieser and Ioannis Antonoglou and
               Vedavyas Panneershelvam and Marc Lanctot and Sander Dieleman and
               Dominik Grewe and John Nham and Nal Kalchbrenner and
               Ilya Sutskever and Timothy P. Lillicrap and Madeleine Leach and
               Koray Kavukcuoglu and Thore Graepel and Demis Hassabis},
  title     = {Mastering the game of {Go} with deep neural networks and tree search},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  year      = {2016},
  doi       = {10.1038/nature16961},
  url       = {https://doi.org/10.1038/nature16961},
  biburl    = {https://dblp.org/rec/bib/journals/nature/SilverHMGSDSAPL16},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sat, 20 May 2017 00:24:51 +0200},
}

@article{AbbeelCN2010,
  author     = {Abbeel, Pieter and Coates, Adam and Ng, Andrew Y.},
  title      = {Autonomous Helicopter Aerobatics Through Apprenticeship Learning},
  journal    = {Int. J. Rob. Res.},
  volume     = {29},
  number     = {13},
  pages      = {1608--1639},
  numpages   = {32},
  month      = nov,
  year       = {2010},
  issue_date = {November  2010},
  publisher  = {Sage Publications, Inc.},
  address    = {Thousand Oaks, CA, USA},
  issn       = {0278-3649},
  doi        = {10.1177/0278364910371999},
  url        = {http://dx.doi.org/10.1177/0278364910371999},
  acmid      = {1894944},
  keywords   = {Apprenticeship learning, autonomous flight, autonomous helicopter, helicopter aerobatics, learning from demonstrations},
}

@inproceedings{KearnsMN99,
  author    = {Michael J. Kearns and Yishay Mansour and Andrew Y. Ng},
  title     = {Approximate Planning in Large {POMDPs} via Reusable Trajectories},
  booktitle = {Advances in Neural Information Processing Systems 12, {[NIPS} Conference, Denver, Colorado, USA, November 29 - December 4, 1999]},
  pages     = {1001--1007},
  year      = {1999},
  crossref  = {DBLP:conf/nips/1999},
  url       = {http://papers.nips.cc/paper/1664-approximate-planning-in-large-pomdps-via-reusable-trajectories},
  biburl    = {https://dblp.org/rec/bib/conf/nips/KearnsMN99},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Thu, 11 Dec 2014 17:34:08 +0100},
}

@article{KearnsMN02,
  author    = {Michael J. Kearns and Yishay Mansour and Andrew Y. Ng},
  title     = {A Sparse Sampling Algorithm for Near-Optimal Planning in Large {Markov} Decision Processes},
  journal   = {Machine Learning},
  volume    = {49},
  number    = {2-3},
  pages     = {193--208},
  year      = {2002},
  doi       = {10.1023/A:1017932429737},
  url       = {https://doi.org/10.1023/A:1017932429737},
  biburl    = {https://dblp.org/rec/bib/journals/ml/KearnsMN02},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Sun, 28 May 2017 13:18:23 +0200},
}

@inproceedings{PieterN04,
  author    = {Pieter Abbeel and Andrew Y. Ng},
  title     = {Apprenticeship learning via inverse reinforcement learning},
  booktitle = {Machine Learning, Proceedings of the Twenty-first International Conference {(ICML} 2004), Banff, Alberta, Canada, July 4-8, 2004},
  year      = {2004},
}

@inproceedings{NgR00,
  author    = {Andrew Y. Ng and Stuart J. Russell},
  title     = {Algorithms for Inverse Reinforcement Learning},
  booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning {(ICML} 2000), Stanford University, Stanford, CA, USA, June 29 - July 2, 2000},
  pages     = {663--670},
  year      = {2000},
}

@article{TsitsiklisVR97,
  author  = {Tsitsiklis, John N. and Van Roy, Benjamin},
  title   = {An Analysis of Temporal-Difference Learning with Function Approximation},
  journal = {{IEEE} Transactions on Automatic Control},
  volume  = {42},
  number  = {5},
  pages   = {674--690},
  year    = {1997},
}

@book{BookCormenLRS2009,
  author    = {Thomas H. Cormen and Charles E. Leiserson and Ronald L. Rivest and Clifford Stein},
  title     = {Introduction to Algorithms},
  edition   = {Third},
  publisher = {{MIT} Press},
  year      = {2009},
  url       = {http://mitpress.mit.edu/books/introduction-algorithms},
}


@article{kearns2002sparse,
  author    = {Kearns, Michael and Mansour, Yishay and Ng, Andrew Y},
  title     = {A sparse sampling algorithm for near-optimal planning in large {Markov} decision processes},
  journal   = {Machine Learning},
  volume    = {49},
  number    = {2-3},
  pages     = {193--208},
  year      = {2002},
  publisher = {Springer},
}

@inproceedings{kocsis2006bandit,
  author    = {Kocsis, Levente and Szepesv{\'a}ri, Csaba},
  title     = {Bandit based {Monte-Carlo} planning},
  booktitle = {European Conference on Machine Learning ({ECML})},
  pages     = {282--293},
  year      = {2006},
  publisher = {Springer},
}

@incollection{gordon1995stable,
  author    = {Gordon, Geoffrey J},
  title     = {Stable function approximation in dynamic programming},
  booktitle = {Machine Learning Proceedings 1995},
  pages     = {261--268},
  publisher = {Elsevier},
  year      = {1995},
}

@article{silver2016mastering,
  author    = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
  title     = {Mastering the game of {Go} with deep neural networks and tree search},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  year      = {2016},
  publisher = {Nature Publishing Group},
}

@article{silver2017mastering,
  author    = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and Guez, Arthur and Hubert, Thomas and Baker, Lucas and Lai, Matthew and Bolton, Adrian and others},
  title     = {Mastering the game of {Go} without human knowledge},
  journal   = {Nature},
  volume    = {550},
  number    = {7676},
  pages     = {354--359},
  year      = {2017},
  publisher = {Nature Publishing Group},
}

@article{mnih2015human,
  author    = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and others},
  title     = {Human-level control through deep reinforcement learning},
  journal   = {Nature},
  volume    = {518},
  number    = {7540},
  pages     = {529--533},
  year      = {2015},
  publisher = {Nature Publishing Group},
}

@article{Samuel62,
  author    = {Arthur L. Samuel},
  title     = {Artificial intelligence - a frontier of automation},
  journal   = {Elektron. Rechenanlagen},
  volume    = {4},
  number    = {4},
  pages     = {173--177},
  year      = {1962},
  doi       = {10.1524/itit.1962.4.16.173},
  url       = {https://doi.org/10.1524/itit.1962.4.16.173},
  biburl    = {https://dblp.org/rec/journals/it/Samuel62.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 18 May 2020 12:40:49 +0200},
}
@article{DeepBlue,
  author   = {Campbell, Murray and Hoane, A. Joseph and Hsu, Feng-hsiung},
  title    = {{Deep Blue}},
  journal  = {Artificial Intelligence},
  volume   = {134},
  number   = {1},
  pages    = {57--83},
  year     = {2002},
  issn     = {0004-3702},
  doi      = {10.1016/S0004-3702(01)00129-1},
  url      = {https://www.sciencedirect.com/science/article/pii/S0004370201001291},
  keywords = {Computer chess, Game tree search, Parallel search, Selective search, Search extensions, Evaluation function},
  abstract = {Deep Blue is the chess machine that defeated then-reigning World Chess Champion Garry Kasparov in a six-game match in 1997. There were a number of factors that contributed to this success, including: •a single-chip chess search engine,•a massively parallel system with multiple levels of parallelism,•a strong emphasis on search extensions,•a complex evaluation function, and•effective use of a Grandmaster game database. This paper describes the Deep Blue system, and gives some of the rationale that went into the design decisions behind Deep Blue.},
}

@article{Karp78,
  author    = {Richard M. Karp},
  title     = {A characterization of the minimum cycle mean in a digraph},
  journal   = {Discret. Math.},
  volume    = {23},
  number    = {3},
  pages     = {309--311},
  year      = {1978},
  doi       = {10.1016/0012-365X(78)90011-0},
  url       = {https://doi.org/10.1016/0012-365X(78)90011-0},
  biburl    = {https://dblp.org/rec/journals/dm/Karp78.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Fri, 12 Feb 2021 13:44:46 +0100},
}

@article{ChaturvediM17,
  author    = {Mmanu Chaturvedi and Ross M. McConnell},
  title     = {A note on finding minimum mean cycle},
  journal   = {Inf. Process. Lett.},
  volume    = {127},
  pages     = {21--22},
  year      = {2017},
  doi       = {10.1016/j.ipl.2017.06.007},
  url       = {https://doi.org/10.1016/j.ipl.2017.06.007},
  biburl    = {https://dblp.org/rec/journals/ipl/ChaturvediM17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Tue, 12 Sep 2017 17:58:15 +0200},
}

@book{cormen2009introduction,
  author    = {Cormen, Thomas H and Leiserson, Charles E and Rivest, Ronald L and Stein, Clifford},
  title     = {Introduction to algorithms},
  publisher = {{MIT} Press},
  year      = {2009},
}

@book{KleinbergTardos06,
  author    = {Kleinberg, Jon and Tardos, {\'E}va},
  title     = {Algorithm Design},
  publisher = {Addison-Wesley},
  year      = {2006},
}
@book{DasguptaPapadimitriouVazirani08,
  author    = {Sanjoy Dasgupta and Christos H. Papadimitriou and Umesh V. Vazirani},
  title     = {Algorithms},
  publisher = {McGraw-Hill},
  year      = {2008},
}

@book{van1996weak,
  author    = {van der Vaart, Aad W. and Wellner, Jon A.},
  title     = {Weak Convergence and Empirical Processes: With Applications to Statistics},
  series    = {Springer Series in Statistics},
  publisher = {Springer},
  year      = {1996},
  isbn      = {9780387946405},
  lccn      = {95049099},
  url       = {https://books.google.fr/books?id=seH8dMrEgggC},
}


@book{Howard1960,
  author    = {Howard, R. A.},
  title     = {Dynamic Programming and {Markov} Processes},
  publisher = {MIT Press},
  year      = {1960},
}

@book{Bellman:DynamicProgramming,
  author    = {Bellman, Richard},
  title     = {Dynamic Programming},
  publisher = {Dover Publications},
  year      = {1957},
}

@article{Bellman54,
  author  = {Bellman, Richard},
  title   = {The theory of dynamic programming},
  journal = {Bulletin of the American Mathematical Society},
  volume  = {60},
  number  = {6},
  pages   = {503--515},
  year    = {1954},
  annote  = {Defines value iteration?},
}

@article{Shapley53,
  author    = {Shapley, L. S.},
  title     = {Stochastic games},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {39},
  number    = {10},
  pages     = {1095--1100},
  year      = {1953}
}

@incollection{Zhang2021,
  author    = {Zhang, Kaiqing and Yang, Zhuoran and Ba{\c{s}}ar, Tamer},
  editor    = {Vamvoudakis, Kyriakos G. and Wan, Yan and Lewis, Frank L. and Cansever, Derya},
  title     = {Multi-Agent Reinforcement Learning: A Selective Overview of Theories and Algorithms},
  booktitle = {Handbook of Reinforcement Learning and Control},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {321--384},
  year      = {2021}
}

@book{puterman2014markov,
  author    = {Martin L Puterman},
  title     = {Markov decision processes: discrete stochastic dynamic programming},
  year      = {2014},
  publisher = {John Wiley \& Sons}
}

@book{BertsekasTsitsiklis96,
  author    = {Bertsekas, Dimitri P. and Tsitsiklis, John N.},
  title     = {Neuro-Dynamic Programming},
  publisher = {Athena Scientific},
  year      = {1996}
}

@book{Bertsekas05,
  author    = {Dimitri P. Bertsekas},
  title     = {Dynamic Programming and Optimal Control},
  edition   = {Third},
  publisher = {Athena Scientific},
  year      = {2005}
}

@book{Szepesvari,
  author = {Szepesv{\'{a}}ri, Csaba},
  title = {Algorithms for Reinforcement Learning},
  year = {2010},
  publisher = {Morgan {\&} Claypool Publishers},
  series = {Synthesis Lectures on Artificial Intelligence and Machine Learning}
}


@comment{Papers for policy and value iteration discounted, chapter 5 discounted}

@inproceedings{LittmanDK95,
  author       = {Michael L. Littman and
                  Thomas L. Dean and
                  Leslie Pack Kaelbling},
  title        = {On the Complexity of Solving {Markov} Decision Problems},
  booktitle    = {Conference on Uncertainty in Artificial Intelligence (UAI)},
  pages        = {394--402},
  publisher    = {Morgan Kaufmann},
  year         = {1995}
}

@article{MelekopoglouC94,
  author       = {Mary Melekopoglou and
                  Anne Condon},
  title        = {On the Complexity of the Policy Improvement Algorithm for {Markov} Decision
                  Processes},
  journal      = {{INFORMS} J. Comput.},
  volume       = {6},
  number       = {2},
  pages        = {188--192},
  year         = {1994}
}

@inproceedings{MansourS99,
  author = {Mansour, Yishay and Singh, Satinder},
  title = {On the Complexity of Policy Iteration},
  booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI)},
  year = {1999},
  pages = {401--408}
}

@inproceedings{Fearnley10,
  author       = {John Fearnley},
  title        = {Exponential Lower Bounds for Policy Iteration},
  booktitle    = {Automata, Languages and Programming (ICALP)},
  series       = {Lecture Notes in Computer Science},
  volume       = {6199},
  pages        = {551--562},
  publisher    = {Springer},
  year         = {2010}
}

@inproceedings{HollandersDJ12,
  author       = {Romain Hollanders and
                  Jean{-}Charles Delvenne and
                  Rapha{\"{e}}l M. Jungers},
  title        = {The complexity of Policy Iteration is exponential for discounted {Markov}
                  Decision Processes},
  booktitle    = {Proceedings of the 51st {IEEE} Conference on Decision and Control (CDC)},
  pages        = {5997--6002},
  year         = {2012}
}

@article{Ye11,
  author       = {Yinyu Ye},
  title        = {The Simplex and Policy-Iteration Methods Are Strongly Polynomial for the {Markov} Decision Problem with a Fixed Discount Rate},
  journal      = {Math. Oper. Res.},
  volume       = {36},
  number       = {4},
  pages        = {593--603},
  year         = {2011}
}

@article{HansenMZ13,
  author = {Hansen, Thomas Dueholm and Miltersen, Peter Bro and Zwick, Uri},
  title = {Strategy Iteration Is Strongly Polynomial for 2-Player Turn-Based Stochastic Games with a Constant Discount Factor},
  journal = {J. {ACM}},
  year = {2013},
  volume = {60},
  number = {1},
  pages = {1:1--1:16}
}

@article{MadaniTZ10,
  author       = {Omid Madani and Mikkel Thorup and Uri Zwick},
  title        = {Discounted deterministic {Markov} decision processes and discounted all-pairs shortest paths},
  journal      = {{ACM} Trans. Algorithms},
  volume       = {6},
  number       = {2},
  pages        = {33:1--33:25},
  year         = {2010}
}

@inproceedings{PostY13,
  author       = {Ian Post and Yinyu Ye},
  title        = {The simplex method is strongly polynomial for deterministic {Markov} decision processes},
  booktitle    = {Symposium on Discrete Algorithms (SODA)},
  pages        = {1465--1473},
  publisher    = {{SIAM}},
  year         = {2013}
}

@article{blackwell1965discounted,
  author    = {Blackwell, David},
  title     = {Discounted dynamic programming},
  journal   = {The Annals of Mathematical Statistics},
  volume    = {36},
  number    = {1},
  pages     = {226--235},
  year      = {1965},
  publisher = {Institute of Mathematical Statistics}
}


@article{d1963probabilistic,
  author    = {d'Epenoux, Fran{\c{c}}ois},
  title     = {A probabilistic production and inventory problem},
  journal   = {Management Science},
  volume    = {10},
  number    = {1},
  pages     = {98--108},
  year      = {1963},
  publisher = {INFORMS}
}

@article{manne1960linear,
  author    = {Alan S Manne},
  title     = {Linear programming and sequential decisions},
  journal   = {Management Science},
  year      = {1960},
  volume    = {6},
  number    = {3},
  pages     = {259--267},
  publisher = {INFORMS}
}

@comment{Generative model}

@article{AzarMK13,
  author = {Azar, Mohammad Gheshlaghi and Munos, R{\'{e}}mi and Kappen, Hilbert J.},
  title = {Minimax {PAC} bounds on the sample complexity of reinforcement learning with a generative model},
  journal = {Mach. Learn.},
  year = {2013},
  volume = {91},
  number = {3},
  pages = {325--349}
}

@inproceedings{DannB15,
  author = {Dann, Christoph and Brunskill, Emma},
  title = {Sample Complexity of Episodic Fixed-Horizon Reinforcement Learning},
  year = {2015},
  booktitle = {Neural Information Processing Systems (NeurIPS)}
}

@inproceedings{DannLB17,
  author = {Dann, Christoph and Lattimore, Tor and Brunskill, Emma},
  title = {Unifying {PAC} and Regret: Uniform {PAC} Bounds for Episodic Reinforcement Learning},
  year = {2017},
  booktitle = {Neural Information Processing Systems (NeurIPS)}
}

@inproceedings{AgarwalKY20,
  author = {Agarwal, Alekh and Kakade, Sham M. and Yang, Lin F.},
  title = {Model-Based Reinforcement Learning with a Generative Model is Minimax Optimal},
  booktitle = {Conference on Learning Theory, {COLT}},
  editor = {Abernethy, Jacob D. and Agarwal, Shivani},
  year = {2020}
}

@phdthesis{Kakade2003,
  author = {Kakade, Sham},
  title  = {On the sample complexity of reinforcement learning},
  year   = {2003},
  school = {University College London}
}

@inproceedings{kakade2002approximately,
  author    = {Sham Kakade and John Langford},
  title     = {Approximately optimal approximate reinforcement learning},
  booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning},
  year      = {2002},
  pages     = {267--274}
}

@article{StrehlL08,
  author       = {Alexander L. Strehl and
                  Michael L. Littman},
  title        = {An analysis of model-based Interval Estimation for {Markov} Decision
                  Processes},
  journal      = {J. Comput. Syst. Sci.},
  volume       = {74},
  number       = {8},
  pages        = {1309--1331},
  year         = {2008}
}

@inproceedings{Fiechter94,
  author = {Fiechter, Claude{-}Nicolas},
  title = {Efficient Reinforcement Learning},
  year = {1994},
  booktitle = {Computational Learning Theory (COLT)}
}

@inproceedings{KaufmannMDJLV21,
  author = {Kaufmann, Emilie and M{\'{e}}nard, Pierre and Domingues, Omar Darwiche and Jonsson, Anders and Leurent, Edouard and Valko, Michal},
  title = {Adaptive Reward-Free Exploration},
  year = {2021},
  booktitle = {Algorithmic Learning Theory (ALT)}
}

@inproceedings{JinKSY20,
  author = {Jin, Chi and Krishnamurthy, Akshay and Simchowitz, Max and Yu, Tiancheng},
  title = {Reward-Free Exploration for Reinforcement Learning},
  year = {2020},
  booktitle = {International Conference on Machine Learning (ICML)}
}

@inproceedings{MenardDJKLV21,
  author = {M{\'{e}}nard, Pierre and Domingues, Omar Darwiche and Jonsson, Anders and Kaufmann, Emilie and Leurent, Edouard and Valko, Michal},
  title = {Fast active learning for pure exploration in reinforcement learning},
  year = {2021},
  booktitle = {International Conference on Machine Learning (ICML)}
}

@inproceedings{SzitaS10,
  author = {Szita, Istvan and Szepesv{\'{a}}ri, Csaba},
  title = {Model-based reinforcement learning with nearly tight exploration complexity bounds},
  year = {2010},
  booktitle = {International Conference on Machine Learning (ICML)}
}

@inproceedings{SzitaL09,
  author       = {Istvan Szita and
                  Andr{\'{a}}s L{\"{o}}rincz},
  editor       = {Andrea Pohoreckyj Danyluk and
                  L{\'{e}}on Bottou and
                  Michael L. Littman},
  title        = {Optimistic initialization and greediness lead to polynomial time learning
                  in factored {MDPs}},
  booktitle    = {International Conference on Machine Learning (ICML)},
  year         = {2009}
}

@article{LattimoreH14a,
  author       = {Tor Lattimore and
                  Marcus Hutter},
  title        = {Near-optimal {PAC} bounds for discounted {MDPs}},
  journal      = {Theor. Comput. Sci.},
  volume       = {558},
  pages        = {125--143},
  year         = {2014}
}

@comment{Model free}

@article{Sutton88,
  author = {Sutton, Richard S.},
  title = {Learning to Predict by the Methods of Temporal Differences},
  journal = {Mach. Learn.},
  year = {1988},
  volume = {3},
  pages = {9--44},
  comment = {Tabular TD(0)}
}

@article{WatkinsD92,
  author = {Watkins, Christopher J. C. H. and Dayan, Peter},
  title = {Q-Learning},
  journal = {Mach. Learn.},
  year = {1992},
  volume = {8},
  pages = {279--292},
  doi = {10.1007/BF00992698},
  url = {https://doi.org/10.1007/BF00992698},
  timestamp = {Fri, 27 Mar 2020 08:37:06 +0100},
  biburl = {https://dblp.org/rec/journals/ml/WatkinsD92.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{Dayan92,
  author       = {Peter Dayan},
  title        = {The Convergence of {TD}({$\lambda$}) for General {$\lambda$}},
  journal      = {Mach. Learn.},
  volume       = {8},
  pages        = {341--362},
  year         = {1992},
  url          = {https://doi.org/10.1007/BF00992701},
  doi          = {10.1007/BF00992701},
  timestamp    = {Fri, 27 Mar 2020 08:37:06 +0100},
  biburl       = {https://dblp.org/rec/journals/ml/Dayan92.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{JaakkolaJS94,
  author = {Jaakkola, Tommi S. and Jordan, Michael I. and Singh, Satinder P.},
  title = {On the Convergence of Stochastic Iterative Dynamic Programming Algorithms},
  journal = {Neural Comput.},
  year = {1994},
  volume = {6},
  number = {6},
  pages = {1185--1201},
  doi = {10.1162/neco.1994.6.6.1185},
  url = {https://doi.org/10.1162/neco.1994.6.6.1185},
  timestamp = {Tue, 01 Sep 2020 13:12:06 +0200},
  biburl = {https://dblp.org/rec/journals/neco/JaakkolaJS94.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{Tsitsiklis94,
  author = {Tsitsiklis, John N.},
  title = {Asynchronous Stochastic Approximation and {Q}-Learning},
  journal = {Mach. Learn.},
  year = {1994},
  volume = {16},
  number = {3},
  pages = {185--202},
  doi = {10.1007/BF00993306},
  url = {https://doi.org/10.1007/BF00993306},
  timestamp = {Mon, 02 Mar 2020 16:29:52 +0100},
  biburl = {https://dblp.org/rec/journals/ml/Tsitsiklis94.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@comment{Rummery, G. A., Niranjan, M. (1994). On-line Q-learning using connectionist systems. Technical
Report CUED/F-INFENG/TR 166. Engineering Department, Cambridge University.}


@article{SinghJLS00,
  author = {Singh, Satinder and Jaakkola, Tommi S. and Littman, Michael L. and Szepesv{\'{a}}ri, Csaba},
  title = {Convergence Results for Single-Step On-Policy Reinforcement-Learning Algorithms},
  journal = {Mach. Learn.},
  year = {2000},
  volume = {38},
  number = {3},
  pages = {287--308},
  doi = {10.1023/A:1007678930559},
  url = {https://doi.org/10.1023/A:1007678930559},
  timestamp = {Tue, 19 Apr 2022 16:03:27 +0200},
  biburl = {https://dblp.org/rec/journals/ml/SinghJLS00.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@comment{Watkins, C. J. C. H. (1989). Learning from Delayed Rewards. PhD thesis, University of
Cambridge. Introduced Q-learning}


@article{MetropolisU49,
  author       = {N. Metropolis and S. Ulam},
  title        = {The {Monte Carlo} method},
  journal      = {Journal of the American Statistical Association},
  volume       = {44},
  number       = {247},
  pages        = {335--341},
  year         = {1949}
}

@inproceedings{BartoD93,
  author       = {Andrew G. Barto and
                  Michael O. Duff},
  editor       = {Jack D. Cowan and
                  Gerald Tesauro and
                  Joshua Alspector},
  title        = {{Monte Carlo} Matrix Inversion and Reinforcement Learning},
  booktitle    = {Advances in Neural Information Processing Systems 6, [7th {NIPS} Conference,
                  Denver, Colorado, USA, 1993]},
  pages        = {687--694},
  publisher    = {Morgan Kaufmann},
  year         = {1993},
  url          = {http://papers.nips.cc/paper/865-monte-carlo-matrix-inversion-and-reinforcement-learning},
  timestamp    = {Mon, 16 May 2022 15:41:51 +0200},
  biburl       = {https://dblp.org/rec/conf/nips/BartoD93.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{DayanS94,
  author       = {Peter Dayan and
                  Terrence J. Sejnowski},
  title        = {{TD}({$\lambda$}) Converges with Probability 1},
  journal      = {Mach. Learn.},
  volume       = {14},
  number       = {1},
  pages        = {295--301},
  year         = {1994},
  url          = {https://doi.org/10.1023/A:1022657612745},
  doi          = {10.1023/A:1022657612745},
  timestamp    = {Mon, 02 Mar 2020 16:29:57 +0100},
  biburl       = {https://dblp.org/rec/journals/ml/DayanS94.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{Ljung77,
  author  = {Ljung, L.},
  title   = {Analysis of recursive stochastic algorithms},
  journal = {IEEE Transactions on Automatic Control},
  volume  = {22},
  number  = {4},
  pages   = {551--575},
  year    = {1977},
  doi     = {10.1109/TAC.1977.1101561}
}
  
  @book{LjungS83,
  author    = {Ljung, L. and S{\"o}derstr{\"o}m, T.},
  title     = {Theory and practice of recursive identification},
  publisher = {{MIT} Press},
  address   = {Cambridge, {MA}},
  year      = {1983},
  biburl    = {https://www.bibsonomy.org/bibtex/28fe58e8d3cf165e98e58f4d9764f4804/aude.hofleitner},
  groups    = {public},
  interhash = {399d9664ecf2316aa8d3e735a34755d7},
  intrahash = {8fe58e8d3cf165e98e58f4d9764f4804}
}

 @book{Kushner84,
  author    = {H. J. Kushner},
  title     = {Approximation and Weak Convergence Methods for Random Processes},
  publisher = {{MIT} Press},
  address   = {Cambridge, {MA}},
  year      = {1984}
}

@book{KushnerC78,
  author    = {Kushner, H. J. and Clark, D. S.},
  title     = {Stochastic Approximation Methods for Constrained and Unconstrained Systems},
  publisher = {Springer-Verlag},
  address   = {New York},
  year      = {1978}
}
  
  @article{RobbinsS51,
  author    = {Herbert Robbins and Sutton Monro},
  title     = {A Stochastic Approximation Method},
  journal   = {The Annals of Mathematical Statistics},
  volume    = {22},
  number    = {3},
  pages     = {400--407},
  year      = {1951},
  publisher = {Institute of Mathematical Statistics},
  doi       = {10.1214/aoms/1177729586},
  url       = {https://doi.org/10.1214/aoms/1177729586}
}

  @article{Blum54,
  author    = {Julius R. Blum},
  title     = {Multivariable stochastic approximation methods},
  journal   = {The Annals of Mathematical Statistics},
  volume    = {25},
  number    = {4},
  pages     = {737--744},
  year      = {1954},
  publisher = {Institute of Mathematical Statistics}
}

@inproceedings{SuttonMSM99,
  author       = {Richard S. Sutton and
                  David A. McAllester and
                  Satinder Singh and
                  Yishay Mansour},
  title        = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
  booktitle    = {Neural Information Processing Systems (NeurIPS)},
  pages        = {1057--1063},
  year         = {1999}
}

@article{Williams92,
  author = {Williams, Ronald J.},
  title = {Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning},
  journal = {Machine Learning},
  year = {1992},
  volume = {8},
  pages = {229--256}
}

@article{BaxterB01,
  author = {Baxter, Jonathan and Bartlett, Peter L.},
  title = {Infinite-Horizon Policy-Gradient Estimation},
  journal = {J. Artif. Intell. Res.},
  year = {2001},
  volume = {15},
  pages = {319--350}
}

@article{MarbachT03,
  author       = {Peter Marbach and
                  John N. Tsitsiklis},
  title        = {Approximate Gradient Methods in Policy-Space Optimization of {Markov}
                  Reward Processes},
  journal      = {Discret. Event Dyn. Syst.},
  volume       = {13},
  number       = {1-2},
  pages        = {111--148},
  year         = {2003}
}

@article{MarbachT01,
  author       = {Peter Marbach and
                  John N. Tsitsiklis},
  title        = {Simulation-based optimization of {Markov} reward processes},
  journal      = {{IEEE} Trans. Autom. Control.},
  volume       = {46},
  number       = {2},
  pages        = {191--209},
  year         = {2001}
}

@article{PhansalkarT95,
  author = {Phansalkar, Vijay V. and Thathachar, M. A. L.},
  title = {Local and Global Optimization Algorithms for Generalized Learning Automata},
  journal = {Neural Comput.},
  year = {1995},
  volume = {7},
  number = {5},
  pages = {950--973}
}

@misc{SilverClass,
  author       = {Silver, David},
  title        = {Reinforcement Learning: {UCL} Course},
  year         = {2015},
  howpublished = {https://www.davidsilver.uk/teaching/},
  url          = {https://www.davidsilver.uk/teaching/}
}

@book{Cesa-Bianchi-Lugosi-book,
  author = {Cesa{-}Bianchi, Nicol{\`{o}} and Lugosi, G{\'{a}}bor},
  title = {Prediction, learning, and games},
  year = {2006},
  publisher = {Cambridge University Press}
}

@article{Slivkins-book-19,
  author = {Slivkins, Aleksandrs},
  title = {Introduction to Multi-Armed Bandits},
  journal = {Found. Trends Mach. Learn.},
  year = {2019},
  volume = {12},
  number = {1-2},
  pages = {1--286}
}

@book{Lattimore-Csaba-book-2020,
  author    = {Tor Lattimore and Csaba Szepesv{\'a}ri},
  title     = {Bandit Algorithms},
  publisher = {Cambridge University Press},
  year      = {2020}
}

@article{Robbins52,
  author       = {Robbins, Herbert},
  title        = {Some aspects of the sequential design of experiments},
  journal      = {Bulletin of the American Mathematical Society},
  volume       = {58},
  number       = {5},
  pages        = {527--535},
  year         = {1952}
}

@article{LaiR85,
  author  = {Lai, Tze Leung and Robbins, Herbert},
  title   = {Asymptotically efficient adaptive allocation rules},
  journal = {Advances in Applied Mathematics},
  volume  = {6},
  number  = {1},
  pages   = {4--22},
  year    = {1985}
}

@article{AuerCF02,
  author = {Auer, Peter and Cesa{-}Bianchi, Nicol{\`{o}} and Fischer, Paul},
  title = {Finite-time Analysis of the Multiarmed Bandit Problem},
  journal = {Mach. Learn.},
  year = {2002},
  volume = {47},
  number = {2-3},
  pages = {235--256}
}

@article{Even-DarMM06,
  author = {Even{-}Dar, Eyal and Mannor, Shie and Mansour, Yishay},
  title = {Action Elimination and Stopping Conditions for the Multi-Armed Bandit and Reinforcement Learning Problems},
  journal = {J. Mach. Learn. Res.},
  year = {2006},
  volume = {7},
  pages = {1079--1105}
}

@misc{DavidSilver-course,
  author = {Silver, David},
  title  = {{UCL} Course on {RL}},
  url    = {https://www.davidsilver.uk/teaching/},
  note   = {https://www.davidsilver.uk/teaching/},
  year   = {2015}
}