@inproceedings{tiapkin2026accelerating,
  author        = {Tiapkin, Daniil and Calandriello, Daniele and Belomestny, Denis and Moulines, {\'E}ric and Naumov, Alexey and Rasul, Kashif and Valko, Michal and M{\'e}nard, Pierre},
  title         = {Proximal Point {Nash} Learning from Human Feedback},
  booktitle     = {Conference on Learning Theory},
  year          = {2026},
  url           = {https://misovalko.github.io/publications/tiapkin2026accelerating.pdf},
  archiveprefix = {arXiv},
  eprint        = {2505.19731},
  abstract      = {Traditional Reinforcement Learning from Human Feedback (RLHF) often relies on reward models, frequently assuming preference structures like the Bradley-Terry model, which may not accurately capture the complexities of real human preferences (e.g., intransitivity). Nash Learning from Human Feedback (NLHF) offers a more direct alternative by framing the problem as finding a Nash equilibrium of a game defined by these preferences. While many works study the Nash learning problem directly in the policy space, we instead consider it under a more realistic policy parametrization setting. We first analyze a simple self-play policy gradient method, which is equivalent to Online IPO. We establish high-probability last-iterate convergence guarantees for this method, but our analysis also reveals a possible stability limitation of the underlying dynamics. Motivated by this, we embed the self-play updates into a proximal point framework, yielding a stabilized algorithm. For this combined method, we prove high-probability last-iterate convergence and discuss its more practical version, which we call Nash Prox. Finally, we apply this method to post-training of large language models and validate its empirical performance.},
}
