% Zhai et al. (2023), arXiv preprint 2303.06296.
% Authors are given in "Last, First" form and joined with " and ", the only
% name separator BibTeX recognises; {Transformer} is braced so styles that
% sentence-case titles keep its capital.
@article{zhai2023stabilizing,
  author  = {Zhai, Shuangfei and Likhomanenko, Tatiana and Littwin, Etai and Busbridge, Dan and Ramapuram, Jason and Zhang, Yizhe and Gu, Jiatao and Susskind, Josh},
  title   = {Stabilizing {Transformer} Training by Preventing Attention Entropy Collapse},
  journal = {arXiv preprint arXiv:2303.06296},
  year    = {2023},
  url     = {https://arxiv.org/abs/2303.06296},
}