% arXiv preprint: the identifier is stored in the eprint fields, not in a journal field.
@misc{zhai2023stabilizing,
  author        = {Zhai, Shuangfei and Likhomanenko, Tatiana and Littwin, Etai and Busbridge, Dan and Ramapuram, Jason and Zhang, Yizhe and Gu, Jiatao and Susskind, Josh},
  title         = {Stabilizing Transformer Training by Preventing Attention Entropy Collapse},
  year          = {2023},
  eprint        = {2303.06296},
  archiveprefix = {arXiv},
}