Spaces:
Running
Running
% Conference paper. {\L} is the LaTeX stroke-L in the seventh author's given
% name; the brace group had been split across two lines, losing the macro.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2017},
}
% Book. "Python" is braced so sentence-casing styles keep the capital letter.
% edition is an ordinal word, which is what classic BibTeX styles expect.
@book{mckinney2017python,
  author    = {McKinney, Wes},
  title     = {{Python} for Data Analysis},
  publisher = {O'Reilly Media},
  address   = {Sebastopol, CA},
  edition   = {Second},
  year      = {2017},
  isbn      = {978-1491957660},
}
% Conference paper with page range and DOI; the url field is the resolver
% form of the same DOI.
@inproceedings{he2016resnet,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2016},
  pages     = {770--778},
  doi       = {10.1109/CVPR.2016.90},
  url       = {https://doi.org/10.1109/CVPR.2016.90}
}
% Journal article. month uses the bare three-letter macro so each style can
% expand it; "Go" is braced so sentence-casing styles do not lowercase it.
% "and others" is the literal BibTeX token for a truncated author list.
@article{silver2017mastering,
  author  = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and others},
  title   = {Mastering the Game of {Go} Without Human Knowledge},
  journal = {Nature},
  volume  = {550},
  number  = {7676},
  pages   = {354--359},
  year    = {2017},
  month   = oct,
  doi     = {10.1038/nature24270},
  url     = {https://www.nature.com/articles/nature24270},
}
% Technical report with a corporate author. The double braces around the
% author keep it as one indivisible name. "GPT-4" is braced so sentence-casing
% styles do not render it as "Gpt-4".
@techreport{openai2023gpt4,
  author        = {{OpenAI}},
  title         = {{GPT-4} Technical Report},
  institution   = {OpenAI},
  number        = {arXiv:2303.08774},
  year          = {2023},
  archivePrefix = {arXiv},
  eprint        = {2303.08774},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2303.08774},
}
% PhD thesis; school and address give the degree-granting institution.
% NOTE(review): the DOI value looks like a placeholder -- confirm before publishing.
@phdthesis{doe2020thesis,
  author  = {Doe, Jane},
  title   = {Learning Efficient Representations for Large-Scale Visual Recognition},
  school  = {Massachusetts Institute of Technology},
  address = {Cambridge, MA},
  year    = {2020},
  doi     = {10.5555/mit-2020-xyz}
}
% Chapter (title) inside a book (booktitle). edition is an ordinal word, which
% is what classic BibTeX styles expect; they append "edition" themselves.
@incollection{cover2006entropy,
  author    = {Cover, Thomas M. and Thomas, Joy A.},
  title     = {Entropy, Relative Entropy, and Mutual Information},
  booktitle = {Elements of Information Theory},
  publisher = {Wiley},
  address   = {Hoboken, NJ},
  edition   = {Second},
  year      = {2006},
  pages     = {13--55},
  isbn      = {978-0471241959},
}
% Dataset record. "ImageNet-21K" is braced so sentence-casing styles do not
% turn it into "Imagenet-21k". The access date is kept in note.
@misc{zenodo2021dataset,
  author       = {Smith, John and Lee, Alice and Kumar, Ravi},
  title        = {{ImageNet-21K} Subset (Version 2.0)},
  howpublished = {Dataset on Zenodo},
  year         = {2021},
  doi          = {10.5281/zenodo.1234567},
  url          = {https://doi.org/10.5281/zenodo.1234567},
  note         = {Accessed 2025-09-01},
}
% Software citation. {\"e} is the e-diaeresis in the second author's given
% name; the backslash was missing, which left a literal double quote.
% "scikit-learn" and "Python" are braced to survive style recasing.
@misc{sklearn2024,
  author       = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and others},
  title        = {{scikit-learn}: Machine Learning in {Python} (Version 1.4)},
  howpublished = {Software},
  year         = {2024},
  doi          = {10.5281/zenodo.592264},
  url          = {https://scikit-learn.org},
}
% Conference paper in a numbered proceedings series (series + volume).
% month uses the bare three-letter macro so each style can expand it.
@inproceedings{smith2024privacy,
  author    = {Smith, Emily and Zhang, Wei and Rossi, Marco and Patel, Neha},
  title     = {Privacy-Preserving Training with Low-Precision Secure Aggregation},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning},
  editor    = {Smith, A. and Johnson, B.},
  series    = {Proceedings of Machine Learning Research},
  volume    = {235},
  pages     = {12345--12367},
  address   = {Vienna, Austria},
  publisher = {PMLR},
  month     = jul,
  year      = {2024},
  url       = {https://proceedings.mlr.press/v235/},
}
% Conference paper. The venue is a conference, so it goes in booktitle of an
% inproceedings entry rather than in journal of an article entry.
% "Adam" is braced so sentence-casing styles keep its capital.
@inproceedings{kingma2015adam,
  author        = {Kingma, Diederik P. and Ba, Jimmy},
  title         = {{Adam}: A Method for Stochastic Optimization},
  booktitle     = {International Conference on Learning Representations (ICLR)},
  year          = {2015},
  archivePrefix = {arXiv},
  eprint        = {1412.6980},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1412.6980},
}
% arXiv preprint recorded as a misc entry; the identifier lives in the
% eprint fields, with the DOI and abstract URL given alongside.
@misc{raffel2020t5,
  author        = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and others},
  title         = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  howpublished  = {arXiv preprint},
  year          = {2020},
  eprint        = {1910.10683},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  doi           = {10.48550/arXiv.1910.10683},
  url           = {https://arxiv.org/abs/1910.10683}
}
% Online article. "Claude 3 Sonnet" is braced so sentence-casing styles keep
% the product name capitalised; the middle initial now carries its period,
% matching the other abbreviated names in this entry.
@article{templeton2024scaling,
  author  = {Templeton, Adly and Conerly, Tom and Marcus, Jonathan and Lindsey, Jack and Bricken, Trenton and Chen, Brian and Pearce, Adam and Citro, Craig and Ameisen, Emmanuel and Jones, Andy and Cunningham, Hoagy and Turner, Nicholas L. and McDougall, Callum and MacDiarmid, Monte and Freeman, C. Daniel and Sumers, Theodore R. and Rees, Edward and Batson, Joshua and Jermyn, Adam and Carter, Shan and Olah, Chris and Henighan, Tom},
  title   = {Scaling Monosemanticity: Extracting Interpretable Features from {Claude 3 Sonnet}},
  journal = {Transformer Circuits Thread},
  year    = {2024},
  url     = {https://transformer-circuits.pub/2024/scaling-monosemanticity/index.html},
}
% arXiv preprint. The identifier is stored in the eprint fields instead of
% being embedded in a journal name, and the title is restored to Title Case
% so that styles can downcase it as they see fit.
@misc{cunningham2023sparse,
  author        = {Cunningham, Hoagy and Ewart, Aidan and Riggs, Logan and Huben, Robert and Sharkey, Lee},
  title         = {Sparse Autoencoders Find Highly Interpretable Features in Language Models},
  howpublished  = {arXiv preprint},
  year          = {2023},
  archivePrefix = {arXiv},
  eprint        = {2309.08600},
  url           = {https://arxiv.org/abs/2309.08600},
}
% arXiv preprint. The identifier is stored in the eprint fields instead of
% being embedded in a journal name. The product names "Gemma Scope" and
% "Gemma 2" are braced so style recasing cannot lowercase them.
@misc{lieberum2024gemma,
  author        = {Lieberum, Tom and Rajamanoharan, Senthooran and Conmy, Arthur and Smith, Lewis and Sonnerat, Nicolas and Varma, Vikrant and Kram{\'a}r, J{\'a}nos and Dragan, Anca and Shah, Rohin and Nanda, Neel},
  title         = {{Gemma Scope}: Open Sparse Autoencoders Everywhere All At Once on {Gemma 2}},
  howpublished  = {arXiv preprint},
  year          = {2024},
  archivePrefix = {arXiv},
  eprint        = {2408.05147},
  url           = {https://arxiv.org/abs/2408.05147},
}
% arXiv preprint. The identifier is stored in the eprint fields instead of
% being embedded in a journal name. "AxBench" and "LLMs" are braced because
% their internal capitals would otherwise be lost to style recasing.
@misc{wu2025axbench,
  author        = {Wu, Zhengxuan and Arora, Aryaman and Geiger, Atticus and Wang, Zheng and Huang, Jing and Jurafsky, Dan and Manning, Christopher D. and Potts, Christopher},
  title         = {{AxBench}: Steering {LLMs}? Even Simple Baselines Outperform Sparse Autoencoders},
  howpublished  = {arXiv preprint},
  year          = {2025},
  archivePrefix = {arXiv},
  eprint        = {2501.17148},
  url           = {https://arxiv.org/abs/2501.17148},
}
% arXiv preprint. The identifier is stored in the eprint fields instead of
% being embedded in a journal name. The second author's compound surname is
% wrapped in braces so BibTeX does not split it into given/von/last parts.
% NOTE(review): surname assumed to be "Dupre la Tour" (given name Tom) -- confirm.
@misc{gao2024scaling,
  author        = {Gao, Leo and {Dupr{\'e} la Tour}, Tom and Tillman, Henk and Goh, Gabriel and Troll, Rajan and Radford, Alec and Sutskever, Ilya and Leike, Jan and Wu, Jeffrey},
  title         = {Scaling and Evaluating Sparse Autoencoders},
  howpublished  = {arXiv preprint},
  year          = {2024},
  archivePrefix = {arXiv},
  eprint        = {2406.04093},
  url           = {https://arxiv.org/abs/2406.04093},
}
% arXiv preprint. The identifier is stored in the eprint fields instead of
% being embedded in a journal name. "NNsight" and "NDIF" are braced so
% sentence-casing styles keep their capitals. "and others" is the literal
% BibTeX token for a truncated author list.
@misc{fiotto2024nnsight,
  author        = {Fiotto-Kaufman, Jaden and Loftus, Alexander R. and Todd, Eric and Brinkmann, Jannik and Pal, Koyena and Troitskii, Dmitrii and Ripa, Michael and Belfki, Adam and Rager, Can and Juang, Caden and others},
  title         = {{NNsight} and {NDIF}: Democratizing Access to Open-Weight Foundation Model Internals},
  howpublished  = {arXiv preprint},
  year          = {2024},
  archivePrefix = {arXiv},
  eprint        = {2407.14561},
  url           = {https://arxiv.org/abs/2407.14561},
}