|
---
---
+@inproceedings{zhou2025payattentionsmallweights,
+title={Pay Attention to Small Weights},
+author={Chao Zhou and Tom Jacobs and Advait Gadhikar and Rebekka Burkholz},
+booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
+year={2025},
+url={https://openreview.net/forum?id=XKnOA7MhCz},
+pdf={https://openreview.net/pdf?id=XKnOA7MhCz},
+abstract={Finetuning large pretrained neural networks is known to be resource-intensive, both in terms of memory and computational cost. To mitigate this, a common approach is to restrict training to a subset of the model parameters. By analyzing the relationship between gradients and weights during finetuning, we observe a notable pattern: large gradients are often associated with small-magnitude weights. This correlation is more pronounced in finetuning settings than in training from scratch. Motivated by this observation, we propose NANOADAM, which dynamically updates only the small-magnitude weights during finetuning and offers several practical advantages: first, this criterion is gradient-free -- the parameter subset can be determined without gradient computation; second, it preserves large-magnitude weights, which are likely to encode critical features learned during pretraining, thereby reducing the risk of catastrophic forgetting; thirdly, it permits the use of larger learning rates and consistently leads to better generalization performance in experiments. We demonstrate this for both NLP and vision tasks.},
+}
+
+@inproceedings{Gadhikar2025SignInTT,
+title={Sign-In to the Lottery: Reparameterizing Sparse Training},
+author={Advait Gadhikar and Tom Jacobs and Chao Zhou and Rebekka Burkholz},
+booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
+year={2025},
+url={https://openreview.net/forum?id=iwKT7MEZZw},
+pdf={https://openreview.net/pdf?id=iwKT7MEZZw},
+abstract={The performance gap between training sparse neural networks from scratch (PaI) and dense-to-sparse training presents a major roadblock for efficient deep learning. According to the Lottery Ticket Hypothesis, PaI hinges on finding a problem specific parameter initialization. As we show, to this end, determining correct parameter signs is sufficient. Yet, they remain elusive to PaI. To address this issue, we propose Sign-In, which employs a dynamic reparameterization that provably induces sign flips. Such sign flips are complementary to the ones that dense-to-sparse training can accomplish, rendering Sign-In as an orthogonal method. While our experiments and theory suggest performance improvements of PaI, they also carve out the main open challenge to close the gap between PaI and dense-to-sparse training.},
+img={signin.jpg},
+}
+
+@inproceedings{pham2025the,
+title={The Graphon Limit Hypothesis: Understanding Neural Network Pruning via Infinite Width Analysis},
+author={Hoang Pham and The-Anh Ta and Tom Jacobs and Rebekka Burkholz and Long Tran-Thanh},
+booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
+year={2025},
+url={https://openreview.net/forum?id=EEZLBhyer1},
+pdf={https://openreview.net/pdf?id=EEZLBhyer1},
+abstract={Sparse neural networks promise efficiency, yet training them effectively remains a fundamental challenge. Despite advances in pruning methods that create sparse architectures, understanding why some sparse structures are better trainable than others with the same level of sparsity remains poorly understood. Aiming to develop a systematic approach to this fundamental problem, we propose a novel theoretical framework based on the theory of graph limits, particularly graphons, that characterizes sparse neural networks in the infinite-width regime. Our key insight is that connectivity patterns of sparse neural networks induced by pruning methods converge to specific graphons as networks' width tends to infinity, which encodes implicit structural biases of different pruning methods. We postulate the Graphon Limit Hypothesis and provide empirical evidence to support it. Leveraging this graphon representation, we derive a Graphon Neural Tangent Kernel (Graphon NTK) to study the training dynamics of sparse networks in the infinite width limit. Graphon NTK provides a general framework for the theoretical analysis of sparse networks. We empirically show that the spectral analysis of Graphon NTK correlates with observed training dynamics of sparse networks, explaining the varying convergence behaviours of different pruning methods. Our framework provides theoretical insights into the impact of connectivity patterns on the trainability of various sparse network architectures.},
+}
+
@inproceedings{jacobs2025mirror,
title={Mirror, Mirror of the Flow: How Does Regularization Shape Implicit Bias?},
author={Tom Jacobs and Chao Zhou and Rebekka Burkholz},