From 90ea7a02c655f353a70d1dc4aae5c0e3ac22cf09 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 14:27:43 -0500 Subject: [PATCH 01/13] Add interactive glossary with hover tooltips - Created Sphinx glossary with 15 PyTorch-specific terms - Added interactive hover tooltips using custom JavaScript - Integrated glossary into main documentation navigation - Added sphinx-hoverxref dependency for tooltip functionality --- .ci/docker/requirements.txt | 1 + _static/js/glossary-tooltips.js | 285 ++++++++++++++++++++++++++++++++ conf.py | 16 ++ glossary.rst | 65 ++++++++ index.rst | 6 + 5 files changed, 373 insertions(+) create mode 100644 _static/js/glossary-tooltips.js create mode 100644 glossary.rst diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 086633cf043..73a9dcd1632 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -11,6 +11,7 @@ sphinx-copybutton==0.5.2 sphinx_sitemap==2.7.1 sphinxcontrib-mermaid==1.0.0 sphinxcontrib.katex==0.9.10 +sphinx-hoverxref==1.4.1 pypandoc==1.15 pandocfilters==1.5.1 markdown==3.8.2 diff --git a/_static/js/glossary-tooltips.js b/_static/js/glossary-tooltips.js new file mode 100644 index 00000000000..ed4566df057 --- /dev/null +++ b/_static/js/glossary-tooltips.js @@ -0,0 +1,285 @@ +/** + * Glossary Tooltips - Custom tooltip implementation for Sphinx glossary terms + * + * This script adds hover tooltips to glossary term references throughout the documentation. + * When hovering over a glossary term link, it fetches the definition from the glossary page + * and displays it in a tooltip without requiring navigation. + */ + +(function() { + 'use strict'; + + // Cache for glossary definitions to avoid repeated fetches + const glossaryCache = {}; + let glossaryContent = null; + let tooltip = null; + let currentTarget = null; + let hideTimeout = null; + + /** + * Create the tooltip element + */ + function createTooltip() { + tooltip = document.createElement('div'); + tooltip.className = 'glossary-tooltip'; + tooltip.style.cssText = ` + position: absolute; + display: none; + background: #2c2c2c; + color: #e8e8e8; + padding: 12px 16px; + border-radius: 6px; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3); + max-width: 400px; + z-index: 10000; + font-size: 14px; + line-height: 1.5; + pointer-events: none; + `; + document.body.appendChild(tooltip); + } + + /** + * Fetch and cache glossary content using iframe (works with file:// URLs) + */ + async function fetchGlossaryContent() { + if (glossaryContent) { + return glossaryContent; + } + + return new Promise((resolve, reject) => { + try { + // Create hidden iframe to load glossary + const iframe = document.createElement('iframe'); + iframe.style.display = 'none'; + + // Determine glossary URL + const currentUrl = window.location.href; + const glossaryUrl = currentUrl.substring(0, currentUrl.lastIndexOf('/') + 1) + 'glossary.html'; + + console.log('Loading glossary from:', glossaryUrl); + + iframe.onload = function() { + try { + const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; + + // Verify we got valid content + if (!iframeDoc.getElementById('term-ATen') && !iframeDoc.getElementById('term-JIT')) { + console.warn('Glossary loaded but no terms found'); + } else { + console.log('Glossary content successfully loaded'); + } + + glossaryContent = iframeDoc; + + // Remove iframe after loading + setTimeout(() => { + if (iframe.parentNode) { + iframe.parentNode.removeChild(iframe); + } + }, 100); + + resolve(glossaryContent); + } catch (error) { 
+ console.error('Error accessing iframe content:', error); + reject(error); + } + }; + + iframe.onerror = function(error) { + console.error('Error loading glossary iframe:', error); + reject(error); + }; + + document.body.appendChild(iframe); + iframe.src = glossaryUrl; + + // Timeout after 5 seconds + setTimeout(() => { + if (!glossaryContent) { + console.error('Glossary loading timeout'); + if (iframe.parentNode) { + iframe.parentNode.removeChild(iframe); + } + reject(new Error('Timeout loading glossary')); + } + }, 5000); + + } catch (error) { + console.error('Failed to create glossary iframe:', error); + reject(error); + } + }); + } + + /** + * Extract definition text from glossary entry + */ + function getDefinitionText(termId, doc) { + if (glossaryCache[termId]) { + return glossaryCache[termId]; + } + + try { + // Find the term definition in the glossary + const termElement = doc.getElementById(termId); + if (!termElement) { + return null; + } + + // Get the definition - it's in the
<dd> that follows the <dt>
+ let definitionElement = termElement.nextElementSibling; + while (definitionElement && definitionElement.tagName !== 'DD') { + definitionElement = definitionElement.nextElementSibling; + } + + if (!definitionElement) { + return null; + } + + // Clone the element to manipulate it without affecting the original + const clone = definitionElement.cloneNode(true); + + // Remove any internal reference links (keep the text but remove the link) + clone.querySelectorAll('a.reference.internal').forEach(link => { + const text = document.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + }); + + // Get clean text with basic formatting + let text = clone.textContent.trim(); + + // Limit length and add ellipsis if needed + const maxLength = 300; + if (text.length > maxLength) { + text = text.substring(0, maxLength).trim() + '...'; + } + + glossaryCache[termId] = text; + return text; + } catch (error) { + console.error('Error extracting definition:', error); + return null; + } + } + + /** + * Show tooltip at the given position + */ + function showTooltip(text, target) { + if (!tooltip || !text) { + return; + } + + clearTimeout(hideTimeout); + + tooltip.textContent = text; + tooltip.style.display = 'block'; + + // Position tooltip + const rect = target.getBoundingClientRect(); + const scrollTop = window.pageYOffset || document.documentElement.scrollTop; + const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft; + + let top = rect.bottom + scrollTop + 8; + let left = rect.left + scrollLeft + (rect.width / 2); + + // Adjust position if tooltip would go off-screen + const tooltipRect = tooltip.getBoundingClientRect(); + + // Horizontal adjustment + if (left + tooltipRect.width / 2 > window.innerWidth) { + left = window.innerWidth - tooltipRect.width - 10 + scrollLeft; + } else if (left - tooltipRect.width / 2 < 0) { + left = 10 + scrollLeft; + } else { + left = left - tooltipRect.width / 2; + } + + // Vertical adjustment - show above if no room below + if (rect.bottom + tooltipRect.height + 16 > window.innerHeight + scrollTop) { + top = rect.top + scrollTop - tooltipRect.height - 8; + } + + tooltip.style.top = top + 'px'; + tooltip.style.left = left + 'px'; + + currentTarget = target; + } + + /** + * Hide tooltip with delay + */ + function hideTooltip() { + hideTimeout = setTimeout(() => { + if (tooltip) { + tooltip.style.display = 'none'; + currentTarget = null; + } + }, 100); + } + + /** + * Handle mouse enter on glossary term link + */ + async function handleMouseEnter(event) { + const link = event.currentTarget; + const href = link.getAttribute('href'); + + // Check if this is a glossary term link + if (!href || !href.includes('glossary.html#term-')) { + return; + } + + // Extract term ID from href + const termId = href.split('#')[1]; + if (!termId) { + return; + } + + // Show loading indicator for slow networks + const loadingText = 'Loading definition...'; + showTooltip(loadingText, link); + + // Fetch glossary content if not already cached + const doc = await fetchGlossaryContent(); + if (!doc) { + hideTooltip(); + return; + } + + // Get definition text + const definition = getDefinitionText(termId, doc); + if (definition && currentTarget === link) { + showTooltip(definition, link); + } else { + hideTooltip(); + } + } + + /** + * Initialize tooltips for all glossary term links + */ + function initializeGlossaryTooltips() { + // Create tooltip element + createTooltip(); + + // Find all glossary term links + const glossaryLinks = 
document.querySelectorAll('a.reference.internal[href*="glossary.html#term-"]'); + + glossaryLinks.forEach(link => { + link.addEventListener('mouseenter', handleMouseEnter); + link.addEventListener('mouseleave', hideTooltip); + }); + + console.log(`Initialized glossary tooltips for ${glossaryLinks.length} terms`); + } + + // Initialize when DOM is ready + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', initializeGlossaryTooltips); + } else { + initializeGlossaryTooltips(); + } + +})(); diff --git a/conf.py b/conf.py index 67227c0784b..a150e982fbc 100644 --- a/conf.py +++ b/conf.py @@ -141,8 +141,20 @@ def wrapper(*args, **kwargs): "sphinx_sitemap", "sphinx_reredirects", "sphinxcontrib.mermaid", + "hoverxref.extension", ] +# sphinx-hoverxref configuration +hoverxref_auto_ref = True +hoverxref_domains = ["py", "std"] # Add 'std' domain for glossary terms +hoverxref_role_types = { + "term": "tooltip", # Enable tooltips for glossary terms + "ref": "tooltip", + "std:term": "tooltip", # Explicitly enable tooltips for std domain terms +} +hoverxref_tooltip_maxwidth = 600 +hoverxref_tooltip_theme = ["tooltipster-shadow", "tooltipster-shadow-custom"] + intersphinx_mapping = { "torch": ("https://docs.pytorch.org/docs/stable/", None), "tensordict": ("https://docs.pytorch.org/tensordict/stable", None), @@ -438,6 +450,10 @@ def handle_jinja_templates(app, docname, source): "https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css", ] +html_js_files = [ + "js/glossary-tooltips.js", +] + def html_page_context(app, pagename, templatename, context, doctree): # Check if the page is in gallery directories diff --git a/glossary.rst b/glossary.rst new file mode 100644 index 00000000000..3d172da2130 --- /dev/null +++ b/glossary.rst @@ -0,0 +1,65 @@ +PyTorch Glossary +================ + +This glossary provides definitions for terms commonly used in PyTorch documentation. + +.. glossary:: + :sorted: + + ATen + Short for "A Tensor Library". The foundational tensor and mathematical + operation library on which all else is built. + + Operation + A unit of work. For example, the work of matrix multiplication is an operation + called ``aten::matmul``. + + Native Operation + An operation that comes natively with PyTorch ATen, for example ``aten::matmul``. + + Custom Operation + An Operation that is defined by users and is usually a :term:`Compound Operation`. + For example, this `tutorial `_ + details how to create Custom Operations. + + Kernel + Implementation of a PyTorch operation, specifying what should be done when an + operation executes. + + Compound Operation + A Compound Operation is composed of other operations. Its kernel is usually + device-agnostic. Normally it doesn't have its own derivative functions defined. + Instead, AutoGrad automatically computes its derivative based on operations it + uses. + + Composite Operation + Same as :term:`Compound Operation`. + + Non-Leaf Operation + Same as :term:`Compound Operation`. + + Leaf Operation + An operation that's considered a basic operation, as opposed to a :term:`Compound + Operation`. Leaf Operation always has dispatch functions defined, usually has a + derivative function defined as well. + + Device Kernel + Device-specific kernel of a :term:`Leaf Operation`. + + Compound Kernel + Opposed to :term:`Device Kernels`, Compound kernels are usually + device-agnostic and belong to :term:`Compound Operations`. + + JIT + Just-In-Time Compilation. 
+ + TorchScript + An interface to the TorchScript :term:`JIT` compiler and interpreter. + + Tracing + Using ``torch.jit.trace`` on a function to get an executable that can be optimized + using just-in-time compilation. + + Scripting + Using ``torch.jit.script`` on a function to inspect source code and compile it as + :term:`TorchScript` code. diff --git a/index.rst b/index.rst index 5a5e80abfbb..d0dc5507206 100644 --- a/index.rst +++ b/index.rst @@ -862,3 +862,9 @@ Additional Resources :hidden: prototype/prototype_index + +.. toctree:: + :maxdepth: 1 + :hidden: + + glossary From ada9883258023a6f57b88a0ab16da11f921aa196 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 21:25:37 -0500 Subject: [PATCH 02/13] Updating glossary file. --- glossary.rst | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/glossary.rst b/glossary.rst index 3d172da2130..28027b42ccd 100644 --- a/glossary.rst +++ b/glossary.rst @@ -10,6 +10,109 @@ This glossary provides definitions for terms commonly used in PyTorch documentat Short for "A Tensor Library". The foundational tensor and mathematical operation library on which all else is built. + attention mechanism + A technique used in deep learning models, particularly transformer architectures, + to selectively focus on certain input elements or tokens when computing output + representations, improving performance and interpretability. + + backward pass + The backward pass is part of the backpropagation algorithm where the error + gradients are computed and propagated backwards through the network, adjusting + the weights and biases to minimize the loss. + + backpropagation + An essential algorithm in training neural networks. It calculates the gradient + of the loss function with respect to the model's parameters, allowing the + network to learn from its mistakes and improve over time. + + CNN + Convolutional Neural Network: A type of neural network designed for image and + video processing, using convolutional and pooling layers to extract features. + + Convolutional Neural Network + A type of neural network designed for image and video processing, using + convolutional and pooling layers to extract features. Also known as CNN. + + CUDA + Compute Unified Device Architecture: A parallel computing platform developed + by NVIDIA that allows developers to use GPUs for general-purpose computing, + including machine learning and deep learning applications. + + embedding + A way to represent categorical variables as dense vectors, often used in + natural language processing and recommender systems. + + epoch + An epoch is a unit of measurement in machine learning that represents one + complete pass through the entire training dataset. During each epoch, the + model's weights are updated based on the loss calculated from the predictions + made on the training data. + + forward pass + The forward pass is the process of passing input data through a neural network + to obtain an output prediction. It's the first step in training a model, + followed by the backward pass and optimization. + + GPU + Graphics Processing Unit: A specialized electronic circuit designed to quickly + manipulate and alter memory to accelerate computations. In the context of AI + and machine learning, GPUs are used to accelerate computationally intensive + tasks like training neural networks. 
+ + gradient + In machine learning, the gradient represents the rate of change of the loss + function with respect to the model's parameters. It's used in backpropagation + to update the weights and biases during training. + + Inductor + A PyTorch component that enables just-in-time (JIT) compilation of PyTorch + models, allowing for faster inference times and better performance on CPUs + and GPUs. It is the default backend for torch.compile. + + inference + The process of making predictions or drawing conclusions from a trained AI + model, typically involving the application of the learned relationships to + new, unseen data. + + loss function + A loss function, also known as a cost function, is a mathematical function + used to evaluate the performance of a machine learning model during training, + providing a measure of how well the model is doing. + + LSTM + Long Short-Term Memory Network: A type of recurrent neural network (RNN) + designed to handle sequential data with long-term dependencies. LSTMs use + memory cells and gates to selectively retain information over time. + + optimizer + An algorithm used to update the weights and biases of a neural network during + training to minimize the loss function. Common optimizers include SGD, Adam, + and RMSprop. + + quantization + A technique used to reduce the precision of numerical values in a deep learning + model, often to reduce memory usage, improve performance, and enable deployment + on resource-constrained devices. + + RNN + Recurrent Neural Network: A type of neural network designed for sequential data, + using recurrent connections to capture temporal dependencies. + + tensor + Tensors are a specialized data structure that are very similar to arrays and + matrices. In PyTorch, tensors are used to encode the inputs and outputs of a + model, as well as the model's parameters. + + torch.compile + A PyTorch function that compiles PyTorch code into an optimized form, allowing + for faster execution and better performance. It is the main entry point for + PyTorch 2.x optimizations. + + transformer + A type of neural network architecture introduced in the paper "Attention is All + You Need" (Vaswani et al., 2017), which relies entirely on self-attention + mechanisms to process sequential data, such as text or images. + Operation A unit of work. For example, the work of matrix multiplication is an operation called ``aten::matmul``. @@ -51,7 +154,8 @@ This glossary provides definitions for terms commonly used in PyTorch documentat device-agnostic and belong to :term:`Compound Operations`. JIT - Just-In-Time Compilation. + Just-In-Time Compilation: A compilation technique where code is compiled into + machine code at runtime, just before it is executed. TorchScript An interface to the TorchScript :term:`JIT` compiler and interpreter. From 2345aa43f5ca62d083f48253506ad0f56211110e Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 21:26:05 -0500 Subject: [PATCH 03/13] Updating tooltip js. --- _static/js/glossary-tooltips.js | 80 ++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/_static/js/glossary-tooltips.js b/_static/js/glossary-tooltips.js index ed4566df057..cf024479ceb 100644 --- a/_static/js/glossary-tooltips.js +++ b/_static/js/glossary-tooltips.js @@ -6,7 +6,7 @@ * and displays it in a tooltip without requiring navigation. 
*/ -(function() { +(function () { 'use strict'; // Cache for glossary definitions to avoid repeated fetches @@ -53,13 +53,41 @@ const iframe = document.createElement('iframe'); iframe.style.display = 'none'; - // Determine glossary URL - const currentUrl = window.location.href; - const glossaryUrl = currentUrl.substring(0, currentUrl.lastIndexOf('/') + 1) + 'glossary.html'; + // Determine glossary URL - find the base URL by looking for common patterns + const currentPath = window.location.pathname; + let basePath = ''; + + // Find the root of the documentation + const pathParts = currentPath.split('/'); + for (let i = 0; i < pathParts.length; i++) { + if (pathParts[i] === '_build') { + // For local builds, glossary is at _build/html/glossary.html + basePath = pathParts.slice(0, i + 2).join('/') + '/'; + break; + } + } + + // If we couldn't find _build, try to find common doc directories + if (!basePath) { + const knownDirs = ['advanced', 'beginner', 'intermediate', 'recipes', 'prototype', 'unstable']; + for (let i = pathParts.length - 1; i >= 0; i--) { + if (knownDirs.includes(pathParts[i])) { + basePath = pathParts.slice(0, i).join('/') + '/'; + break; + } + } + } + + // Fallback to going up directories based on current location + if (!basePath) { + basePath = currentPath.substring(0, currentPath.lastIndexOf('/') + 1) + '../'; + } + + const glossaryUrl = window.location.origin + basePath + 'glossary.html'; console.log('Loading glossary from:', glossaryUrl); - iframe.onload = function() { + iframe.onload = function () { try { const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; @@ -70,14 +98,22 @@ console.log('Glossary content successfully loaded'); } - glossaryContent = iframeDoc; + // Clone the body content before removing iframe + const clonedBody = iframeDoc.body.cloneNode(true); + + // Create a container to hold the content + const container = document.createElement('div'); + container.innerHTML = clonedBody.innerHTML; + container.style.display = 'none'; + container.id = 'glossary-content-cache'; + document.body.appendChild(container); + + glossaryContent = container; - // Remove iframe after loading - setTimeout(() => { - if (iframe.parentNode) { - iframe.parentNode.removeChild(iframe); - } - }, 100); + // Remove iframe after cloning + if (iframe.parentNode) { + iframe.parentNode.removeChild(iframe); + } resolve(glossaryContent); } catch (error) { @@ -86,7 +122,7 @@ } }; - iframe.onerror = function(error) { + iframe.onerror = function (error) { console.error('Error loading glossary iframe:', error); reject(error); }; @@ -115,15 +151,17 @@ /** * Extract definition text from glossary entry */ - function getDefinitionText(termId, doc) { + function getDefinitionText(termId, container) { if (glossaryCache[termId]) { return glossaryCache[termId]; } try { // Find the term definition in the glossary - const termElement = doc.getElementById(termId); + // container is a div element, so use querySelector instead of getElementById + const termElement = container.querySelector('#' + CSS.escape(termId)); if (!termElement) { + console.warn('Term not found:', termId); return null; } @@ -134,6 +172,18 @@ } if (!definitionElement) { + // Try looking for the parent dt and its sibling dd + const parentDt = termElement.closest('dt'); + if (parentDt) { + definitionElement = parentDt.nextElementSibling; + while (definitionElement && definitionElement.tagName !== 'DD') { + definitionElement = definitionElement.nextElementSibling; + } + } + } + + if (!definitionElement) { + 
console.warn('Definition element not found for:', termId); return null; } From d00c05a2732eafb1630aa994f4d0b8b9c5893d7a Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 21:40:00 -0500 Subject: [PATCH 04/13] Adding markup for glossary terms. --- advanced_source/cpp_autograd.rst | 54 +++++++++---------- advanced_source/cpp_export.rst | 4 +- advanced_source/cpp_frontend.rst | 5 +- advanced_source/custom_class_pt2.rst | 2 +- advanced_source/dispatcher.rst | 6 +-- advanced_source/extend_dispatcher.rst | 8 +-- advanced_source/torch-script-parallelism.rst | 4 +- advanced_source/torch_script_custom_ops.rst | 4 +- compilers_index.rst | 2 +- recipes_source/compiling_optimizer.rst | 26 ++++----- .../distributed_optim_torchscript.rst | 4 +- recipes_source/torchscript_inference.rst | 4 +- 12 files changed, 64 insertions(+), 59 deletions(-) diff --git a/advanced_source/cpp_autograd.rst b/advanced_source/cpp_autograd.rst index 51e5e0b358f..5bc488da2b1 100644 --- a/advanced_source/cpp_autograd.rst +++ b/advanced_source/cpp_autograd.rst @@ -15,7 +15,7 @@ Basic autograd operations (Adapted from `this tutorial `_) -Create a tensor and set ``torch::requires_grad()`` to track computation with it +Create a :term:`tensor` and set ``torch::requires_grad()`` to track computation with it .. code-block:: cpp @@ -64,7 +64,7 @@ Do more operations on ``y`` auto z = y * y * 3; auto out = z.mean(); - + std::cout << z << std::endl; std::cout << z.grad_fn()->name() << std::endl; std::cout << out << std::endl; @@ -90,10 +90,10 @@ Out: auto a = torch::randn({2, 2}); a = ((a * 3) / (a - 1)); std::cout << a.requires_grad() << std::endl; - + a.requires_grad_(true); std::cout << a.requires_grad() << std::endl; - + auto b = (a * a).sum(); std::cout << b.grad_fn()->name() << std::endl; @@ -106,13 +106,13 @@ Out: SumBackward0 Let's backprop now. Because ``out`` contains a single scalar, ``out.backward()`` -is equivalent to ``out.backward(torch::tensor(1.))``. +is equivalent to ``out.backward(torch::tensor(1.))``. This is part of the :term:`backward pass`. .. code-block:: cpp out.backward(); -Print gradients d(out)/dx +Print :term:`gradients` d(out)/dx .. code-block:: cpp @@ -134,12 +134,12 @@ Now let's take a look at an example of vector-Jacobian product: .. code-block:: cpp x = torch::randn(3, torch::requires_grad()); - + y = x * 2; while (y.norm().item() < 1000) { y = y * 2; } - + std::cout << y << std::endl; std::cout << y.grad_fn()->name() << std::endl; @@ -159,7 +159,7 @@ If we want the vector-Jacobian product, pass the vector to ``backward`` as argum auto v = torch::tensor({0.1, 1.0, 0.0001}, torch::kFloat); y.backward(v); - + std::cout << x.grad() << std::endl; Out: @@ -178,7 +178,7 @@ either by putting ``torch::NoGradGuard`` in a code block std::cout << x.requires_grad() << std::endl; std::cout << x.pow(2).requires_grad() << std::endl; - + { torch::NoGradGuard no_grad; std::cout << x.pow(2).requires_grad() << std::endl; @@ -218,31 +218,31 @@ please see `the corresponding C++ API docs ` is calculating :term:`gradient` penalty. Let's see an example of it using ``torch::autograd::grad``: .. 
code-block:: cpp #include - + auto model = torch::nn::Linear(4, 3); - + auto input = torch::randn({3, 4}).requires_grad_(true); auto output = model(input); - + // Calculate loss auto target = torch::randn({3, 3}); auto loss = torch::nn::MSELoss()(output, target); - + // Use norm of gradients as penalty auto grad_output = torch::ones_like(output); auto gradient = torch::autograd::grad({output}, {input}, /*grad_outputs=*/{grad_output}, /*create_graph=*/true)[0]; auto gradient_penalty = torch::pow((gradient.norm(2, /*dim=*/1) - 1), 2).mean(); - + // Add gradient penalty to loss auto combined_loss = loss + gradient_penalty; combined_loss.backward(); - + std::cout << input.grad() << std::endl; Out: @@ -277,14 +277,14 @@ Below you can find code for a ``Linear`` function from ``torch::nn``: .. code-block:: cpp #include - + using namespace torch::autograd; - + // Inherit from Function class LinearFunction : public Function { public: // Note that both forward and backward are static functions - + // bias is an optional argument static torch::Tensor forward( AutogradContext *ctx, torch::Tensor input, torch::Tensor weight, torch::Tensor bias = torch::Tensor()) { @@ -295,13 +295,13 @@ Below you can find code for a ``Linear`` function from ``torch::nn``: } return output; } - + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) { auto saved = ctx->get_saved_variables(); auto input = saved[0]; auto weight = saved[1]; auto bias = saved[2]; - + auto grad_output = grad_outputs[0]; auto grad_input = grad_output.mm(weight); auto grad_weight = grad_output.t().mm(input); @@ -309,7 +309,7 @@ Below you can find code for a ``Linear`` function from ``torch::nn``: if (bias.defined()) { grad_bias = grad_output.sum(0); } - + return {grad_input, grad_weight, grad_bias}; } }; @@ -322,7 +322,7 @@ Then, we can use the ``LinearFunction`` in the following way: auto weight = torch::randn({4, 3}).requires_grad_(); auto y = LinearFunction::apply(x, weight); y.sum().backward(); - + std::cout << x.grad() << std::endl; std::cout << weight.grad() << std::endl; @@ -344,9 +344,9 @@ Here, we give an additional example of a function that is parametrized by non-te .. code-block:: cpp #include - + using namespace torch::autograd; - + class MulConstant : public Function { public: static torch::Tensor forward(AutogradContext *ctx, torch::Tensor tensor, double constant) { @@ -355,7 +355,7 @@ Here, we give an additional example of a function that is parametrized by non-te ctx->saved_data["constant"] = constant; return tensor * constant; } - + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) { // We return as many input gradients as there were arguments. // Gradients of non-tensor arguments to forward must be `torch::Tensor()`. diff --git a/advanced_source/cpp_export.rst b/advanced_source/cpp_export.rst index 56c4bcbaae7..25ac0713bfd 100644 --- a/advanced_source/cpp_export.rst +++ b/advanced_source/cpp_export.rst @@ -1,3 +1,3 @@ .. warning:: - TorchScript is deprecated, please use - `torch.export `__ instead. \ No newline at end of file + :term:`TorchScript` is deprecated, please use + `torch.export `__ instead. diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst index 968afa01b23..2739a051a94 100644 --- a/advanced_source/cpp_frontend.rst +++ b/advanced_source/cpp_frontend.rst @@ -38,8 +38,10 @@ with the C++ frontend. Concretely, we will be training a `DCGAN generate images of MNIST digits. 
While conceptually a simple example, it should be enough to give you a whirlwind overview of the PyTorch C++ frontend and wet your appetite for training more complex models. We will begin with some +We'll begin with some motivating words for why you would want to use the C++ frontend to begin with, -and then dive straight into defining and training our model. +and then dive straight into defining and training our model. In this tutorial, we'll train +a model on :term:`GPU` for optimal performance. .. tip:: @@ -961,6 +963,7 @@ Writing the Training Loop Let's now finish the algorithmic part of our example and implement the delicate dance between the generator and discriminator. First, we'll create two optimizers, one for the generator and one for the discriminator. The optimizers +The :term:`optimizer`s we use implement the `Adam `_ algorithm: .. code-block:: cpp diff --git a/advanced_source/custom_class_pt2.rst b/advanced_source/custom_class_pt2.rst index 229a94f2ce9..f3c43016ddd 100644 --- a/advanced_source/custom_class_pt2.rst +++ b/advanced_source/custom_class_pt2.rst @@ -247,7 +247,7 @@ After re-compilation, we can export the custom op with: Why do we need to make a Fake Class? ------------------------------------ -Tracing with real custom object has several major downsides: +:term:`Tracing` with real custom object has several major downsides: 1. Operators on real objects can be time consuming e.g. the custom object might be reading from the network or loading data from the disk. diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 4b03803c15b..f824fe3d004 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -18,7 +18,7 @@ of another. Here is a sampling of some of the things it handles: depending on whether or not autograd handling is necessary. * Applying autocasting when necessary for automatic mixed precision. * Applying batching rules when an operator is run under a ``vmap`` call. -* Tracing execution of operations, if you are tracing a model for export. +* :term:`Tracing` execution of operations, if you are tracing a model for export. If in your `custom operator code `_ you find yourself manually writing if statements to handle these cases, the dispatcher APIs can @@ -403,8 +403,8 @@ a kernel at the Batched dispatch key. Tracer ^^^^^^ -The Tracer dispatch key implements support for recording invocations of operators -into a trace when you run ``torch.jit.trace``. We intend to provide a +The Tracer dispatch key implements support for recording invocations of :term:`operations` +into a trace when you run ``torch.jit.trace`` (:term:`Tracing`). boxed fallback that will implement tracing for arbitrary operations, see `issue #41478 `_ to track progress. diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst index 12f15355f5f..2422a5ef337 100644 --- a/advanced_source/extend_dispatcher.rst +++ b/advanced_source/extend_dispatcher.rst @@ -3,10 +3,13 @@ Extending dispatcher for a new backend in C++ In this tutorial we will walk through all necessary steps to extend the dispatcher to add a new device living outside ``pytorch/pytorch`` repo and maintain it to keep in -sync with native PyTorch devices. Here we'll assume that you're familiar with how +with native PyTorch devices. Here we'll assume that you're familiar with how to `register a dispatched operator in C++ `_ and how to write a `custom autograd function `_. 
+Note: This tutorial covers extending the dispatcher for custom backends that +implement :term:`device kernels` for :term:`operations`. + .. note:: @@ -295,7 +298,7 @@ JIT support As we mentioned in `Registering a Dispatched Operator in C++ `_, kernels registered through `m.impl()` API support being called in both unboxed and boxed ways. In other words your customized backend can also work with our -JIT tracing/scripting frontend just like the in-tree backends like CPU or CUDA do. You could potentially also write specialized optimization +:term:`JIT` :term:`tracing`/:term:`scripting` frontend just like the in-tree backends like CPU or CUDA do. passes for your backend on a JIT graph. But we will not discuss it here since we haven't finalized the integration point in JIT, so the current backend support will focus on the eager frontend for now. @@ -377,4 +380,3 @@ any feature requests or bug reports, please `file an issue on github `__ instead. \ No newline at end of file + :term:`TorchScript` is deprecated, please use + `torch.export `__ instead. diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 01bc497d38e..dceeb7d78bb 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456 .. warning:: - TorchScript is deprecated, please use - `torch.export `__ instead. \ No newline at end of file + :term:`TorchScript` is deprecated, please use + `torch.export `__ instead. diff --git a/compilers_index.rst b/compilers_index.rst index ec426cecc80..4820a1627a8 100644 --- a/compilers_index.rst +++ b/compilers_index.rst @@ -10,7 +10,7 @@ control, as well as third-party backend solutions. .. warning:: - TorchScript is no longer in active development. + :term:`TorchScript` is no longer in active development. .. raw:: html diff --git a/recipes_source/compiling_optimizer.rst b/recipes_source/compiling_optimizer.rst index 951495ca4fa..2352116a983 100644 --- a/recipes_source/compiling_optimizer.rst +++ b/recipes_source/compiling_optimizer.rst @@ -1,12 +1,12 @@ -(beta) Compiling the optimizer with torch.compile +(beta) Compiling the :term:`optimizer` with :term:`torch.compile` ========================================================================================== **Author:** `Michael Lazos `_ -The optimizer is a key algorithm for training any deep learning model. +The :term:`optimizer` is a key algorithm for training any deep learning model. Since it is responsible for updating every model parameter, it can often -become the bottleneck in training performance for large models. In this recipe, -we will apply ``torch.compile`` to the optimizer to observe the GPU performance +become the bottleneck in training performance for large models. In this recipe, +we will apply ``torch.compile`` to the optimizer to observe the :term:`GPU` performance improvement. .. note:: @@ -24,7 +24,7 @@ Depending on what machine you are using, your exact results may vary. .. code-block:: python import torch - + model = torch.nn.Sequential( *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] ) @@ -39,7 +39,7 @@ and create a helper function to wrap the step() in ``torch.compile()``. .. note:: - + ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 .. code-block:: python @@ -57,12 +57,12 @@ in ``torch.compile()``. 
@torch.compile(fullgraph=False) def fn(): opt.step() - - + + # Let's define a helpful benchmarking function: import torch.utils.benchmark as benchmark - - + + def benchmark_torch_function_in_microseconds(f, *args, **kwargs): t0 = benchmark.Timer( stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} @@ -73,12 +73,12 @@ in ``torch.compile()``. # Warmup runs to compile the function for _ in range(5): fn() - + eager_runtime = benchmark_torch_function_in_microseconds(opt.step) compiled_runtime = benchmark_torch_function_in_microseconds(fn) - + assert eager_runtime > compiled_runtime - + print(f"eager runtime: {eager_runtime}us") print(f"compiled runtime: {compiled_runtime}us") diff --git a/recipes_source/distributed_optim_torchscript.rst b/recipes_source/distributed_optim_torchscript.rst index 01bc497d38e..dceeb7d78bb 100644 --- a/recipes_source/distributed_optim_torchscript.rst +++ b/recipes_source/distributed_optim_torchscript.rst @@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456 .. warning:: - TorchScript is deprecated, please use - `torch.export `__ instead. \ No newline at end of file + :term:`TorchScript` is deprecated, please use + `torch.export `__ instead. diff --git a/recipes_source/torchscript_inference.rst b/recipes_source/torchscript_inference.rst index 01bc497d38e..dceeb7d78bb 100644 --- a/recipes_source/torchscript_inference.rst +++ b/recipes_source/torchscript_inference.rst @@ -2,5 +2,5 @@ TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456 .. warning:: - TorchScript is deprecated, please use - `torch.export `__ instead. \ No newline at end of file + :term:`TorchScript` is deprecated, please use + `torch.export `__ instead. From e987b6197098d35b172e160d3c19008fd553c348 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 21:57:37 -0500 Subject: [PATCH 05/13] Fixing text. --- advanced_source/cpp_frontend.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst index 2739a051a94..56b8d587f8a 100644 --- a/advanced_source/cpp_frontend.rst +++ b/advanced_source/cpp_frontend.rst @@ -36,10 +36,10 @@ This tutorial will walk you through an end-to-end example of training a model with the C++ frontend. Concretely, we will be training a `DCGAN `_ -- a kind of generative model -- to generate images of MNIST digits. While conceptually a simple example, it should -be enough to give you a whirlwind overview of the PyTorch C++ frontend and wet -your appetite for training more complex models. We will begin with some -We'll begin with some -motivating words for why you would want to use the C++ frontend to begin with, +be enough to give you a whirlwind overview of the PyTorch C++ frontend and whet +your appetite for training more complex models. + +We'll begin with some motivating words for why you would want to use the C++ frontend to begin with, and then dive straight into defining and training our model. In this tutorial, we'll train a model on :term:`GPU` for optimal performance. @@ -962,9 +962,8 @@ Writing the Training Loop Let's now finish the algorithmic part of our example and implement the delicate dance between the generator and discriminator. First, we'll create two -optimizers, one for the generator and one for the discriminator. 
The optimizers -The :term:`optimizer`s -we use implement the `Adam `_ algorithm: +optimizers, one for the generator and one for the discriminator. +The :term:`optimizer`s we use implement the `Adam `_ algorithm: .. code-block:: cpp From aec056c725b1d98a05208b799529e4b524518e95 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 22:18:12 -0500 Subject: [PATCH 06/13] Fixing broken link. --- conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf.py b/conf.py index a150e982fbc..056a2aa6e99 100644 --- a/conf.py +++ b/conf.py @@ -191,7 +191,7 @@ def wrapper(*args, **kwargs): "show_signature": False, "first_notebook_cell": ( "# For tips on running notebooks in Google Colab, see\n" - "# https://docs.pytorch.org/tutorials/beginner/colab\n" + "# https://docs.pytorch.org/tutorials/beginner/colab" "%matplotlib inline" ), "ignore_pattern": r"_torch_export_nightly_tutorial.py", From 66c6927dda6c2e74c325e3234b4a4153497ba5f4 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Mon, 15 Dec 2025 23:18:40 -0500 Subject: [PATCH 07/13] Fixing issue with tabs library. --- .ci/docker/requirements.txt | 1 + conf.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 73a9dcd1632..308ebdbf2ad 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -12,6 +12,7 @@ sphinx_sitemap==2.7.1 sphinxcontrib-mermaid==1.0.0 sphinxcontrib.katex==0.9.10 sphinx-hoverxref==1.4.1 +sphinx-tabs pypandoc==1.15 pandocfilters==1.5.1 markdown==3.8.2 diff --git a/conf.py b/conf.py index 056a2aa6e99..68efef63fb0 100644 --- a/conf.py +++ b/conf.py @@ -154,6 +154,7 @@ def wrapper(*args, **kwargs): } hoverxref_tooltip_maxwidth = 600 hoverxref_tooltip_theme = ["tooltipster-shadow", "tooltipster-shadow-custom"] +hoverxref_sphinxtabs = True # Enable sphinx-tabs integration intersphinx_mapping = { "torch": ("https://docs.pytorch.org/docs/stable/", None), From 452f8acb3a13fd8a735c4250be89fb46810a9c5c Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 10:40:50 -0800 Subject: [PATCH 08/13] Update --- .ci/docker/requirements.txt | 6 +- _static/js/glossary-tooltips.js | 335 -------------------------------- conf.py | 22 +-- glossary.md | 168 ++++++++++++++++ 4 files changed, 182 insertions(+), 349 deletions(-) delete mode 100644 _static/js/glossary-tooltips.js create mode 100644 glossary.md diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 308ebdbf2ad..c8c6fec9ac9 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -11,14 +11,14 @@ sphinx-copybutton==0.5.2 sphinx_sitemap==2.7.1 sphinxcontrib-mermaid==1.0.0 sphinxcontrib.katex==0.9.10 -sphinx-hoverxref==1.4.1 -sphinx-tabs +sphinx_tippy==0.4.3 pypandoc==1.15 pandocfilters==1.5.1 markdown==3.8.2 # PyTorch Theme -pytorch_sphinx_theme2==0.2.0 +#pytorch_sphinx_theme2==0.2.0 +git+https://github.com/pytorch/pytorch_sphinx_theme.git@5b6d2df5660d2ccf4b34cf819b7ab7c69f65f20d#egg=pytorch_sphinx_theme2 # Tutorial dependencies tqdm==4.66.1 diff --git a/_static/js/glossary-tooltips.js b/_static/js/glossary-tooltips.js deleted file mode 100644 index cf024479ceb..00000000000 --- a/_static/js/glossary-tooltips.js +++ /dev/null @@ -1,335 +0,0 @@ -/** - * Glossary Tooltips - Custom tooltip implementation for Sphinx glossary terms - * - * This script adds hover tooltips to glossary term references throughout the documentation. 
- * When hovering over a glossary term link, it fetches the definition from the glossary page - * and displays it in a tooltip without requiring navigation. - */ - -(function () { - 'use strict'; - - // Cache for glossary definitions to avoid repeated fetches - const glossaryCache = {}; - let glossaryContent = null; - let tooltip = null; - let currentTarget = null; - let hideTimeout = null; - - /** - * Create the tooltip element - */ - function createTooltip() { - tooltip = document.createElement('div'); - tooltip.className = 'glossary-tooltip'; - tooltip.style.cssText = ` - position: absolute; - display: none; - background: #2c2c2c; - color: #e8e8e8; - padding: 12px 16px; - border-radius: 6px; - box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3); - max-width: 400px; - z-index: 10000; - font-size: 14px; - line-height: 1.5; - pointer-events: none; - `; - document.body.appendChild(tooltip); - } - - /** - * Fetch and cache glossary content using iframe (works with file:// URLs) - */ - async function fetchGlossaryContent() { - if (glossaryContent) { - return glossaryContent; - } - - return new Promise((resolve, reject) => { - try { - // Create hidden iframe to load glossary - const iframe = document.createElement('iframe'); - iframe.style.display = 'none'; - - // Determine glossary URL - find the base URL by looking for common patterns - const currentPath = window.location.pathname; - let basePath = ''; - - // Find the root of the documentation - const pathParts = currentPath.split('/'); - for (let i = 0; i < pathParts.length; i++) { - if (pathParts[i] === '_build') { - // For local builds, glossary is at _build/html/glossary.html - basePath = pathParts.slice(0, i + 2).join('/') + '/'; - break; - } - } - - // If we couldn't find _build, try to find common doc directories - if (!basePath) { - const knownDirs = ['advanced', 'beginner', 'intermediate', 'recipes', 'prototype', 'unstable']; - for (let i = pathParts.length - 1; i >= 0; i--) { - if (knownDirs.includes(pathParts[i])) { - basePath = pathParts.slice(0, i).join('/') + '/'; - break; - } - } - } - - // Fallback to going up directories based on current location - if (!basePath) { - basePath = currentPath.substring(0, currentPath.lastIndexOf('/') + 1) + '../'; - } - - const glossaryUrl = window.location.origin + basePath + 'glossary.html'; - - console.log('Loading glossary from:', glossaryUrl); - - iframe.onload = function () { - try { - const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; - - // Verify we got valid content - if (!iframeDoc.getElementById('term-ATen') && !iframeDoc.getElementById('term-JIT')) { - console.warn('Glossary loaded but no terms found'); - } else { - console.log('Glossary content successfully loaded'); - } - - // Clone the body content before removing iframe - const clonedBody = iframeDoc.body.cloneNode(true); - - // Create a container to hold the content - const container = document.createElement('div'); - container.innerHTML = clonedBody.innerHTML; - container.style.display = 'none'; - container.id = 'glossary-content-cache'; - document.body.appendChild(container); - - glossaryContent = container; - - // Remove iframe after cloning - if (iframe.parentNode) { - iframe.parentNode.removeChild(iframe); - } - - resolve(glossaryContent); - } catch (error) { - console.error('Error accessing iframe content:', error); - reject(error); - } - }; - - iframe.onerror = function (error) { - console.error('Error loading glossary iframe:', error); - reject(error); - }; - - document.body.appendChild(iframe); - 
iframe.src = glossaryUrl; - - // Timeout after 5 seconds - setTimeout(() => { - if (!glossaryContent) { - console.error('Glossary loading timeout'); - if (iframe.parentNode) { - iframe.parentNode.removeChild(iframe); - } - reject(new Error('Timeout loading glossary')); - } - }, 5000); - - } catch (error) { - console.error('Failed to create glossary iframe:', error); - reject(error); - } - }); - } - - /** - * Extract definition text from glossary entry - */ - function getDefinitionText(termId, container) { - if (glossaryCache[termId]) { - return glossaryCache[termId]; - } - - try { - // Find the term definition in the glossary - // container is a div element, so use querySelector instead of getElementById - const termElement = container.querySelector('#' + CSS.escape(termId)); - if (!termElement) { - console.warn('Term not found:', termId); - return null; - } - - // Get the definition - it's in the
<dd> that follows the <dt>
- let definitionElement = termElement.nextElementSibling; - while (definitionElement && definitionElement.tagName !== 'DD') { - definitionElement = definitionElement.nextElementSibling; - } - - if (!definitionElement) { - // Try looking for the parent dt and its sibling dd - const parentDt = termElement.closest('dt'); - if (parentDt) { - definitionElement = parentDt.nextElementSibling; - while (definitionElement && definitionElement.tagName !== 'DD') { - definitionElement = definitionElement.nextElementSibling; - } - } - } - - if (!definitionElement) { - console.warn('Definition element not found for:', termId); - return null; - } - - // Clone the element to manipulate it without affecting the original - const clone = definitionElement.cloneNode(true); - - // Remove any internal reference links (keep the text but remove the link) - clone.querySelectorAll('a.reference.internal').forEach(link => { - const text = document.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); - }); - - // Get clean text with basic formatting - let text = clone.textContent.trim(); - - // Limit length and add ellipsis if needed - const maxLength = 300; - if (text.length > maxLength) { - text = text.substring(0, maxLength).trim() + '...'; - } - - glossaryCache[termId] = text; - return text; - } catch (error) { - console.error('Error extracting definition:', error); - return null; - } - } - - /** - * Show tooltip at the given position - */ - function showTooltip(text, target) { - if (!tooltip || !text) { - return; - } - - clearTimeout(hideTimeout); - - tooltip.textContent = text; - tooltip.style.display = 'block'; - - // Position tooltip - const rect = target.getBoundingClientRect(); - const scrollTop = window.pageYOffset || document.documentElement.scrollTop; - const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft; - - let top = rect.bottom + scrollTop + 8; - let left = rect.left + scrollLeft + (rect.width / 2); - - // Adjust position if tooltip would go off-screen - const tooltipRect = tooltip.getBoundingClientRect(); - - // Horizontal adjustment - if (left + tooltipRect.width / 2 > window.innerWidth) { - left = window.innerWidth - tooltipRect.width - 10 + scrollLeft; - } else if (left - tooltipRect.width / 2 < 0) { - left = 10 + scrollLeft; - } else { - left = left - tooltipRect.width / 2; - } - - // Vertical adjustment - show above if no room below - if (rect.bottom + tooltipRect.height + 16 > window.innerHeight + scrollTop) { - top = rect.top + scrollTop - tooltipRect.height - 8; - } - - tooltip.style.top = top + 'px'; - tooltip.style.left = left + 'px'; - - currentTarget = target; - } - - /** - * Hide tooltip with delay - */ - function hideTooltip() { - hideTimeout = setTimeout(() => { - if (tooltip) { - tooltip.style.display = 'none'; - currentTarget = null; - } - }, 100); - } - - /** - * Handle mouse enter on glossary term link - */ - async function handleMouseEnter(event) { - const link = event.currentTarget; - const href = link.getAttribute('href'); - - // Check if this is a glossary term link - if (!href || !href.includes('glossary.html#term-')) { - return; - } - - // Extract term ID from href - const termId = href.split('#')[1]; - if (!termId) { - return; - } - - // Show loading indicator for slow networks - const loadingText = 'Loading definition...'; - showTooltip(loadingText, link); - - // Fetch glossary content if not already cached - const doc = await fetchGlossaryContent(); - if (!doc) { - hideTooltip(); - return; - } - - // Get definition text - 
const definition = getDefinitionText(termId, doc); - if (definition && currentTarget === link) { - showTooltip(definition, link); - } else { - hideTooltip(); - } - } - - /** - * Initialize tooltips for all glossary term links - */ - function initializeGlossaryTooltips() { - // Create tooltip element - createTooltip(); - - // Find all glossary term links - const glossaryLinks = document.querySelectorAll('a.reference.internal[href*="glossary.html#term-"]'); - - glossaryLinks.forEach(link => { - link.addEventListener('mouseenter', handleMouseEnter); - link.addEventListener('mouseleave', hideTooltip); - }); - - console.log(`Initialized glossary tooltips for ${glossaryLinks.length} terms`); - } - - // Initialize when DOM is ready - if (document.readyState === 'loading') { - document.addEventListener('DOMContentLoaded', initializeGlossaryTooltips); - } else { - initializeGlossaryTooltips(); - } - -})(); diff --git a/conf.py b/conf.py index 68efef63fb0..128395e4b7c 100644 --- a/conf.py +++ b/conf.py @@ -141,20 +141,20 @@ def wrapper(*args, **kwargs): "sphinx_sitemap", "sphinx_reredirects", "sphinxcontrib.mermaid", - "hoverxref.extension", + "sphinx_tippy", ] -# sphinx-hoverxref configuration -hoverxref_auto_ref = True -hoverxref_domains = ["py", "std"] # Add 'std' domain for glossary terms -hoverxref_role_types = { - "term": "tooltip", # Enable tooltips for glossary terms - "ref": "tooltip", - "std:term": "tooltip", # Explicitly enable tooltips for std domain terms +# sphinx-tippy configuration +tippy_props = { + "placement": "auto-start", + "maxWidth": 500, + "interactive": True, # Allow clicking links inside tooltips + "theme": "material", } -hoverxref_tooltip_maxwidth = 600 -hoverxref_tooltip_theme = ["tooltipster-shadow", "tooltipster-shadow-custom"] -hoverxref_sphinxtabs = True # Enable sphinx-tabs integration + +# Skip all URLs except glossary term links (glossary.html#term-*) +tippy_skip_urls = (r"^(?!.*glossary\.html#term-).*$",) +tippy_enable_mathjax = True intersphinx_mapping = { "torch": ("https://docs.pytorch.org/docs/stable/", None), diff --git a/glossary.md b/glossary.md new file mode 100644 index 00000000000..ff9ad85ce80 --- /dev/null +++ b/glossary.md @@ -0,0 +1,168 @@ +(glossary)= +# PyTorch Glossary + +This glossary provides definitions for terms commonly used in PyTorch documentation. + +```{glossary} +ATen + Short for "A Tensor Library". The foundational tensor and mathematical + operation library on which all else is built. + +attention mechanism + A technique used in deep learning models, particularly transformer architectures, + to selectively focus on certain input elements or tokens when computing output + representations, improving performance and interpretability. + +backward pass + The backward pass is part of the backpropagation algorithm where the error + gradients are computed and propagated backwards through the network, adjusting + the weights and biases to minimize the loss. + +backpropagation + An essential algorithm in training neural networks. It calculates the gradient + of the loss function with respect to the model's parameters, allowing the + network to learn from its mistakes and improve over time. + +CNN + Convolutional Neural Network: A type of neural network designed for image and + video processing, using convolutional and pooling layers to extract features. + +Compound Kernel + Opposed to {term}`Device Kernels`, Compound kernels are usually + device-agnostic and belong to {term}`Compound Operations`. 
+ +Compound Operation + A Compound Operation is composed of other operations. Its kernel is usually + device-agnostic. Normally it doesn't have its own derivative functions defined. + Instead, AutoGrad automatically computes its derivative based on operations it + uses. + +Composite Operation + Same as {term}`Compound Operation`. + +Convolutional Neural Network + A type of neural network designed for image and video processing, using + convolutional and pooling layers to extract features. Also known as CNN. + +CUDA + Compute Unified Device Architecture: A parallel computing platform developed + by NVIDIA that allows developers to use GPUs for general-purpose computing, + including machine learning and deep learning applications. + +Custom Operation + An Operation that is defined by users and is usually a {term}`Compound Operation`. + For example, this [tutorial](https://pytorch.org/docs/stable/notes/extending.html) + details how to create Custom Operations. + +Device Kernel + Device-specific kernel of a {term}`Leaf Operation`. + +embedding + A way to represent categorical variables as dense vectors, often used in + natural language processing and recommender systems. + +epoch + An epoch is a unit of measurement in machine learning that represents one + complete pass through the entire training dataset. During each epoch, the + model's weights are updated based on the loss calculated from the predictions + made on the training data. + +forward pass + The forward pass is the process of passing input data through a neural network + to obtain an output prediction. It's the first step in training a model, + followed by the backward pass and optimization. + +GPU + Graphics Processing Unit: A specialized electronic circuit designed to quickly + manipulate and alter memory to accelerate computations. In the context of AI + and machine learning, GPUs are used to accelerate computationally intensive + tasks like training neural networks. + +gradient + In machine learning, the gradient represents the rate of change of the loss + function with respect to the model's parameters. It's used in backpropagation + to update the weights and biases during training. + +Inductor + A PyTorch component that enables just-in-time (JIT) compilation of PyTorch + models, allowing for faster inference times and better performance on CPUs + and GPUs. It is the default backend for torch.compile. + +inference + The process of making predictions or drawing conclusions from a trained AI + model, typically involving the application of the learned relationships to + new, unseen data. + +JIT + Just-In-Time Compilation: A compilation technique where code is compiled into + machine code at runtime, just before it is executed. + +Kernel + Implementation of a PyTorch operation, specifying what should be done when an + operation executes. + +Leaf Operation + An operation that's considered a basic operation, as opposed to a {term}`Compound + Operation`. Leaf Operation always has dispatch functions defined, usually has a + derivative function defined as well. + +loss function + A loss function, also known as a cost function, is a mathematical function + used to evaluate the performance of a machine learning model during training, + providing a measure of how well the model is doing. + +LSTM + Long Short-Term Memory Network: A type of recurrent neural network (RNN) + designed to handle sequential data with long-term dependencies. LSTMs use + memory cells and gates to selectively retain information over time. 
+ +Native Operation + An operation that comes natively with PyTorch ATen, for example ``aten::matmul``. + +Non-Leaf Operation + Same as {term}`Compound Operation`. + +Operation + A unit of work. For example, the work of matrix multiplication is an operation + called ``aten::matmul``. + +optimizer + An algorithm used to update the weights and biases of a neural network during + training to minimize the loss function. Common optimizers include SGD, Adam, + and RMSprop. + +quantization + A technique used to reduce the precision of numerical values in a deep learning + model, often to reduce memory usage, improve performance, and enable deployment + on resource-constrained devices. + +RNN + Recurrent Neural Network: A type of neural network designed for sequential data, + using recurrent connections to capture temporal dependencies. + +Scripting + Using ``torch.jit.script`` on a function to inspect source code and compile it as + {term}`TorchScript` code. + +tensor + Tensors are a specialized data structure that are very similar to arrays and + matrices. In PyTorch, tensors are used to encode the inputs and outputs of a + model, as well as the model's parameters. + +torch.compile + A PyTorch function that compiles PyTorch code into an optimized form, allowing + for faster execution and better performance. It is the main entry point for + PyTorch 2.x optimizations. + +TorchScript + An interface to the TorchScript {term}`JIT` compiler and interpreter. + +Tracing + Using ``torch.jit.trace`` on a function to get an executable that can be optimized + using just-in-time compilation. + +transformer + A type of neural network architecture introduced in the paper "Attention is All + You Need" (Vaswani et al., 2017), which relies entirely on self-attention + mechanisms to process sequential data, such as text or images. +``` From 12e973873557d9d8e8c9ed2bb3f601a7cf0e1a53 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 10:41:15 -0800 Subject: [PATCH 09/13] Update --- glossary.rst | 169 --------------------------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 glossary.rst diff --git a/glossary.rst b/glossary.rst deleted file mode 100644 index 28027b42ccd..00000000000 --- a/glossary.rst +++ /dev/null @@ -1,169 +0,0 @@ -PyTorch Glossary -================ - -This glossary provides definitions for terms commonly used in PyTorch documentation. - -.. glossary:: - :sorted: - - ATen - Short for "A Tensor Library". The foundational tensor and mathematical - operation library on which all else is built. - - attention mechanism - A technique used in deep learning models, particularly transformer architectures, - to selectively focus on certain input elements or tokens when computing output - representations, improving performance and interpretability. - - backward pass - The backward pass is part of the backpropagation algorithm where the error - gradients are computed and propagated backwards through the network, adjusting - the weights and biases to minimize the loss. - - backpropagation - An essential algorithm in training neural networks. It calculates the gradient - of the loss function with respect to the model's parameters, allowing the - network to learn from its mistakes and improve over time. - - CNN - Convolutional Neural Network: A type of neural network designed for image and - video processing, using convolutional and pooling layers to extract features. 
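The torch.compile entry (with Inductor as its default backend) is easier to picture with a small example. A rough sketch, assuming PyTorch 2.x and a made-up pointwise function:

```python
import torch

def fn(x):
    # A made-up pointwise function; torch.compile works on ordinary Python code.
    return torch.sin(x) ** 2 + torch.cos(x) ** 2

compiled_fn = torch.compile(fn)   # Inductor is the default backend

x = torch.randn(1_000)
torch.testing.assert_close(compiled_fn(x), fn(x))  # same results, potentially faster execution
```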
- - Convolutional Neural Network - A type of neural network designed for image and video processing, using - convolutional and pooling layers to extract features. Also known as CNN. - - CUDA - Compute Unified Device Architecture: A parallel computing platform developed - by NVIDIA that allows developers to use GPUs for general-purpose computing, - including machine learning and deep learning applications. - - embedding - A way to represent categorical variables as dense vectors, often used in - natural language processing and recommender systems. - - epoch - An epoch is a unit of measurement in machine learning that represents one - complete pass through the entire training dataset. During each epoch, the - model's weights are updated based on the loss calculated from the predictions - made on the training data. - - forward pass - The forward pass is the process of passing input data through a neural network - to obtain an output prediction. It's the first step in training a model, - followed by the backward pass and optimization. - - GPU - Graphics Processing Unit: A specialized electronic circuit designed to quickly - manipulate and alter memory to accelerate computations. In the context of AI - and machine learning, GPUs are used to accelerate computationally intensive - tasks like training neural networks. - - gradient - In machine learning, the gradient represents the rate of change of the loss - function with respect to the model's parameters. It's used in backpropagation - to update the weights and biases during training. - - Inductor - A PyTorch component that enables just-in-time (JIT) compilation of PyTorch - models, allowing for faster inference times and better performance on CPUs - and GPUs. It is the default backend for torch.compile. - - inference - The process of making predictions or drawing conclusions from a trained AI - model, typically involving the application of the learned relationships to - new, unseen data. - - loss function - A loss function, also known as a cost function, is a mathematical function - used to evaluate the performance of a machine learning model during training, - providing a measure of how well the model is doing. - - LSTM - Long Short-Term Memory Network: A type of recurrent neural network (RNN) - designed to handle sequential data with long-term dependencies. LSTMs use - memory cells and gates to selectively retain information over time. - - optimizer - An algorithm used to update the weights and biases of a neural network during - training to minimize the loss function. Common optimizers include SGD, Adam, - and RMSprop. - - quantization - A technique used to reduce the precision of numerical values in a deep learning - model, often to reduce memory usage, improve performance, and enable deployment - on resource-constrained devices. - - RNN - Recurrent Neural Network: A type of neural network designed for sequential data, - using recurrent connections to capture temporal dependencies. - - tensor - Tensors are a specialized data structure that are very similar to arrays and - matrices. In PyTorch, tensors are used to encode the inputs and outputs of a - model, as well as the model's parameters. - - torch.compile - A PyTorch function that compiles PyTorch code into an optimized form, allowing - for faster execution and better performance. It is the main entry point for - PyTorch 2.x optimizations. 
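To make the optimizer entry concrete, the sketch below adds an optimization step to the forward/backward pattern shown earlier; the model, data, and learning rate are made up for illustration:

```python
import torch

model = torch.nn.Linear(4, 1)                       # made-up model, for illustration only
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

x, target = torch.randn(8, 4), torch.randn(8, 1)

optimizer.zero_grad()                               # clear gradients from the previous step
loss = torch.nn.functional.mse_loss(model(x), target)
loss.backward()                                     # compute gradients
optimizer.step()                                    # update weights and biases to reduce the loss
```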
- - transformer - A type of neural network architecture introduced in the paper "Attention is All - You Need" (Vaswani et al., 2017), which relies entirely on self-attention - mechanisms to process sequential data, such as text or images. - - Operation - A unit of work. For example, the work of matrix multiplication is an operation - called ``aten::matmul``. - - Native Operation - An operation that comes natively with PyTorch ATen, for example ``aten::matmul``. - - Custom Operation - An Operation that is defined by users and is usually a :term:`Compound Operation`. - For example, this `tutorial `_ - details how to create Custom Operations. - - Kernel - Implementation of a PyTorch operation, specifying what should be done when an - operation executes. - - Compound Operation - A Compound Operation is composed of other operations. Its kernel is usually - device-agnostic. Normally it doesn't have its own derivative functions defined. - Instead, AutoGrad automatically computes its derivative based on operations it - uses. - - Composite Operation - Same as :term:`Compound Operation`. - - Non-Leaf Operation - Same as :term:`Compound Operation`. - - Leaf Operation - An operation that's considered a basic operation, as opposed to a :term:`Compound - Operation`. Leaf Operation always has dispatch functions defined, usually has a - derivative function defined as well. - - Device Kernel - Device-specific kernel of a :term:`Leaf Operation`. - - Compound Kernel - Opposed to :term:`Device Kernels`, Compound kernels are usually - device-agnostic and belong to :term:`Compound Operations`. - - JIT - Just-In-Time Compilation: A compilation technique where code is compiled into - machine code at runtime, just before it is executed. - - TorchScript - An interface to the TorchScript :term:`JIT` compiler and interpreter. - - Tracing - Using ``torch.jit.trace`` on a function to get an executable that can be optimized - using just-in-time compilation. - - Scripting - Using ``torch.jit.script`` on a function to inspect source code and compile it as - :term:`TorchScript` code. 
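Since Tracing and Scripting are defined only briefly here, a small side-by-side sketch may help. It uses a made-up function and assumes the `torch.jit` APIs are still available, even though later patches in this series point readers toward torch.compile instead:

```python
import torch

def add_relu(x, y):
    return torch.relu(x + y)

# Scripting: compile from the Python source code.
scripted = torch.jit.script(add_relu)

# Tracing: record the operations executed for example inputs.
example = (torch.randn(3), torch.randn(3))
traced = torch.jit.trace(add_relu, example)

x, y = torch.randn(3), torch.randn(3)
torch.testing.assert_close(scripted(x, y), traced(x, y))
```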
From c7ba14dba322e9bf8f276fed6d09ba25cba0cc06 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 10:42:42 -0800 Subject: [PATCH 10/13] Update --- conf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/conf.py b/conf.py index 128395e4b7c..8409e28dc95 100644 --- a/conf.py +++ b/conf.py @@ -192,7 +192,7 @@ def wrapper(*args, **kwargs): "show_signature": False, "first_notebook_cell": ( "# For tips on running notebooks in Google Colab, see\n" - "# https://docs.pytorch.org/tutorials/beginner/colab" + "# https://docs.pytorch.org/tutorials/beginner/colab\n" "%matplotlib inline" ), "ignore_pattern": r"_torch_export_nightly_tutorial.py", @@ -451,10 +451,6 @@ def handle_jinja_templates(app, docname, source): "https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css", ] -html_js_files = [ - "js/glossary-tooltips.js", -] - def html_page_context(app, pagename, templatename, context, doctree): # Check if the page is in gallery directories From 97c6d3b9bcd2a89d9246043909ec3608e5b34872 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 11:20:17 -0800 Subject: [PATCH 11/13] Update --- conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/conf.py b/conf.py index 8409e28dc95..cb730ca8a5c 100644 --- a/conf.py +++ b/conf.py @@ -154,7 +154,6 @@ def wrapper(*args, **kwargs): # Skip all URLs except glossary term links (glossary.html#term-*) tippy_skip_urls = (r"^(?!.*glossary\.html#term-).*$",) -tippy_enable_mathjax = True intersphinx_mapping = { "torch": ("https://docs.pytorch.org/docs/stable/", None), From b13eb9787f767eb7dad3c2a390b5199d92db8054 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 12:43:54 -0800 Subject: [PATCH 12/13] Update --- glossary.md | 168 ------------------------------------------------- glossary.rst | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 168 deletions(-) delete mode 100644 glossary.md create mode 100644 glossary.rst diff --git a/glossary.md b/glossary.md deleted file mode 100644 index ff9ad85ce80..00000000000 --- a/glossary.md +++ /dev/null @@ -1,168 +0,0 @@ -(glossary)= -# PyTorch Glossary - -This glossary provides definitions for terms commonly used in PyTorch documentation. - -```{glossary} -ATen - Short for "A Tensor Library". The foundational tensor and mathematical - operation library on which all else is built. - -attention mechanism - A technique used in deep learning models, particularly transformer architectures, - to selectively focus on certain input elements or tokens when computing output - representations, improving performance and interpretability. - -backward pass - The backward pass is part of the backpropagation algorithm where the error - gradients are computed and propagated backwards through the network, adjusting - the weights and biases to minimize the loss. - -backpropagation - An essential algorithm in training neural networks. It calculates the gradient - of the loss function with respect to the model's parameters, allowing the - network to learn from its mistakes and improve over time. - -CNN - Convolutional Neural Network: A type of neural network designed for image and - video processing, using convolutional and pooling layers to extract features. - -Compound Kernel - Opposed to {term}`Device Kernels`, Compound kernels are usually - device-agnostic and belong to {term}`Compound Operations`. - -Compound Operation - A Compound Operation is composed of other operations. 
Its kernel is usually - device-agnostic. Normally it doesn't have its own derivative functions defined. - Instead, AutoGrad automatically computes its derivative based on operations it - uses. - -Composite Operation - Same as {term}`Compound Operation`. - -Convolutional Neural Network - A type of neural network designed for image and video processing, using - convolutional and pooling layers to extract features. Also known as CNN. - -CUDA - Compute Unified Device Architecture: A parallel computing platform developed - by NVIDIA that allows developers to use GPUs for general-purpose computing, - including machine learning and deep learning applications. - -Custom Operation - An Operation that is defined by users and is usually a {term}`Compound Operation`. - For example, this [tutorial](https://pytorch.org/docs/stable/notes/extending.html) - details how to create Custom Operations. - -Device Kernel - Device-specific kernel of a {term}`Leaf Operation`. - -embedding - A way to represent categorical variables as dense vectors, often used in - natural language processing and recommender systems. - -epoch - An epoch is a unit of measurement in machine learning that represents one - complete pass through the entire training dataset. During each epoch, the - model's weights are updated based on the loss calculated from the predictions - made on the training data. - -forward pass - The forward pass is the process of passing input data through a neural network - to obtain an output prediction. It's the first step in training a model, - followed by the backward pass and optimization. - -GPU - Graphics Processing Unit: A specialized electronic circuit designed to quickly - manipulate and alter memory to accelerate computations. In the context of AI - and machine learning, GPUs are used to accelerate computationally intensive - tasks like training neural networks. - -gradient - In machine learning, the gradient represents the rate of change of the loss - function with respect to the model's parameters. It's used in backpropagation - to update the weights and biases during training. - -Inductor - A PyTorch component that enables just-in-time (JIT) compilation of PyTorch - models, allowing for faster inference times and better performance on CPUs - and GPUs. It is the default backend for torch.compile. - -inference - The process of making predictions or drawing conclusions from a trained AI - model, typically involving the application of the learned relationships to - new, unseen data. - -JIT - Just-In-Time Compilation: A compilation technique where code is compiled into - machine code at runtime, just before it is executed. - -Kernel - Implementation of a PyTorch operation, specifying what should be done when an - operation executes. - -Leaf Operation - An operation that's considered a basic operation, as opposed to a {term}`Compound - Operation`. Leaf Operation always has dispatch functions defined, usually has a - derivative function defined as well. - -loss function - A loss function, also known as a cost function, is a mathematical function - used to evaluate the performance of a machine learning model during training, - providing a measure of how well the model is doing. - -LSTM - Long Short-Term Memory Network: A type of recurrent neural network (RNN) - designed to handle sequential data with long-term dependencies. LSTMs use - memory cells and gates to selectively retain information over time. - -Native Operation - An operation that comes natively with PyTorch ATen, for example ``aten::matmul``. 
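The Native Operation entry is easiest to see from Python: `torch.matmul` is backed by the native ATen operation ``aten::matmul``, which recent PyTorch versions also expose under the `torch.ops.aten` namespace. A small sketch with made-up tensors:

```python
import torch

a = torch.randn(2, 3)
b = torch.randn(3, 4)

c = torch.matmul(a, b)                 # dispatches to the native op aten::matmul
c_aten = torch.ops.aten.matmul(a, b)   # the same op, called through the aten namespace

torch.testing.assert_close(c, c_aten)
print(c.shape)  # torch.Size([2, 4])
```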
- -Non-Leaf Operation - Same as {term}`Compound Operation`. - -Operation - A unit of work. For example, the work of matrix multiplication is an operation - called ``aten::matmul``. - -optimizer - An algorithm used to update the weights and biases of a neural network during - training to minimize the loss function. Common optimizers include SGD, Adam, - and RMSprop. - -quantization - A technique used to reduce the precision of numerical values in a deep learning - model, often to reduce memory usage, improve performance, and enable deployment - on resource-constrained devices. - -RNN - Recurrent Neural Network: A type of neural network designed for sequential data, - using recurrent connections to capture temporal dependencies. - -Scripting - Using ``torch.jit.script`` on a function to inspect source code and compile it as - {term}`TorchScript` code. - -tensor - Tensors are a specialized data structure that are very similar to arrays and - matrices. In PyTorch, tensors are used to encode the inputs and outputs of a - model, as well as the model's parameters. - -torch.compile - A PyTorch function that compiles PyTorch code into an optimized form, allowing - for faster execution and better performance. It is the main entry point for - PyTorch 2.x optimizations. - -TorchScript - An interface to the TorchScript {term}`JIT` compiler and interpreter. - -Tracing - Using ``torch.jit.trace`` on a function to get an executable that can be optimized - using just-in-time compilation. - -transformer - A type of neural network architecture introduced in the paper "Attention is All - You Need" (Vaswani et al., 2017), which relies entirely on self-attention - mechanisms to process sequential data, such as text or images. -``` diff --git a/glossary.rst b/glossary.rst new file mode 100644 index 00000000000..18da44b0587 --- /dev/null +++ b/glossary.rst @@ -0,0 +1,172 @@ +.. _glossary: + +================ +PyTorch Glossary +================ + +This glossary provides definitions for terms commonly used in PyTorch documentation. + +.. glossary:: + :sorted: + + ATen + Short for "A Tensor Library". The foundational tensor and mathematical + operation library on which all else is built. + + attention mechanism + A technique used in deep learning models, particularly transformer architectures, + to selectively focus on certain input elements or tokens when computing output + representations, improving performance and interpretability. + + backward pass + The backward pass is part of the backpropagation algorithm where the error + gradients are computed and propagated backwards through the network, adjusting + the weights and biases to minimize the loss. + + backpropagation + An essential algorithm in training neural networks. It calculates the gradient + of the loss function with respect to the model's parameters, allowing the + network to learn from its mistakes and improve over time. + + CNN + Convolutional Neural Network: A type of neural network designed for image and + video processing, using convolutional and pooling layers to extract features. + + Compound Kernel + Opposed to :term:`Device Kernel`, Compound kernels are usually + device-agnostic and belong to :term:`Compound Operation`. + + Compound Operation + A Compound Operation is composed of other operations. Its kernel is usually + device-agnostic. Normally it doesn't have its own derivative functions defined. + Instead, AutoGrad automatically computes its derivative based on operations it + uses. 
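The key point in the Compound Operation entry, that autograd derives the gradient from the operations a compound uses rather than from a hand-written derivative, can be sketched with a made-up composition of primitive ops:

```python
import torch

def compound_fn(x):
    # A made-up compound of primitive ops; no derivative is written for it anywhere.
    return 0.5 * x * (1.0 + torch.tanh(x))

x = torch.randn(5, requires_grad=True)
y = compound_fn(x).sum()
y.backward()   # autograd chains the derivatives of mul, add, and tanh automatically
print(x.grad)
```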
+
+   Composite Operation
+      Same as :term:`Compound Operation`.
+
+   Convolutional Neural Network
+      A type of neural network designed for image and video processing, using
+      convolutional and pooling layers to extract features. Also known as CNN.
+
+   CUDA
+      Compute Unified Device Architecture: A parallel computing platform developed
+      by NVIDIA that allows developers to use GPUs for general-purpose computing,
+      including machine learning and deep learning applications.
+
+   Custom Operation
+      An Operation that is defined by users and is usually a :term:`Compound Operation`.
+      For example, this `tutorial <https://pytorch.org/docs/stable/notes/extending.html>`_
+      details how to create Custom Operations.
+
+   Device Kernel
+      Device-specific kernel of a :term:`Leaf Operation`.
+
+   embedding
+      A way to represent categorical variables as dense vectors, often used in
+      natural language processing and recommender systems.
+
+   epoch
+      An epoch is a unit of measurement in machine learning that represents one
+      complete pass through the entire training dataset. During each epoch, the
+      model's weights are updated based on the loss calculated from the predictions
+      made on the training data.
+
+   forward pass
+      The forward pass is the process of passing input data through a neural network
+      to obtain an output prediction. It's the first step in training a model,
+      followed by the backward pass and optimization.
+
+   GPU
+      Graphics Processing Unit: A specialized electronic circuit designed to quickly
+      manipulate and alter memory to accelerate computations. In the context of AI
+      and machine learning, GPUs are used to accelerate computationally intensive
+      tasks like training neural networks.
+
+   gradient
+      In machine learning, the gradient represents the rate of change of the loss
+      function with respect to the model's parameters. It's used in backpropagation
+      to update the weights and biases during training.
+
+   Inductor
+      A PyTorch component that enables just-in-time (JIT) compilation of PyTorch
+      models, allowing for faster inference times and better performance on CPUs
+      and GPUs. It is the default backend for torch.compile.
+
+   inference
+      The process of making predictions or drawing conclusions from a trained AI
+      model, typically involving the application of the learned relationships to
+      new, unseen data.
+
+   JIT
+      Just-In-Time Compilation: A compilation technique where code is compiled into
+      machine code at runtime, just before it is executed.
+
+   Kernel
+      Implementation of a PyTorch operation, specifying what should be done when an
+      operation executes.
+
+   Leaf Operation
+      An operation that's considered a basic operation, as opposed to a :term:`Compound
+      Operation`. A Leaf Operation always has dispatch functions defined and usually
+      has a derivative function defined as well.
+
+   loss function
+      A loss function, also known as a cost function, is a mathematical function
+      used to evaluate the performance of a machine learning model during training,
+      providing a measure of how well the model is doing.
+
+   LSTM
+      Long Short-Term Memory Network: A type of recurrent neural network (RNN)
+      designed to handle sequential data with long-term dependencies. LSTMs use
+      memory cells and gates to selectively retain information over time.
+
+   Native Operation
+      An operation that comes natively with PyTorch ATen, for example ``aten::matmul``.
+
+   Non-Leaf Operation
+      Same as :term:`Compound Operation`.
+
+   Operation
+      A unit of work. For example, the work of matrix multiplication is an operation
+      called ``aten::matmul``.
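The Custom Operation entry links to the extending-PyTorch notes; as a rough, self-contained sketch of the idea (not a substitute for that tutorial), a user-defined operation with an explicit backward might look like this:

```python
import torch

class Square(torch.autograd.Function):
    """A made-up custom operation: y = x ** 2 with a hand-written derivative."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        return 2 * x * grad_output   # dy/dx = 2x, chained with the incoming gradient

x = torch.randn(3, requires_grad=True)
y = Square.apply(x).sum()
y.backward()
torch.testing.assert_close(x.grad, 2 * x.detach())
```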
+ + optimizer + An algorithm used to update the weights and biases of a neural network during + training to minimize the loss function. Common optimizers include SGD, Adam, + and RMSprop. + + quantization + A technique used to reduce the precision of numerical values in a deep learning + model, often to reduce memory usage, improve performance, and enable deployment + on resource-constrained devices. + + RNN + Recurrent Neural Network: A type of neural network designed for sequential data, + using recurrent connections to capture temporal dependencies. + + Scripting + Using ``torch.jit.script`` on a function to inspect source code and compile it as + :term:`TorchScript` code. + + tensor + Tensors are a specialized data structure that are very similar to arrays and + matrices. In PyTorch, tensors are used to encode the inputs and outputs of a + model, as well as the model's parameters. + + torch.compile + A PyTorch function that compiles PyTorch code into an optimized form, allowing + for faster execution and better performance. It is the main entry point for + PyTorch 2.x optimizations. + + TorchScript + An interface to the TorchScript :term:`JIT` compiler and interpreter. + + Tracing + Using ``torch.jit.trace`` on a function to get an executable that can be optimized + using just-in-time compilation. + + transformer + A type of neural network architecture introduced in the paper "Attention is All + You Need" (Vaswani et al., 2017), which relies entirely on self-attention + mechanisms to process sequential data, such as text or images. From ceac4801db6ad4b8550b1e4ba5c866cf265eb4d2 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 18 Dec 2025 14:28:38 -0800 Subject: [PATCH 13/13] Update --- glossary.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/glossary.rst b/glossary.rst index 18da44b0587..7c0bfb823cb 100644 --- a/glossary.rst +++ b/glossary.rst @@ -160,7 +160,7 @@ This glossary provides definitions for terms commonly used in PyTorch documentat PyTorch 2.x optimizations. TorchScript - An interface to the TorchScript :term:`JIT` compiler and interpreter. + Deprecated. Use :term:`torch.compile` instead. Tracing Using ``torch.jit.trace`` on a function to get an executable that can be optimized