From ac7630de5121c94080a633c273d1e38f516afc8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 19 Nov 2025 15:00:29 -0500 Subject: [PATCH 1/5] Query OS for chipset I/O cache coherency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mario Bălănică --- kernel-open/common/inc/nv.h | 2 ++ kernel-open/conftest.sh | 13 +++++++++++++ kernel-open/nvidia/nv-dma.c | 11 +++++++++++ kernel-open/nvidia/nvidia.Kbuild | 1 + src/nvidia/arch/nvalloc/unix/include/nv.h | 2 ++ src/nvidia/arch/nvalloc/unix/src/os.c | 10 ++++++++++ src/nvidia/generated/g_os_nvoc.h | 2 ++ src/nvidia/src/kernel/platform/chipset/chipset.c | 7 ------- .../src/kernel/platform/chipset/chipset_pcie.c | 8 ++++++++ 9 files changed, 49 insertions(+), 7 deletions(-) diff --git a/kernel-open/common/inc/nv.h b/kernel-open/common/inc/nv.h index d431423bb..fef788182 100644 --- a/kernel-open/common/inc/nv.h +++ b/kernel-open/common/inc/nv.h @@ -924,6 +924,8 @@ void NV_API_CALL nv_dma_unmap_mmio (nv_dma_device_t *, NvU64, NvU6 void NV_API_CALL nv_dma_cache_invalidate (nv_dma_device_t *, void *); NvBool NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *); +NvBool NV_API_CALL nv_dev_is_dma_coherent (nv_dma_device_t *); + NvS32 NV_API_CALL nv_start_rc_timer (nv_state_t *); NvS32 NV_API_CALL nv_stop_rc_timer (nv_state_t *); diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index cfa387129..99de2649d 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -2467,6 +2467,19 @@ compile_test() { compile_check_conftest "$CODE" "NV_DMA_IS_DIRECT_PRESENT" "" "functions" ;; + dev_is_dma_coherent) + # + # Determine whether dev_is_dma_coherent() exists. + # + CODE=" + #include + void conftest_dev_is_dma_coherent(void) { + dev_is_dma_coherent(); + }" + + compile_check_conftest "$CODE" "NV_DEV_IS_DMA_COHERENT_PRESENT" "" "functions" + ;; + cmd_uphy_display_port_init) # # Determine if CMD_UPHY_DISPLAY_PORT_INIT enum present in bpmp-abi header diff --git a/kernel-open/nvidia/nv-dma.c b/kernel-open/nvidia/nv-dma.c index 2984af848..6530fb152 100644 --- a/kernel-open/nvidia/nv-dma.c +++ b/kernel-open/nvidia/nv-dma.c @@ -909,6 +909,17 @@ void NV_API_CALL nv_dma_cache_invalidate #endif } +NvBool NV_API_CALL nv_dev_is_dma_coherent +( + nv_dma_device_t *dma_dev +) +{ +#if defined(NV_DEV_IS_DMA_COHERENT_PRESENT) + return dev_is_dma_coherent(dma_dev->dev); +#endif + return true; +} + #if defined(NV_DRM_AVAILABLE) static inline void diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild index 43416d252..1c08a59b2 100644 --- a/kernel-open/nvidia/nvidia.Kbuild +++ b/kernel-open/nvidia/nvidia.Kbuild @@ -125,6 +125,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += phys_to_dma NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_rebar_get_possible_sizes NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_backlight_device_by_name NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_direct_map_resource +NV_CONFTEST_FUNCTION_COMPILE_TESTS += dev_is_dma_coherent NV_CONFTEST_FUNCTION_COMPILE_TESTS += flush_cache_all NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn NV_CONFTEST_FUNCTION_COMPILE_TESTS += jiffies_to_timespec diff --git a/src/nvidia/arch/nvalloc/unix/include/nv.h b/src/nvidia/arch/nvalloc/unix/include/nv.h index 519c260cf..4ca3f0ccf 100644 --- a/src/nvidia/arch/nvalloc/unix/include/nv.h +++ b/src/nvidia/arch/nvalloc/unix/include/nv.h @@ -924,6 +924,8 @@ void NV_API_CALL nv_dma_unmap_mmio (nv_dma_device_t *, NvU64, NvU6 void NV_API_CALL 
nv_dma_cache_invalidate (nv_dma_device_t *, void *); NvBool NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *); +NvBool NV_API_CALL nv_dev_is_dma_coherent (nv_dma_device_t *); + NvS32 NV_API_CALL nv_start_rc_timer (nv_state_t *); NvS32 NV_API_CALL nv_stop_rc_timer (nv_state_t *); diff --git a/src/nvidia/arch/nvalloc/unix/src/os.c b/src/nvidia/arch/nvalloc/unix/src/os.c index 2c1a89c22..1dd6e6c15 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os.c +++ b/src/nvidia/arch/nvalloc/unix/src/os.c @@ -1706,6 +1706,16 @@ void osFlushGpuCoherentCpuCacheRange nv_flush_coherent_cpu_cache_range(pOsGpuInfo, cpuVirtual, size); } +NvBool osDevIsDmaCoherent +( + OBJGPU *pGpu +) +{ + nv_state_t *nv = NV_GET_NV_STATE(pGpu); + + return nv_dev_is_dma_coherent(nv->dma_dev); +} + void osErrorLogV(OBJGPU *pGpu, XidContext context, const char * pFormat, va_list arglist) { NV_STATUS rmStatus; diff --git a/src/nvidia/generated/g_os_nvoc.h b/src/nvidia/generated/g_os_nvoc.h index 08c3d1a3a..58e4631c2 100644 --- a/src/nvidia/generated/g_os_nvoc.h +++ b/src/nvidia/generated/g_os_nvoc.h @@ -799,6 +799,8 @@ void osFlushGpuCoherentCpuCacheRange(OS_GPU_INFO *pOsGpuInfo, NvU64 size); NvBool osUidTokensEqual(PUID_TOKEN arg1, PUID_TOKEN arg2); +NvBool osDevIsDmaCoherent(OBJGPU *pGpu); + NV_STATUS osValidateClientTokens(PSECURITY_TOKEN arg1, PSECURITY_TOKEN arg2); PUID_TOKEN osGetCurrentUidToken(void); diff --git a/src/nvidia/src/kernel/platform/chipset/chipset.c b/src/nvidia/src/kernel/platform/chipset/chipset.c index c76ef1028..ad48145b1 100644 --- a/src/nvidia/src/kernel/platform/chipset/chipset.c +++ b/src/nvidia/src/kernel/platform/chipset/chipset.c @@ -50,13 +50,6 @@ clConstruct_IMPL(OBJCL *pCl) pCl->pPcieConfigSpaceBase = NULL; - // - // We set this property by default. - // Chipset setup function can override this. - // Right now only Tegra chipsets overide this setting. - // - pCl->setProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT, NV_TRUE); - return NV_OK; } diff --git a/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c b/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c index d834ab424..d057d8132 100644 --- a/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c +++ b/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c @@ -971,6 +971,14 @@ clUpdatePcieConfig_IMPL(OBJGPU *pGpu, OBJCL *pCl) objClBuildPcieAtomicsAllowList(pGpu, pCl); + // + // Check if the GPU device is on a cache-coherent bus. + // + if (osDevIsDmaCoherent(pGpu)) + { + pCl->setProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT, NV_TRUE); + } + objClInitPcieChipset(pGpu, pCl); // From 95bc81d278b66e6b7bbed6b7d8eca8feb9cdce4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 19 Nov 2025 15:00:44 -0500 Subject: [PATCH 2/5] Disable WC iomaps by default for unknown Arm chipsets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arm platforms have historically had issues (corruption, bus errors) with non-Device MMIO mappings. Unlike DMA coherency, there's no way to check for this at runtime. Therefore, in the absence of better chipset info, disable WC iomaps by default. 
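As an illustration only (hypothetical helper — the real iomap call sites live in the
OS layer, and only the PDB_PROP_CL_DISABLE_IOMAP_WC check comes from this series),
a mapping path would consult the new property like so:

    /*
     * Sketch: pick the CPU mapping type for a BAR region. With
     * PDB_PROP_CL_DISABLE_IOMAP_WC set (unknown Arm chipset), fall
     * back to an uncached Device mapping instead of write-combined.
     */
    static NvU32 pickBarMappingType(OBJCL *pCl)
    {
        if ((pCl != NULL) &&
            pCl->getProperty(pCl, PDB_PROP_CL_DISABLE_IOMAP_WC))
        {
            return NV_MEMORY_UNCACHED;     /* Device-nGnRE on Arm */
        }
        return NV_MEMORY_WRITECOMBINED;    /* Normal-NC on Arm */
    }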
Signed-off-by: Mario Bălănică --- src/nvidia/arch/nvalloc/common/inc/nvcst.h | 8 +++---- .../kernel/platform/chipset/chipset_info.c | 21 +++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/nvidia/arch/nvalloc/common/inc/nvcst.h b/src/nvidia/arch/nvalloc/common/inc/nvcst.h index 33bb8d495..feb0f7269 100644 --- a/src/nvidia/arch/nvalloc/common/inc/nvcst.h +++ b/src/nvidia/arch/nvalloc/common/inc/nvcst.h @@ -97,7 +97,7 @@ CHIPSET_SETUP_FUNC(PLDA_XpressRichAXI_setupFunc) CHIPSET_SETUP_FUNC(Riscv_generic_setupFunc) CHIPSET_SETUP_FUNC(Intel_A70D_setupFunc) CHIPSET_SETUP_FUNC(AMD_14D8_setupFunc) - +CHIPSET_SETUP_FUNC(Generic_setupFunc) // Keep string length <=32 (including termination) to avoid string copy overflow CSINFO chipsetInfo[] = @@ -276,8 +276,8 @@ CSINFO chipsetInfo[] = {PCI_VENDOR_ID_AMPERE, 0xE110, CS_AMPERE_ALTRA, "Ampere Altra", Ampere_Altra_setupFunc}, {PCI_VENDOR_ID_ARM, 0x0100, CS_ARM_NEOVERSEN1, "Arm Neoverse N1", Arm_NeoverseN1_setupFunc}, {PCI_VENDOR_ID_HYGON, 0x790E, CS_HYGON_C86, "Hygon-C86-7151", NULL}, - {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN96XX, "Marvell Octeon CN96xx", ARMV8_generic_setupFunc}, - {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN98XX, "Marvell Octeon CN98xx", ARMV8_generic_setupFunc}, + {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN96XX, "Marvell Octeon CN96xx", NULL}, + {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN98XX, "Marvell Octeon CN98xx", NULL}, {PCI_VENDOR_ID_SIFIVE, 0x0000, CS_SIFIVE_FU740_C000, "SiFive FU740-000", Riscv_generic_setupFunc}, {PCI_VENDOR_ID_PLDA, 0x1111, CS_PLDA_XPRESSRICH_AXI_REF, "XpressRich-AXI Ref Design", PLDA_XpressRichAXI_setupFunc}, {PCI_VENDOR_ID_AMPERE, 0xE200, CS_AMPERE_AMPEREONE160, "Ampere AmpereOne-160", Ampere_AmpereOne_setupFunc}, @@ -302,7 +302,7 @@ CSINFO chipsetInfo[] = /////////////////////////////////////////////////////////////////////////////////////////////////// // last element must have chipset CS_UNKNOWN (zero) - {0, 0, CS_UNKNOWN, "Unknown", NULL} + {0, 0, CS_UNKNOWN, "Unknown", Generic_setupFunc} }; diff --git a/src/nvidia/src/kernel/platform/chipset/chipset_info.c b/src/nvidia/src/kernel/platform/chipset/chipset_info.c index 9e546e62e..10c82088d 100644 --- a/src/nvidia/src/kernel/platform/chipset/chipset_info.c +++ b/src/nvidia/src/kernel/platform/chipset/chipset_info.c @@ -1179,6 +1179,14 @@ ARMV8_generic_setupFunc OBJCL *pCl ) { + // + // Arm platforms have historically had issues (corruption, bus errors) with + // non-Device MMIO mappings. Unlike DMA coherency, there's no way to check + // for this at runtime. Therefore, in the absence of better chipset info, + // disable WC iomaps by default. + // + pCl->setProperty(pCl, PDB_PROP_CL_DISABLE_IOMAP_WC, NV_TRUE); + return NV_OK; } @@ -1351,6 +1359,19 @@ Ampere_AmpereOne_setupFunc return NV_OK; } +// Generic setup function +static NV_STATUS +Generic_setupFunc +( + OBJCL *pCl +) +{ +#if NVCPU_IS_FAMILY_ARM + return ARMV8_generic_setupFunc(pCl); +#endif + return NV_OK; +} + void csGetInfoStrings ( From 9861907db5ee333e74fa31153d7269ca7028f2c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 19 Nov 2025 15:00:55 -0500 Subject: [PATCH 3/5] Never skip cache flushing on dma_map_*() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not set the `DMA_ATTR_SKIP_CPU_SYNC` flag on dma_map_*() calls even for memory marked as "uncached". 
On Arm, we always allocate cacheable pages and then use aliased (vmap) uncached mappings when necessary. Without explicit flushing right after allocation, previous stale data in these backing pages could be evicted at any point and end up clobbering memory that was already written through the aliased mapping. Note that no flushing will be performed on cache-coherent hardware. This is not an issue in the unmap path since no further writes are made to the cached mappings. Signed-off-by: Mario Bălănică --- kernel-open/nvidia/nv-dma.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/kernel-open/nvidia/nv-dma.c b/kernel-open/nvidia/nv-dma.c index 6530fb152..00545e416 100644 --- a/kernel-open/nvidia/nv-dma.c +++ b/kernel-open/nvidia/nv-dma.c @@ -61,11 +61,18 @@ static NV_STATUS nv_dma_map_contig( NvU64 *va ) { - *va = dma_map_page_attrs(dma_map->dev, dma_map->pages[0], 0, + /* + * Do not set DMA_ATTR_SKIP_CPU_SYNC here even if memory is "uncached". + * On Arm, we always allocate cacheable pages and then use aliased (vmap) + * uncached mappings when necessary. Without explicit flushing right after + * allocation, previous stale data in these backing pages could be evicted + * at any point and end up clobbering memory that was already written + * through the aliased mapping. Note that no flushing will be performed on + * cache-coherent hardware. + */ + *va = dma_map_page(dma_map->dev, dma_map->pages[0], 0, dma_map->page_count * PAGE_SIZE, - DMA_BIDIRECTIONAL, - (dma_map->cache_type == NV_MEMORY_UNCACHED) ? - DMA_ATTR_SKIP_CPU_SYNC : 0); + DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_map->dev, *va)) { return NV_ERR_OPERATING_SYSTEM; @@ -93,7 +100,7 @@ static void nv_dma_unmap_contig(nv_dma_map_t *dma_map) dma_unmap_page_attrs(dma_map->dev, dma_map->mapping.contig.dma_addr, dma_map->page_count * PAGE_SIZE, DMA_BIDIRECTIONAL, - (dma_map->cache_type == NV_MEMORY_UNCACHED) ? + (dma_map->cache_type != NV_MEMORY_CACHED) ? DMA_ATTR_SKIP_CPU_SYNC : 0); } @@ -214,6 +221,7 @@ NV_STATUS nv_map_dma_map_scatterlist(nv_dma_map_t *dma_map) nv_dma_submap_t *submap; NvU64 i; + /* See the comment in nv_dma_map_contig() */ NV_FOR_EACH_DMA_SUBMAP(dma_map, submap, i) { /* Imported SGTs will have already been mapped by the exporter. */ @@ -256,9 +264,11 @@ void nv_unmap_dma_map_scatterlist(nv_dma_map_t *dma_map) continue; } - dma_unmap_sg(dma_map->dev, submap->sgt.sgl, + dma_unmap_sg_attrs(dma_map->dev, submap->sgt.sgl, submap->sgt.orig_nents, - DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL, + (dma_map->cache_type != NV_MEMORY_CACHED) ? + DMA_ATTR_SKIP_CPU_SYNC : 0); } } From 492cb5275fb49e8c7b9c044f0e313808a68c7655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 19 Nov 2025 15:01:06 -0500 Subject: [PATCH 4/5] Rework DMA cache maintenance helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support both CPU-side flushing (to device) and invalidation (from device) for cached memory descriptors. The previous logic was entirely broken: - `dma_sync_*_for_device()` in `nv_dma_cache_invalidate()` actually performed flushing (cleaning) rather than invalidation, since the direction argument is ignored on ARM64. The correct API variant for invalidation is `dma_sync_*_for_cpu()`. - `flush_cache_all()` was removed a long time ago from the ARM64 kernel because there's no reliable way to flush all cache lines on this arch. 
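For reference, the pairing this patch adopts is the standard Linux DMA API
usage (`dev`, `addr` and `size` are placeholders here):

    /* CPU wrote the buffer: clean (flush) its cache lines before the
     * device reads them. */
    dma_sync_single_for_device(dev, addr, size, DMA_TO_DEVICE);

    /* Device wrote the buffer: invalidate stale cache lines before the
     * CPU reads them. */
    dma_sync_single_for_cpu(dev, addr, size, DMA_FROM_DEVICE);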
This notably fixes `cliresCtrlCmdOsUnixFlushUserCache_IMPL()` and will also be needed in other places where cached memory is used. However, paths calling `memdescMapInternal/memdescUnmapInternal()` in streaming DMA fashion should be fine as these functions now properly handle synchronization. Signed-off-by: Mario Bălănică --- kernel-open/common/inc/nv.h | 6 +- kernel-open/common/inc/os-interface.h | 2 - kernel-open/conftest.sh | 16 --- kernel-open/nvidia/nv-dma.c | 37 ++++-- kernel-open/nvidia/nvidia.Kbuild | 1 - kernel-open/nvidia/os-interface.c | 51 --------- src/nvidia/arch/nvalloc/unix/include/nv.h | 6 +- .../arch/nvalloc/unix/include/os-interface.h | 2 - src/nvidia/arch/nvalloc/unix/src/os.c | 108 +++++++++--------- src/nvidia/generated/g_mem_desc_nvoc.h | 1 - src/nvidia/generated/g_os_nvoc.h | 9 +- .../gpu/bus/arch/maxwell/kern_bus_gm107.c | 4 +- .../gpu/bus/arch/turing/kern_bus_tu102.c | 2 +- src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c | 28 +---- .../kernel/gpu/spdm/arch/hopper/spdm_gh100.c | 2 +- 15 files changed, 105 insertions(+), 170 deletions(-) diff --git a/kernel-open/common/inc/nv.h b/kernel-open/common/inc/nv.h index fef788182..c2b7aa943 100644 --- a/kernel-open/common/inc/nv.h +++ b/kernel-open/common/inc/nv.h @@ -864,6 +864,9 @@ static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length) #define NV_ALIGN_DOWN(v,g) ((v) & ~((g) - 1)) #endif +#define NV_OS_DMA_SYNC_TO_DEVICE NVBIT(0) // CPU flush +#define NV_OS_DMA_SYNC_FROM_DEVICE NVBIT(1) // CPU invalidate +#define NV_OS_DMA_SYNC_TO_FROM_DEVICE (NV_OS_DMA_SYNC_TO_DEVICE | NV_OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate /* * driver internal interfaces @@ -921,7 +924,8 @@ void NV_API_CALL nv_dma_unmap_peer (nv_dma_device_t *, NvU64, NvU6 NV_STATUS NV_API_CALL nv_dma_map_mmio (nv_dma_device_t *, NvU64, NvU64 *); void NV_API_CALL nv_dma_unmap_mmio (nv_dma_device_t *, NvU64, NvU64); -void NV_API_CALL nv_dma_cache_invalidate (nv_dma_device_t *, void *); +void NV_API_CALL nv_dma_sync (nv_dma_device_t *, void *, NvU32); + NvBool NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *); NvBool NV_API_CALL nv_dev_is_dma_coherent (nv_dma_device_t *); diff --git a/kernel-open/common/inc/os-interface.h b/kernel-open/common/inc/os-interface.h index 523368eaa..03a6cb89b 100644 --- a/kernel-open/common/inc/os-interface.h +++ b/kernel-open/common/inc/os-interface.h @@ -109,8 +109,6 @@ void NV_API_CALL os_unmap_kernel_space (void *, NvU64); void* NV_API_CALL os_map_user_space (MemoryArea *, NvU32, NvU32, void **); void NV_API_CALL os_unmap_user_space (void *, NvU64, void *); #endif -NV_STATUS NV_API_CALL os_flush_cpu_cache_all (void); -NV_STATUS NV_API_CALL os_flush_user_cache (void); void NV_API_CALL os_flush_cpu_write_combine_buffer(void); NvU8 NV_API_CALL os_io_read_byte (NvU32); NvU16 NV_API_CALL os_io_read_word (NvU32); diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index 99de2649d..c5c5f27d5 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -625,22 +625,6 @@ compile_test() { compile_check_conftest "$CODE" "NV_SET_PAGES_ARRAY_UC_PRESENT" "" "functions" ;; - flush_cache_all) - # - # Determine if flush_cache_all() function is present - # - # flush_cache_all() was removed by commit id - # 68234df4ea79 ("arm64: kill flush_cache_all()") in 4.2 (2015-04-20) - # for aarch64 - # - CODE=" - #include - int conftest_flush_cache_all(void) { - return flush_cache_all(); - }" - compile_check_conftest "$CODE" "NV_FLUSH_CACHE_ALL_PRESENT" "" "functions" - ;; - 
ioremap_cache) # # Determine if the ioremap_cache() function is present. diff --git a/kernel-open/nvidia/nv-dma.c b/kernel-open/nvidia/nv-dma.c index 00545e416..879cf4a36 100644 --- a/kernel-open/nvidia/nv-dma.c +++ b/kernel-open/nvidia/nv-dma.c @@ -880,17 +880,18 @@ void NV_API_CALL nv_dma_unmap_mmio } /* - * Invalidate DMA mapping in CPU caches by "syncing" to the device. + * Flush/invalidate DMA mapping in CPU caches by "syncing" to the device. * * This is only implemented for ARM platforms, since other supported * platforms are cache coherent and have not required this (we * explicitly haven't supported SWIOTLB bounce buffering either where * this would be needed). */ -void NV_API_CALL nv_dma_cache_invalidate +void NV_API_CALL nv_dma_sync ( nv_dma_device_t *dma_dev, - void *priv + void *priv, + NvU32 dir ) { #if defined(NVCPU_AARCH64) @@ -898,10 +899,17 @@ void NV_API_CALL nv_dma_cache_invalidate if (dma_map->contiguous) { - dma_sync_single_for_device(dma_dev->dev, - dma_map->mapping.contig.dma_addr, - (size_t) PAGE_SIZE * dma_map->page_count, - DMA_FROM_DEVICE); + if (dir & NV_OS_DMA_SYNC_TO_DEVICE) + dma_sync_single_for_device(dma_dev->dev, + dma_map->mapping.contig.dma_addr, + (size_t)PAGE_SIZE * dma_map->page_count, + DMA_TO_DEVICE); + + if (dir & NV_OS_DMA_SYNC_FROM_DEVICE) + dma_sync_single_for_cpu(dma_dev->dev, + dma_map->mapping.contig.dma_addr, + (size_t)PAGE_SIZE * dma_map->page_count, + DMA_FROM_DEVICE); } else { @@ -910,10 +918,17 @@ void NV_API_CALL nv_dma_cache_invalidate NV_FOR_EACH_DMA_SUBMAP(dma_map, submap, i) { - dma_sync_sg_for_device(dma_dev->dev, - submap->sgt.sgl, - submap->sgt.orig_nents, - DMA_FROM_DEVICE); + if (dir & NV_OS_DMA_SYNC_TO_DEVICE) + dma_sync_sg_for_device(dma_dev->dev, + submap->sgt.sgl, + submap->sgt.orig_nents, + DMA_TO_DEVICE); + + if (dir & NV_OS_DMA_SYNC_FROM_DEVICE) + dma_sync_sg_for_cpu(dma_dev->dev, + submap->sgt.sgl, + submap->sgt.orig_nents, + DMA_FROM_DEVICE); } } #endif diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild index 1c08a59b2..fd0e40dc8 100644 --- a/kernel-open/nvidia/nvidia.Kbuild +++ b/kernel-open/nvidia/nvidia.Kbuild @@ -126,7 +126,6 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_rebar_get_possible_sizes NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_backlight_device_by_name NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_direct_map_resource NV_CONFTEST_FUNCTION_COMPILE_TESTS += dev_is_dma_coherent -NV_CONFTEST_FUNCTION_COMPILE_TESTS += flush_cache_all NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn NV_CONFTEST_FUNCTION_COMPILE_TESTS += jiffies_to_timespec NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64 diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c index a03a3b88c..1d4a29cd0 100644 --- a/kernel-open/nvidia/os-interface.c +++ b/kernel-open/nvidia/os-interface.c @@ -1034,57 +1034,6 @@ void NV_API_CALL os_unmap_kernel_space( nv_iounmap(addr, size_bytes); } -#if NVCPU_IS_AARCH64 - -static inline void nv_flush_cache_cpu(void *info) -{ - if (!nvos_is_chipset_io_coherent()) - { -#if defined(NV_FLUSH_CACHE_ALL_PRESENT) - flush_cache_all(); -#else - WARN_ONCE(0, "kernel does not provide flush_cache_all()\n"); -#endif - } -} - -// flush the cache of all cpus -NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void) -{ - on_each_cpu(nv_flush_cache_cpu, NULL, 1); - return NV_OK; -} - -NV_STATUS NV_API_CALL os_flush_user_cache(void) -{ - if (!NV_MAY_SLEEP()) - { - return NV_ERR_NOT_SUPPORTED; - } - - // - // The Linux kernel does not export an interface for flushing a range, 
- // although it is possible. For now, just flush the entire cache to be - // safe. - // - on_each_cpu(nv_flush_cache_cpu, NULL, 1); - return NV_OK; -} - -#else // NVCPU_IS_AARCH64 - -NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void) -{ - return NV_ERR_NOT_SUPPORTED; -} - -NV_STATUS NV_API_CALL os_flush_user_cache(void) -{ - return NV_ERR_NOT_SUPPORTED; -} - -#endif - void NV_API_CALL os_flush_cpu_write_combine_buffer(void) { #if defined(NVCPU_X86_64) diff --git a/src/nvidia/arch/nvalloc/unix/include/nv.h b/src/nvidia/arch/nvalloc/unix/include/nv.h index 4ca3f0ccf..493c5afb1 100644 --- a/src/nvidia/arch/nvalloc/unix/include/nv.h +++ b/src/nvidia/arch/nvalloc/unix/include/nv.h @@ -864,6 +864,9 @@ static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length) #define NV_ALIGN_DOWN(v,g) ((v) & ~((g) - 1)) #endif +#define NV_OS_DMA_SYNC_TO_DEVICE NVBIT(0) // CPU flush +#define NV_OS_DMA_SYNC_FROM_DEVICE NVBIT(1) // CPU invalidate +#define NV_OS_DMA_SYNC_TO_FROM_DEVICE (NV_OS_DMA_SYNC_TO_DEVICE | NV_OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate /* * driver internal interfaces @@ -921,7 +924,8 @@ void NV_API_CALL nv_dma_unmap_peer (nv_dma_device_t *, NvU64, NvU6 NV_STATUS NV_API_CALL nv_dma_map_mmio (nv_dma_device_t *, NvU64, NvU64 *); void NV_API_CALL nv_dma_unmap_mmio (nv_dma_device_t *, NvU64, NvU64); -void NV_API_CALL nv_dma_cache_invalidate (nv_dma_device_t *, void *); +void NV_API_CALL nv_dma_sync (nv_dma_device_t *, void *, NvU32); + NvBool NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *); NvBool NV_API_CALL nv_dev_is_dma_coherent (nv_dma_device_t *); diff --git a/src/nvidia/arch/nvalloc/unix/include/os-interface.h b/src/nvidia/arch/nvalloc/unix/include/os-interface.h index 6eb955964..7fdb7be70 100644 --- a/src/nvidia/arch/nvalloc/unix/include/os-interface.h +++ b/src/nvidia/arch/nvalloc/unix/include/os-interface.h @@ -105,8 +105,6 @@ NvBool NV_API_CALL os_pci_remove_supported (void); void NV_API_CALL os_pci_remove (void *); void* NV_API_CALL os_map_kernel_space (NvU64, NvU64, NvU32); void NV_API_CALL os_unmap_kernel_space (void *, NvU64); -NV_STATUS NV_API_CALL os_flush_cpu_cache_all (void); -NV_STATUS NV_API_CALL os_flush_user_cache (void); void NV_API_CALL os_flush_cpu_write_combine_buffer(void); NvU8 NV_API_CALL os_io_read_byte (NvU32); NvU16 NV_API_CALL os_io_read_word (NvU32); diff --git a/src/nvidia/arch/nvalloc/unix/src/os.c b/src/nvidia/arch/nvalloc/unix/src/os.c index 1dd6e6c15..b3a4edab6 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os.c +++ b/src/nvidia/arch/nvalloc/unix/src/os.c @@ -1665,9 +1665,53 @@ NV_STATUS osUserHandleToKernelPtr(NvHandle hClient, NvP64 hEvent, NvP64 *pEvent) return result; } -NV_STATUS osFlushCpuCache(void) +ct_assert(OS_DMA_SYNC_TO_DEVICE == NV_OS_DMA_SYNC_TO_DEVICE); +ct_assert(OS_DMA_SYNC_FROM_DEVICE == NV_OS_DMA_SYNC_FROM_DEVICE); +ct_assert(OS_DMA_SYNC_TO_FROM_DEVICE == NV_OS_DMA_SYNC_TO_FROM_DEVICE); + +NV_STATUS osDmaSyncMem +( + MEMORY_DESCRIPTOR *pMemDesc, + NvU32 dir +) { - return os_flush_cpu_cache_all(); + OBJGPU *pGpu = pMemDesc->pGpu; + KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu); + + if ((pKernelBif == NULL) || + kbifIsSnoopDmaCapable(pGpu, pKernelBif) || + (memdescGetCpuCacheAttrib(pMemDesc) != NV_MEMORY_CACHED)) + { + return NV_OK; + } + + nv_state_t *nv = NV_GET_NV_STATE(pGpu); + if (nv->iovaspace_id == NV_IOVA_DOMAIN_NONE) + { + return NV_ERR_INVALID_ARGUMENT; + } + + PIOVAMAPPING pIovaMapping = memdescGetIommuMap(pMemDesc, nv->iovaspace_id); + // + // This should only be called 
for devices that map memory descriptors + // through the nv-dma library, where the memory descriptor data + // contains all the kernel-specific context we need for the + // cache maintenance. + // + // (These checks match those in osIovaUnmap() leading up to + // nv_dma_unmap_alloc()). + // + if (pIovaMapping == NULL || + pIovaMapping->pOsData == NULL || + memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_GUEST_ALLOCATED) || + memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_PEER_IO_MEM)) + { + return NV_ERR_INVALID_ARGUMENT; + } + + nv_dma_sync(nv->dma_dev, pIovaMapping->pOsData, dir); + + return NV_OK; } void osFlushCpuWriteCombineBuffer(void) @@ -2073,7 +2117,7 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL Memory *pMemory; MEMORY_DESCRIPTOR *pMemDesc; NvU64 start, end; - NvBool bInvalidateOnly; + NvU32 syncDir; NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, memGetByHandle(RES_GET_CLIENT(pRmCliRes), @@ -2101,13 +2145,16 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL switch(pAddressSpaceParams->cacheOps) { - case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH_INVALIDATE: case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH: - bInvalidateOnly = NV_FALSE; + syncDir = OS_DMA_SYNC_TO_DEVICE; break; case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_INVALIDATE: - bInvalidateOnly = NV_TRUE; + syncDir = OS_DMA_SYNC_FROM_DEVICE; + break; + + case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH_INVALIDATE: + syncDir = OS_DMA_SYNC_TO_FROM_DEVICE; break; default: @@ -2123,54 +2170,7 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL return NV_ERR_INVALID_LIMIT; } - if (bInvalidateOnly) - { - // - // XXX: this seems fishy - I'm not sure if invalidating by the kernel - // VA only as nv_dma_cache_invalidate() does here is sufficient for - // this control call. - // pAddressSpaceParams->internalOnly is expected to be the RM client - // VA for this control call; if we wanted to invalidate the user VA we - // could do so using that. - // - // For I/O coherent platforms this won't actually do anything. - // On non-I/O-coherent platforms, there's no need to do a second - // invalidation after the full flush. - // - nv_state_t *nv = NV_GET_NV_STATE(pMemDesc->pGpu); - if (nv->iovaspace_id != NV_IOVA_DOMAIN_NONE) - { - PIOVAMAPPING pIovaMapping = memdescGetIommuMap(pMemDesc, nv->iovaspace_id); - // - // This should only be called for devices that map memory descriptors - // through the nv-dma library, where the memory descriptor data - // contains all the kernel-specific context we need for the - // invalidation. - // - // (These checks match those in osIovaUnmap() leading up to - // nv_dma_unmap_alloc()). 
- // - if (pIovaMapping == NULL || - pIovaMapping->pOsData == NULL || - memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_GUEST_ALLOCATED) || - memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_PEER_IO_MEM)) - { - return NV_ERR_INVALID_ARGUMENT; - } - - nv_dma_cache_invalidate(nv->dma_dev, pIovaMapping->pOsData); - } - else - { - return NV_ERR_INVALID_ARGUMENT; - } - } - else - { - return os_flush_user_cache(); - } - - return NV_OK; + return osDmaSyncMem(pMemDesc, syncDir); } static NV_STATUS diff --git a/src/nvidia/generated/g_mem_desc_nvoc.h b/src/nvidia/generated/g_mem_desc_nvoc.h index 0eb04d518..cf67fa3c6 100644 --- a/src/nvidia/generated/g_mem_desc_nvoc.h +++ b/src/nvidia/generated/g_mem_desc_nvoc.h @@ -1511,7 +1511,6 @@ NV_STATUS memdescSendMemDescToGSP(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc, NvH // cache maintenance functions void memdescFlushGpuCaches(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc); -void memdescFlushCpuCaches(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc); // Map memory descriptor for RM internal access void* memdescMapInternal(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc, NvU32 flags); diff --git a/src/nvidia/generated/g_os_nvoc.h b/src/nvidia/generated/g_os_nvoc.h index 58e4631c2..f94dc0f3d 100644 --- a/src/nvidia/generated/g_os_nvoc.h +++ b/src/nvidia/generated/g_os_nvoc.h @@ -649,12 +649,10 @@ NV_STATUS __nvoc_objCreate_OBJOS(OBJOS**, Dynamic*, NvU32); NV_STATUS addProbe(OBJGPU *, NvU32); -typedef NV_STATUS OSFlushCpuCache(void); typedef void OSAddRecordForCrashLog(void *, NvU32); typedef void OSDeleteRecordForCrashLog(void *); -OSFlushCpuCache osFlushCpuCache; OSAddRecordForCrashLog osAddRecordForCrashLog; OSDeleteRecordForCrashLog osDeleteRecordForCrashLog; @@ -794,6 +792,13 @@ NV_STATUS rm_is_vgpu_supported_device(OS_GPU_INFO *pNv, NvU32 pmc_boot_1, NV_STATUS osLockPageableDataSection(RM_PAGEABLE_SECTION *pSection); NV_STATUS osUnlockPageableDataSection(RM_PAGEABLE_SECTION *pSection); +#define OS_DMA_SYNC_TO_DEVICE NVBIT(0) // CPU flush +#define OS_DMA_SYNC_FROM_DEVICE NVBIT(1) // CPU invalidate +#define OS_DMA_SYNC_TO_FROM_DEVICE (OS_DMA_SYNC_TO_DEVICE | OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate + +NV_STATUS osDmaSyncMem(MEMORY_DESCRIPTOR *pMemDesc, + NvU32 dir); + void osFlushGpuCoherentCpuCacheRange(OS_GPU_INFO *pOsGpuInfo, NvU64 cpuVirtual, NvU64 size); diff --git a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c index d1e49249e..7b2def3bb 100644 --- a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c +++ b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c @@ -2954,7 +2954,7 @@ _kbusInternalBar1Unmap mapRemove(&pVaInfo->reverseMap, ppVaToType); // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH - memdescFlushCpuCaches(pGpu, pType->pMemDesc); + osDmaSyncMem(pType->pMemDesc, OS_DMA_SYNC_TO_DEVICE); dmaFreeMapping_HAL(pGpu, pDma, pVAS, virtRange.start, pType->pMemDesc, 0, NULL); } @@ -3263,7 +3263,7 @@ kbusUnmapFbAperture_GM107 OBJVASPACE *pVAS = pBar1VaInfo->pVAS; // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH - memdescFlushCpuCaches(pGpu, pMemDesc); + osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE); dmaFreeMapping_HAL(pGpu, pDma, pVAS, memArea.pRanges[0].start, pMemDesc, 0, NULL); goto done; diff --git a/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c b/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c index 
ce3346e1e..2d82d2824 100644 --- a/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c +++ b/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c @@ -939,7 +939,7 @@ NV_STATUS kbusDecreaseStaticBar1Refcount_TU102 NV_ERR_INVALID_STATE); // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH - memdescFlushCpuCaches(pGpu, pMemDesc); + osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE); pRootMemDesc = memdescGetRootMemDesc(pMemDesc, NULL); diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c index a2a01ff0e..e771ec942 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c @@ -2107,29 +2107,6 @@ memdescFlushGpuCaches } } -void -memdescFlushCpuCaches -( - OBJGPU *pGpu, - MEMORY_DESCRIPTOR *pMemDesc -) -{ - // Flush WC to get the data written to this mapping out to memory - osFlushCpuWriteCombineBuffer(); - - KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu); - - // Special care is needed on SOC, where the GPU cannot snoop the CPU L2 - if ((pKernelBif != NULL) && - !kbifIsSnoopDmaCapable(pGpu, pKernelBif) && - (memdescGetCpuCacheAttrib(pMemDesc) == NV_MEMORY_CACHED)) - { - // Flush CPU L2 so that the GPU will see any changes the CPU made - osFlushCpuCache(); - } -} - - /* * @brief map memory descriptor for internal access * @@ -2158,7 +2135,10 @@ memdescMapInternal // We need to flush & invalidate GPU L2 cache only for directed BAR mappings. // Reflected BAR mappings will access memory via GPU, and hence go through GPU L2 cache. if (mapType == MEMDESC_MAP_INTERNAL_TYPE_SYSMEM_DIRECT) + { memdescFlushGpuCaches(pGpu, pMemDesc); + osDmaSyncMem(pMemDesc, OS_DMA_SYNC_FROM_DEVICE); + } if (pMemDesc->_pInternalMapping != NULL) { @@ -2234,7 +2214,7 @@ void memdescUnmapInternal if (mapType == MEMDESC_MAP_INTERNAL_TYPE_SYSMEM_DIRECT || mapType == MEMDESC_MAP_INTERNAL_TYPE_BAR2) { - memdescFlushCpuCaches(pGpu, pMemDesc); + osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE); } if (--pMemDesc->_internalMappingRefCount == 0) diff --git a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c index af0e35761..f057c3755 100644 --- a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c +++ b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c @@ -848,7 +848,7 @@ spdmMessageProcess_GH100 // First copy payload to shared buffer portMemCopy(pPayloadBuffer, requestSize, pRequest, requestSize); - memdescFlushCpuCaches(pGpu, pSpdm->pPayloadBufferMemDesc); + osDmaSyncMem(pSpdm->pPayloadBufferMemDesc, OS_DMA_SYNC_TO_DEVICE); // Trigger message pending value, then poll for response from GSP kflcnRegWrite_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX0, NV_SPDM_REQUESTER_MESSAGE_PENDING_TOKEN); From 10072734b2f88f3580cdb036778ec27d2b4f2fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 19 Nov 2025 15:05:21 -0500 Subject: [PATCH 5/5] Fix cached DMA allocations on non-coherent hardware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repurpose `NV_MEMORY_DEFAULT` to hand out either cached or uncached CPU mappings based on hardware cache-coherency support. This type should be preferred over `NV_MEMORY_CACHED`, unless there's a good reason not to: - explicit cache maintenance is done where necessary (does not seem the case for most allocations so far). - there are certain memory requirements (e.g. 
atomics usually need cached memory on Arm). Most `NV_MEMORY_CACHED` allocations are replaced with this default type, except in cases where I've seen cache maintenance or uncached memory caused issues. There are some remaining cached allocations (e.g. imported from user memory, RUSD) that I haven't looked into - it's unclear whether those are subject to DMA coherency issues. In practice, all things I've tested (games, benchmarks, monitoring tools, CUDA) appear to work fine now on a non-coherent system (RK3588-based). Signed-off-by: Mario Bălănică --- src/nvidia/arch/nvalloc/unix/src/os.c | 5 +++ src/nvidia/arch/nvalloc/unix/src/osmemdesc.c | 4 +- src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c | 2 +- .../kernel/gpu/disp/inst_mem/disp_inst_mem.c | 2 +- .../gpu/falcon/kernel_crashcat_engine.c | 4 +- .../arch/volta/kernel_channel_group_gv100.c | 4 +- .../gpu/fsp/arch/hopper/kern_fsp_gh100.c | 4 +- .../src/kernel/gpu/gpu_user_shared_data.c | 9 +++- .../src/kernel/gpu/gr/kernel_graphics.c | 2 +- .../gpu/gsp/arch/hopper/kernel_gsp_gh100.c | 2 +- .../gpu/gsp/arch/turing/kernel_gsp_tu102.c | 4 +- src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c | 12 +++--- .../src/kernel/gpu/gsp/message_queue_cpu.c | 2 +- .../gpu/mem_mgr/arch/maxwell/fbsr_gm107.c | 4 +- src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c | 22 ++++++++++ src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c | 2 +- src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c | 8 ++-- .../sec2/arch/blackwell/kernel_sec2_gb10b.c | 2 +- .../sec2/arch/blackwell/kernel_sec2_gb20b.c | 2 +- src/nvidia/src/kernel/gpu/spdm/spdm.c | 2 +- .../src/kernel/gpu/uvm/arch/volta/uvm_gv100.c | 2 +- src/nvidia/src/kernel/gpu/uvm/uvm.c | 2 +- src/nvidia/src/kernel/mem_mgr/mem.c | 42 ++++++++++++------- src/nvidia/src/kernel/rmapi/nv_gpu_ops.c | 15 ++++--- src/nvidia/src/kernel/vgpu/rpc.c | 2 +- src/nvidia/src/kernel/vgpu/vgpu_util.c | 2 +- 26 files changed, 107 insertions(+), 56 deletions(-) diff --git a/src/nvidia/arch/nvalloc/unix/src/os.c b/src/nvidia/arch/nvalloc/unix/src/os.c index b3a4edab6..912a7e7df 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os.c +++ b/src/nvidia/arch/nvalloc/unix/src/os.c @@ -908,6 +908,11 @@ NV_STATUS osAllocPagesInternal( memdescSetAddress(pMemDesc, NvP64_NULL); memdescSetMemData(pMemDesc, NULL, NULL); + // + // XXX: Is this a workaround for hardware with broken NoSnoop? + // If so, consider checking PDB_PROP_CL_NOSNOOP_NOT_CAPABLE and + // move this to memdescSetCpuCacheAttrib(). 
+ // #if (defined(NVCPU_AARCH64) && RMCFG_MODULE_CL) { OBJCL *pCl = SYS_GET_CL(pSys); diff --git a/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c b/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c index 85866213b..3d84012ca 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c +++ b/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c @@ -593,7 +593,7 @@ osCreateOsDescriptorFromPhysAddr MEMORY_DESCRIPTOR *pMemDesc; NvU64 *pPteArray; NvU64 base = 0; - NvU32 cache_type = NV_MEMORY_CACHED; + NvU32 cache_type = NV_MEMORY_DEFAULT; NvU64 memdescFlags = MEMDESC_FLAGS_NONE; NvU64 *pPhys_addrs; NvU64 num_os_pages; @@ -750,7 +750,7 @@ _createMemdescFromDmaBufSgtHelper } else if (!FLD_TEST_DRF(OS02, _FLAGS, _COHERENCY, _UNCACHED, flags)) { - cacheType = NV_MEMORY_CACHED; + cacheType = NV_MEMORY_DEFAULT; } if (FLD_TEST_DRF(OS02, _FLAGS, _GPU_CACHEABLE, _YES, flags)) diff --git a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c index c31d829d8..c942e0788 100644 --- a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c +++ b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c @@ -94,7 +94,7 @@ _kccuAllocMemory // Create a memory descriptor data structure for the shared buffer status = memdescCreate(&pKernelCcu->pMemDesc[idx], pGpu, shrBufSize, 0, NV_MEMORY_CONTIGUOUS, - aperture, NV_MEMORY_CACHED, flags); + aperture, NV_MEMORY_DEFAULT, flags); if (status != NV_OK) { NV_PRINTF(LEVEL_ERROR, "CCU memdescCreate failed for(%u) with status: 0x%x\n", idx, status); diff --git a/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c b/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c index 0d962d095..ed4a9a0c6 100644 --- a/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c +++ b/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c @@ -243,7 +243,7 @@ instmemInitMemDesc // dispInstMemAttr to NV_MEMORY_CACHED this needs to be set based on system configuration/registry parameter. 
// instmemSetMemory(pGpu, pInstMem, - ADDR_SYSMEM, NV_MEMORY_CACHED, + ADDR_SYSMEM, NV_MEMORY_DEFAULT, 0 /* base */, instMemSize); } else if (IS_GSP_CLIENT(pGpu)) diff --git a/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c b/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c index 95f0adcdb..c4a66dff6 100644 --- a/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c +++ b/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c @@ -59,7 +59,7 @@ NV_STATUS kcrashcatEngineConfigure_IMPL NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, memdescCreate(&pKernelCrashCatEng->pQueueMemDesc, pKernelCrashCatEng->pGpu, pEngConfig->allocQueueSize, CRASHCAT_QUEUE_ALIGNMENT, NV_TRUE, - ADDR_SYSMEM, NV_MEMORY_CACHED, MEMDESC_FLAGS_NONE)); + ADDR_SYSMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE)); NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, memdescAlloc(pKernelCrashCatEng->pQueueMemDesc), @@ -230,7 +230,7 @@ static MEMORY_DESCRIPTOR *_kcrashcatEngineCreateBufferMemDesc NV_ADDRESS_SPACE bufAddrSpace = _crashcatApertureToAddressSpace(pBufDesc->aperture); NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR, memdescCreate(&pMemDesc, pKernelCrashCatEng->pGpu, pBufDesc->size, 0, - NV_TRUE, bufAddrSpace, NV_MEMORY_CACHED, MEMDESC_FLAGS_NONE), + NV_TRUE, bufAddrSpace, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE), return NULL;); memdescDescribe(pMemDesc, bufAddrSpace, pBufDesc->physOffset, pBufDesc->size); diff --git a/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c b/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c index 4ade402e3..570ee156e 100644 --- a/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c +++ b/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c @@ -47,7 +47,7 @@ kchangrpAllocFaultMethodBuffers_GV100 NvU32 runQueues = kfifoGetNumRunqueues_HAL(pGpu, pKernelFifo); NvU32 index = 0; NvU32 faultBufApert = ADDR_SYSMEM; - NvU32 faultBufAttr = NV_MEMORY_CACHED; + NvU32 faultBufAttr = NV_MEMORY_DEFAULT; NvU64 memDescFlags = MEMDESC_FLAGS_LOST_ON_SUSPEND; HW_ENG_FAULT_METHOD_BUFFER *pFaultMthdBuf = NULL; NvU32 gfid = pKernelChannelGroup->gfid; @@ -85,14 +85,12 @@ kchangrpAllocFaultMethodBuffers_GV100 // host, force fault buffer aperture to vid mem. // faultBufApert = ADDR_FBMEM; - faultBufAttr = NV_MEMORY_CACHED; memDescFlags |= MEMDESC_FLAGS_OWNED_BY_CURRENT_DEVICE; } else { // Get the right aperture/attribute faultBufApert = ADDR_SYSMEM; - faultBufAttr = NV_MEMORY_CACHED; memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_3, _FAULT_METHOD_BUFFER, pGpu->instLocOverrides3), "fault method buffer", &faultBufApert, &faultBufAttr); if (faultBufApert == ADDR_FBMEM) diff --git a/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c b/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c index 6cc75cb38..94facb30b 100644 --- a/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c +++ b/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c @@ -1036,7 +1036,7 @@ kfspSetupGspImages { NV_ASSERT(pKernelFsp->pGspFmcMemdesc == NULL); // If we assert the pointer becomes a zombie. status = memdescCreate(&pKernelFsp->pGspFmcMemdesc, pGpu, pGspImageMapSize, - 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags); + 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); NV_ASSERT_OR_GOTO(status == NV_OK, failed); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7, @@ -1381,7 +1381,7 @@ kfspPrepareBootCommands_GH100 { NV_ASSERT(pKernelFsp->pSysmemFrtsMemdesc == NULL); // If we assert the pointer becomes a zombie. 
status = memdescCreate(&pKernelFsp->pSysmemFrtsMemdesc, pGpu, frtsSize, - 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags); + 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); NV_ASSERT_OR_GOTO(status == NV_OK, failed); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_8, diff --git a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c index 70e5d8d5b..bf63f8d1c 100644 --- a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c +++ b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c @@ -288,7 +288,14 @@ gpuCreateRusdMemory_IMPL if ((sysGetStaticConfig(SYS_GET_INSTANCE()))->bOsCCEnabled) return NV_OK; - // Create a kernel-side mapping for writing RUSD data + // + // Create a kernel-side mapping for writing RUSD data. + // This must be cached memory due to atomic intrinsic usage, which is not + // supported on uncached memory by some Arm platforms. + // + // XXX: There might be coherency issues with this allocation, although + // statistics appear fine at a quick glance. + // NV_ASSERT_OK_OR_RETURN(memdescCreate(ppMemDesc, pGpu, sizeof(NV00DE_SHARED_DATA), 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, MEMDESC_FLAGS_USER_READ_ONLY)); diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c index f78f0f9ff..55a16597f 100644 --- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c +++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c @@ -179,7 +179,7 @@ kgraphicsConstructEngine_IMPL // FECS event buffer defaults to cached SYSMEM pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].pAllocList = ADDRLIST_SYSMEM_ONLY; - pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].cpuAttr = NV_MEMORY_CACHED; + pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].cpuAttr = NV_MEMORY_DEFAULT; // Process instloc overrides { diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c index c581db3b2..3f65e8a43 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c @@ -162,7 +162,7 @@ kgspAllocBootArgs_GH100 NV_ASSERT_OK_OR_GOTO(nvStatus, memdescCreate(&pKernelGsp->pGspFmcArgumentsDescriptor, pGpu, sizeof(GSP_FMC_BOOT_PARAMS), 0x1000, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), _kgspAllocBootArgs_exit_cleanup); diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c index 39d9635b6..bd57883a9 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c @@ -118,7 +118,7 @@ kgspAllocBootArgs_TU102 NV_ASSERT_OK_OR_GOTO(nvStatus, memdescCreate(&pKernelGsp->pWprMetaDescriptor, pGpu, 0x1000, 0x1000, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), _kgspAllocBootArgs_exit_cleanup); @@ -174,7 +174,7 @@ kgspAllocBootArgs_TU102 NV_ASSERT_OK_OR_GOTO(nvStatus, memdescCreate(&pKernelGsp->pGspArgumentsDescriptor, pGpu, 0x1000, 0x1000, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), _kgspAllocBootArgs_exit_cleanup); diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c index e262e8fc3..1565e752a 100644 --- 
a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c +++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c @@ -2638,7 +2638,7 @@ _setupLogBufferVgpu pGpu, logVgpuSetupParams.bufSize, RM_PAGE_SIZE, - NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_FBMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE), exit); @@ -2942,7 +2942,7 @@ _setupLogBufferBaremetal pGpu, size, RM_PAGE_SIZE, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), exit); @@ -3337,7 +3337,7 @@ _kgspSetupTaskRMCoverageStructure ( pGpu, BULLSEYE_GSP_RM_COVERAGE_SIZE, RM_PAGE_SIZE, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE), done); memdescTagAlloc(nvStatus, @@ -4339,7 +4339,7 @@ kgspPrepareBootBinaryImage_IMPL pGpu, bufSizeAligned, RM_PAGE_SIZE, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), fail); @@ -4424,7 +4424,7 @@ _kgspCreateSignatureMemdesc NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu, NV_ALIGN_UP(pGspFw->signatureSize, 256), 256, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags)); + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags)); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16, pKernelGsp->pSignatureMemdesc); @@ -4712,7 +4712,7 @@ kgspCreateRadix3_IMPL LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, - NV_MEMORY_CACHED, + NV_MEMORY_DEFAULT, flags), done); diff --git a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c index 80c7212c5..66de4947e 100644 --- a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c +++ b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c @@ -230,7 +230,7 @@ GspMsgQueuesInit // NV_ASSERT_OK_OR_GOTO(nvStatus, memdescCreate(&pMQCollection->pSharedMemDesc, pGpu, sharedBufSize, - RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_CACHED, + RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags), done); diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c index a9505db36..a1194f701 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c @@ -202,7 +202,7 @@ fbsrInit_GM107(OBJGPU *pGpu, OBJFBSR *pFbsr) // to to use cached memory. // status = memdescCreate(&pFbsr->pSysMemDesc, pGpu, memSize, - 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE); if (status != NV_OK) { @@ -371,7 +371,7 @@ fbsrBegin_GM107(OBJGPU *pGpu, OBJFBSR *pFbsr, FBSR_OP_TYPE op) // On Windows, pageable memory is also cacheable. 
status = memdescCreate(&pFbsr->pSysMemDesc, pGpu, pFbsr->length, 0, NV_FALSE, - ADDR_SYSMEM, NV_MEMORY_CACHED, + ADDR_SYSMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_PAGED_SYSMEM); } if (status != NV_OK) diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c index e771ec942..1a723ffed 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c @@ -38,6 +38,7 @@ #include "mem_mgr/virt_mem_mgr.h" #include "core/system.h" #include "vgpu/vgpu_util.h" +#include "platform/chipset/chipset.h" #include "platform/sli/sli.h" #include "resserv/rs_client.h" @@ -3640,6 +3641,27 @@ void memdescSetCpuCacheAttrib NvU32 cpuCacheAttrib ) { + // + // Use NV_MEMORY_DEFAULT to get a reasonable default caching type for the + // given descriptor (i.e. DMA coherent), unless explicit cache maintenance + // is done (for performance reasons) or there are certain memory requirements + // (e.g. atomics need NV_MEMORY_CACHED on Arm). + // + if (cpuCacheAttrib == NV_MEMORY_DEFAULT) + { + OBJCL *pCl = SYS_GET_CL(SYS_GET_INSTANCE()); + + if (memdescGetFlag(pMemDesc, MEMDESC_FLAGS_CPU_ONLY) || + ((pCl != NULL) && pCl->getProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT))) + { + cpuCacheAttrib = NV_MEMORY_CACHED; + } + else + { + cpuCacheAttrib = NV_MEMORY_UNCACHED; + } + } + // // When running 64-bit MODS on ARM v8, we need to force all CPU mappings as WC. // This seems to be an issue with glibc. See bug 1556221. diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c index c1318ad49..d9dc103b4 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c @@ -162,7 +162,7 @@ _memmgrAllocAndMapSurface NV_ASSERT_OK_OR_RETURN( memdescCreate(ppMemDesc, pGpu, size, RM_PAGE_SIZE, NV_TRUE, - ADDR_SYSMEM, NV_MEMORY_CACHED, flags)); + ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags)); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_77, (*ppMemDesc)); diff --git a/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c b/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c index ee4f0c75d..ab1a2072e 100644 --- a/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c +++ b/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c @@ -999,14 +999,14 @@ kgmmuFaultBufferGetAddressSpace_IMPL if (index == NON_REPLAYABLE_FAULT_BUFFER) { faultBufferAddrSpace = bAllocInVidmem ? ADDR_FBMEM : ADDR_SYSMEM; - faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_CACHED; + faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_DEFAULT; memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_3, _UVM_FAULT_BUFFER_NONREPLAYABLE, pGpu->instLocOverrides3), "UVM non-replayable fault", &faultBufferAddrSpace, &faultBufferAttr); } else if (index == REPLAYABLE_FAULT_BUFFER) { faultBufferAddrSpace = bAllocInVidmem ? ADDR_FBMEM : ADDR_SYSMEM; - faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_CACHED; + faultBufferAttr = bAllocInVidmem ? 
NV_MEMORY_UNCACHED : NV_MEMORY_DEFAULT; memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_4, _UVM_FAULT_BUFFER_REPLAYABLE, pGpu->instLocOverrides4), "UVM replayable fault", &faultBufferAddrSpace, &faultBufferAttr); } @@ -1493,7 +1493,7 @@ _kgmmuClientShadowFaultBufferQueueAllocate status = memdescCreate(&pQueueMemDesc, pGpu, sizeof(GMMU_SHADOW_FAULT_BUF), RM_PAGE_SIZE, - NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); if (status != NV_OK) { @@ -1591,7 +1591,7 @@ _kgmmuClientShadowFaultBufferPagesAllocate status = memdescCreate(&pMemDesc, pGpu, shadowFaultBufferSizeTotal, RM_PAGE_SIZE, - NV_FALSE, ADDR_SYSMEM, NV_MEMORY_CACHED, + NV_FALSE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); if (status != NV_OK) { diff --git a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c index a4950c29d..792c46b73 100644 --- a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c +++ b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c @@ -485,7 +485,7 @@ ksec2SetupGspImages_GB10B pGspImageMapSize = NV_ALIGN_UP(pGspImageSize, 0x1000); status = memdescCreate(&pKernelSec2->pGspFmcMemdesc, pGpu, pGspImageMapSize, - 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags); + 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); NV_ASSERT_OR_GOTO(status == NV_OK, failed); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7, diff --git a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c index 466eae4e4..633a73246 100644 --- a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c +++ b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c @@ -779,7 +779,7 @@ ksec2SetupGspImages_GB20B pGspImageMapSize = NV_ALIGN_UP(pGspImageSize, 0x1000); status = memdescCreate(&pKernelSec2->pGspFmcMemdesc, pGpu, pGspImageMapSize, - 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags); + 0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags); NV_ASSERT_OR_GOTO(status == NV_OK, failed); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7, diff --git a/src/nvidia/src/kernel/gpu/spdm/spdm.c b/src/nvidia/src/kernel/gpu/spdm/spdm.c index 3c35b78c3..1eb8cfa1c 100644 --- a/src/nvidia/src/kernel/gpu/spdm/spdm.c +++ b/src/nvidia/src/kernel/gpu/spdm/spdm.c @@ -274,7 +274,7 @@ spdmSetupCommunicationBuffers_IMPL // Create memory descriptor for payload buffer status = memdescCreate(&pSpdm->pPayloadBufferMemDesc, pGpu, NV_SPDM_SYSMEM_SURFACE_SIZE_PAGE_ALIGNED, NV_SPDM_SYSMEM_SURFACE_ALIGNMENT_IN_BYTES, NV_TRUE, ADDR_SYSMEM, - NV_MEMORY_CACHED, MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY); + NV_MEMORY_DEFAULT, MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY); if (status != NV_OK || pSpdm->pPayloadBufferMemDesc == NULL) { status = NV_ERR_INSUFFICIENT_RESOURCES; diff --git a/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c b/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c index 1ff46bab6..1661a8d21 100644 --- a/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c +++ b/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c @@ -292,7 +292,7 @@ uvmInitAccessCntrBuffer_GV100 accessCntrBufferSize = uvmGetAccessCounterBufferSize_HAL(pGpu, pUvm, pAccessCounterBuffer->accessCounterIndex); accessCntrBufferAperture = ADDR_SYSMEM; - accessCntrBufferAttr = NV_MEMORY_CACHED; + accessCntrBufferAttr = NV_MEMORY_DEFAULT; memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_4, 
_UVM_FAULT_BUFFER_REPLAYABLE, pGpu->instLocOverrides4), "UVM access counter", &accessCntrBufferAperture, &accessCntrBufferAttr); diff --git a/src/nvidia/src/kernel/gpu/uvm/uvm.c b/src/nvidia/src/kernel/gpu/uvm/uvm.c index 6565ff868..7f164b08f 100644 --- a/src/nvidia/src/kernel/gpu/uvm/uvm.c +++ b/src/nvidia/src/kernel/gpu/uvm/uvm.c @@ -242,7 +242,7 @@ uvmAccessCntrBufferRegister_IMPL NV_STATUS status; MEMORY_DESCRIPTOR *pMemDesc; NvU32 addrSpace = ADDR_SYSMEM; - NvU32 attr = NV_MEMORY_CACHED; + NvU32 attr = NV_MEMORY_DEFAULT; if (pUvm->pAccessCounterBuffers == NULL) { diff --git a/src/nvidia/src/kernel/mem_mgr/mem.c b/src/nvidia/src/kernel/mem_mgr/mem.c index 2a81684a5..76e5f99f8 100644 --- a/src/nvidia/src/kernel/mem_mgr/mem.c +++ b/src/nvidia/src/kernel/mem_mgr/mem.c @@ -1214,20 +1214,34 @@ void memSetSysmemCacheAttrib_IMPL gpuCacheAttrib = NV_MEMORY_UNCACHED; } - if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_UNCACHED) - cpuCacheAttrib = NV_MEMORY_UNCACHED; - else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_CACHED) - cpuCacheAttrib = NV_MEMORY_CACHED; - else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_COMBINE) - cpuCacheAttrib = NV_MEMORY_WRITECOMBINED; - else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_THROUGH) - cpuCacheAttrib = NV_MEMORY_CACHED; - else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_PROTECT) - cpuCacheAttrib = NV_MEMORY_CACHED; - else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_BACK) - cpuCacheAttrib = NV_MEMORY_CACHED; - else - cpuCacheAttrib = 0; + switch (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr)) + { + case NVOS32_ATTR_COHERENCY_UNCACHED: + cpuCacheAttrib = NV_MEMORY_UNCACHED; + break; + case NVOS32_ATTR_COHERENCY_WRITE_COMBINE: + cpuCacheAttrib = NV_MEMORY_WRITECOMBINED; + break; + case NVOS32_ATTR_COHERENCY_CACHED: + case NVOS32_ATTR_COHERENCY_WRITE_THROUGH: + case NVOS32_ATTR_COHERENCY_WRITE_PROTECT: + case NVOS32_ATTR_COHERENCY_WRITE_BACK: + // + // XXX: It's unclear in which cases the clients will perform their own + // CPU cache maintenance, but it only seems to happen when the GPU mapping + // is also cached (cliresCtrlCmdOsUnixFlushUserCache() will be called). + // This indicates that not all clients factor in hardware coherency support + // when requesting cached mappings, so it may be safer to just always use + // NV_MEMORY_DEFAULT, which only gives cached memory on coherent hardware. + // + cpuCacheAttrib = (gpuCacheAttrib == NV_MEMORY_CACHED) ? 
NV_MEMORY_CACHED : + NV_MEMORY_DEFAULT; + break; + default: + NV_ASSERT(0); + cpuCacheAttrib = NV_MEMORY_UNCACHED; + break; + } ct_assert(NVOS32_ATTR_COHERENCY_UNCACHED == NVOS02_FLAGS_COHERENCY_UNCACHED); ct_assert(NVOS32_ATTR_COHERENCY_CACHED == NVOS02_FLAGS_COHERENCY_CACHED); diff --git a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c index a4a95b045..aff781ea1 100644 --- a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c +++ b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c @@ -110,6 +110,7 @@ #include #include #include +#include #include #include #include @@ -4836,6 +4837,7 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device, NV_MEMORY_ALLOCATION_PARAMS memAllocParams = {0}; NV_STATUS status = NV_OK; RM_API *pRmApi = rmapiGetInterface(RMAPI_EXTERNAL_KERNEL); + OBJGPU *pGpu = NULL; NvHandle physHandle = 0; @@ -4843,6 +4845,9 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device, NV_ASSERT(device); NV_ASSERT(paOffset); + status = _nvGpuOpsGetGpuFromDevice(device, &pGpu); + NV_ASSERT_OR_RETURN((status == NV_OK) && (pGpu != NULL), NV_ERR_INVALID_ARGUMENT); + // then allocate the physical memory in either sysmem or fb. memAllocParams.owner = HEAP_OWNER_RM_KERNEL_CLIENT; @@ -4858,9 +4863,9 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device, DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) : DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM); - // Always enable caching for System Memory as all the currently supported - // platforms are IO coherent. - NvBool bCached = isSystemMemory; + // Set CPU caching attribute + KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu); + NvBool bCached = isSystemMemory && kbifIsSnoopDmaCapable(pGpu, pKernelBif); memAllocParams.attr |= bCached ? DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED): DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED); @@ -10295,7 +10300,7 @@ _shadowMemdescCreateFlcn(gpuRetainedChannel *retainedChannel, pCtxBufferInfo->alignment, pCtxBufferInfo->bIsContigous, pCtxBufferInfo->aperture, - NV_MEMORY_CACHED, + NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE ); NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, status); @@ -10395,7 +10400,7 @@ _shadowMemdescCreate(gpuRetainedChannel *retainedChannel, pCtxBufferInfo->alignment, pCtxBufferInfo->bIsContigous, pCtxBufferInfo->aperture, - NV_MEMORY_CACHED, + NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE ); if (status != NV_OK) diff --git a/src/nvidia/src/kernel/vgpu/rpc.c b/src/nvidia/src/kernel/vgpu/rpc.c index d2c5fe01f..c98344957 100644 --- a/src/nvidia/src/kernel/vgpu/rpc.c +++ b/src/nvidia/src/kernel/vgpu/rpc.c @@ -260,7 +260,7 @@ _allocRpcMemDescSysmem( 0, bContig, ADDR_SYSMEM, - NV_MEMORY_CACHED, + NV_MEMORY_DEFAULT, memdescFlag)); memdescSetFlag(*ppMemDesc, MEMDESC_FLAGS_KERNEL_MODE, NV_TRUE); diff --git a/src/nvidia/src/kernel/vgpu/vgpu_util.c b/src/nvidia/src/kernel/vgpu/vgpu_util.c index 9db90f622..5e1ec5551 100644 --- a/src/nvidia/src/kernel/vgpu/vgpu_util.c +++ b/src/nvidia/src/kernel/vgpu/vgpu_util.c @@ -143,7 +143,7 @@ NV_STATUS vgpuAllocSysmemPfnBitMapNode(OBJGPU *pGpu, VGPU_SYSMEM_PFN_BITMAP_NODE 0, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, - NV_MEMORY_CACHED, + NV_MEMORY_DEFAULT, memFlags); if (status != NV_OK) {
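As a closing sketch of the usage pattern this series enables (hypothetical call
sequence; pGpu, pMemDesc and size are placeholders, and the descriptor is assumed
to have been DMA-mapped through the nv-dma layer, which osDmaSyncMem() requires):

    /* Cached sysmem on possibly non-coherent hardware, paired with
     * explicit CPU cache maintenance through the new helper. */
    NV_ASSERT_OK_OR_RETURN(
        memdescCreate(&pMemDesc, pGpu, size, 0, NV_TRUE, ADDR_SYSMEM,
                      NV_MEMORY_CACHED, MEMDESC_FLAGS_NONE));
    NV_ASSERT_OK_OR_RETURN(memdescAlloc(pMemDesc));

    /* ... CPU fills the buffer, then flushes before the GPU reads it: */
    NV_ASSERT_OK_OR_RETURN(osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE));

    /* ... GPU writes results, then invalidate before the CPU reads them: */
    NV_ASSERT_OK_OR_RETURN(osDmaSyncMem(pMemDesc, OS_DMA_SYNC_FROM_DEVICE));

Allocations without such explicit maintenance should use NV_MEMORY_DEFAULT
instead, which resolves to a cached mapping only on I/O-coherent chipsets.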