diff --git a/kernel-open/common/inc/nv.h b/kernel-open/common/inc/nv.h
index d431423bb4..c2b7aa9434 100644
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@@ -864,6 +864,9 @@ static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
 #define NV_ALIGN_DOWN(v,g) ((v) & ~((g) - 1))
 #endif
 
+#define NV_OS_DMA_SYNC_TO_DEVICE      NVBIT(0) // CPU flush
+#define NV_OS_DMA_SYNC_FROM_DEVICE    NVBIT(1) // CPU invalidate
+#define NV_OS_DMA_SYNC_TO_FROM_DEVICE (NV_OS_DMA_SYNC_TO_DEVICE | NV_OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate
 /*
  * driver internal interfaces
  */
@@ -921,9 +924,12 @@ void NV_API_CALL nv_dma_unmap_peer (nv_dma_device_t *, NvU64, NvU6
 NV_STATUS  NV_API_CALL nv_dma_map_mmio           (nv_dma_device_t *, NvU64, NvU64 *);
 void       NV_API_CALL nv_dma_unmap_mmio         (nv_dma_device_t *, NvU64, NvU64);
-void       NV_API_CALL nv_dma_cache_invalidate   (nv_dma_device_t *, void *);
+void       NV_API_CALL nv_dma_sync               (nv_dma_device_t *, void *, NvU32);
+
 NvBool     NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *);
+NvBool     NV_API_CALL nv_dev_is_dma_coherent    (nv_dma_device_t *);
+
 NvS32      NV_API_CALL nv_start_rc_timer         (nv_state_t *);
 NvS32      NV_API_CALL nv_stop_rc_timer          (nv_state_t *);

diff --git a/kernel-open/common/inc/os-interface.h b/kernel-open/common/inc/os-interface.h
index 523368eaa4..03a6cb89bd 100644
--- a/kernel-open/common/inc/os-interface.h
+++ b/kernel-open/common/inc/os-interface.h
@@ -109,8 +109,6 @@ void NV_API_CALL os_unmap_kernel_space (void *, NvU64);
 void*      NV_API_CALL os_map_user_space            (MemoryArea *, NvU32, NvU32, void **);
 void       NV_API_CALL os_unmap_user_space          (void *, NvU64, void *);
 #endif
-NV_STATUS  NV_API_CALL os_flush_cpu_cache_all       (void);
-NV_STATUS  NV_API_CALL os_flush_user_cache          (void);
 void       NV_API_CALL os_flush_cpu_write_combine_buffer(void);
 NvU8       NV_API_CALL os_io_read_byte              (NvU32);
 NvU16      NV_API_CALL os_io_read_word              (NvU32);

diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh
index cfa3871297..c5c5f27d5b 100755
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -625,22 +625,6 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_SET_PAGES_ARRAY_UC_PRESENT" "" "functions"
         ;;
 
-        flush_cache_all)
-            #
-            # Determine if flush_cache_all() function is present
-            #
-            # flush_cache_all() was removed by commit id
-            # 68234df4ea79 ("arm64: kill flush_cache_all()") in 4.2 (2015-04-20)
-            # for aarch64
-            #
-            CODE="
-            #include <asm/cacheflush.h>
-            int conftest_flush_cache_all(void) {
-                return flush_cache_all();
-            }"
-            compile_check_conftest "$CODE" "NV_FLUSH_CACHE_ALL_PRESENT" "" "functions"
-        ;;
-
         ioremap_cache)
            #
            # Determine if the ioremap_cache() function is present.
@@ -2467,6 +2451,19 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_DMA_IS_DIRECT_PRESENT" "" "functions"
         ;;
 
+        dev_is_dma_coherent)
+            #
+            # Determine whether dev_is_dma_coherent() exists.
+            #
+            CODE="
+            #include <linux/dma-map-ops.h>
+            void conftest_dev_is_dma_coherent(void) {
+                dev_is_dma_coherent();
+            }"
+
+            compile_check_conftest "$CODE" "NV_DEV_IS_DMA_COHERENT_PRESENT" "" "functions"
+        ;;
+
         cmd_uphy_display_port_init)
             #
             # Determine if CMD_UPHY_DISPLAY_PORT_INIT enum present in bpmp-abi header
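[Reviewer illustration, not part of the patch] The intended calling convention for the new nv_dma_sync() interface and its NV_OS_DMA_SYNC_* direction flags, as a minimal sketch; the dma_dev and priv handles are assumed to come from an existing nv-dma mapping:

    /* CPU wrote the buffer through a cached mapping: flush (clean) dirty
     * lines so the device reads current data. */
    nv_dma_sync(dma_dev, priv, NV_OS_DMA_SYNC_TO_DEVICE);

    /* ... device DMA runs ... */

    /* Device wrote the buffer: invalidate stale CPU lines before reading. */
    nv_dma_sync(dma_dev, priv, NV_OS_DMA_SYNC_FROM_DEVICE);

    /* Both directions in one call, e.g. for bidirectional buffers. */
    nv_dma_sync(dma_dev, priv, NV_OS_DMA_SYNC_TO_FROM_DEVICE);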
diff --git a/kernel-open/nvidia/nv-dma.c b/kernel-open/nvidia/nv-dma.c
index 2984af848d..879cf4a36d 100644
--- a/kernel-open/nvidia/nv-dma.c
+++ b/kernel-open/nvidia/nv-dma.c
@@ -61,11 +61,18 @@ static NV_STATUS nv_dma_map_contig(
     NvU64 *va
 )
 {
-    *va = dma_map_page_attrs(dma_map->dev, dma_map->pages[0], 0,
+    /*
+     * Do not set DMA_ATTR_SKIP_CPU_SYNC here even if memory is "uncached".
+     * On Arm, we always allocate cacheable pages and then use aliased (vmap)
+     * uncached mappings when necessary. Without explicit flushing right after
+     * allocation, stale data previously cached for these backing pages could
+     * be evicted at any point and clobber memory that was already written
+     * through the aliased mapping. Note that no flushing will be performed on
+     * cache-coherent hardware.
+     */
+    *va = dma_map_page(dma_map->dev, dma_map->pages[0], 0,
                        dma_map->page_count * PAGE_SIZE,
-                       DMA_BIDIRECTIONAL,
-                       (dma_map->cache_type == NV_MEMORY_UNCACHED) ?
-                           DMA_ATTR_SKIP_CPU_SYNC : 0);
+                       DMA_BIDIRECTIONAL);
     if (dma_mapping_error(dma_map->dev, *va))
     {
         return NV_ERR_OPERATING_SYSTEM;
@@ -93,7 +100,7 @@ static void nv_dma_unmap_contig(nv_dma_map_t *dma_map)
     dma_unmap_page_attrs(dma_map->dev, dma_map->mapping.contig.dma_addr,
                          dma_map->page_count * PAGE_SIZE,
                          DMA_BIDIRECTIONAL,
-                         (dma_map->cache_type == NV_MEMORY_UNCACHED) ?
+                         (dma_map->cache_type != NV_MEMORY_CACHED) ?
                              DMA_ATTR_SKIP_CPU_SYNC : 0);
 }
 
@@ -214,6 +221,7 @@ NV_STATUS nv_map_dma_map_scatterlist(nv_dma_map_t *dma_map)
     nv_dma_submap_t *submap;
     NvU64 i;
 
+    /* See the comment in nv_dma_map_contig() */
     NV_FOR_EACH_DMA_SUBMAP(dma_map, submap, i)
     {
         /* Imported SGTs will have already been mapped by the exporter. */
@@ -256,9 +264,11 @@ void nv_unmap_dma_map_scatterlist(nv_dma_map_t *dma_map)
             continue;
         }
 
-        dma_unmap_sg(dma_map->dev, submap->sgt.sgl,
+        dma_unmap_sg_attrs(dma_map->dev, submap->sgt.sgl,
                      submap->sgt.orig_nents,
-                     DMA_BIDIRECTIONAL);
+                     DMA_BIDIRECTIONAL,
+                     (dma_map->cache_type != NV_MEMORY_CACHED) ?
+                         DMA_ATTR_SKIP_CPU_SYNC : 0);
     }
 }
 
@@ -870,17 +880,18 @@ void NV_API_CALL nv_dma_unmap_mmio
 }
 
 /*
- * Invalidate DMA mapping in CPU caches by "syncing" to the device.
+ * Flush/invalidate DMA mapping in CPU caches by "syncing" to/from the device.
  *
  * This is only implemented for ARM platforms, since other supported
  * platforms are cache coherent and have not required this (we
  * explicitly haven't supported SWIOTLB bounce buffering either where
  * this would be needed).
  */
-void NV_API_CALL nv_dma_cache_invalidate
+void NV_API_CALL nv_dma_sync
 (
     nv_dma_device_t *dma_dev,
-    void *priv
+    void *priv,
+    NvU32 dir
 )
 {
 #if defined(NVCPU_AARCH64)
@@ -888,10 +899,17 @@ void NV_API_CALL nv_dma_cache_invalidate
 
     if (dma_map->contiguous)
     {
-        dma_sync_single_for_device(dma_dev->dev,
-                                   dma_map->mapping.contig.dma_addr,
-                                   (size_t) PAGE_SIZE * dma_map->page_count,
-                                   DMA_FROM_DEVICE);
+        if (dir & NV_OS_DMA_SYNC_TO_DEVICE)
+            dma_sync_single_for_device(dma_dev->dev,
+                                       dma_map->mapping.contig.dma_addr,
+                                       (size_t)PAGE_SIZE * dma_map->page_count,
+                                       DMA_TO_DEVICE);
+
+        if (dir & NV_OS_DMA_SYNC_FROM_DEVICE)
+            dma_sync_single_for_cpu(dma_dev->dev,
+                                    dma_map->mapping.contig.dma_addr,
+                                    (size_t)PAGE_SIZE * dma_map->page_count,
+                                    DMA_FROM_DEVICE);
     }
     else
     {
@@ -900,15 +918,33 @@ void NV_API_CALL nv_dma_cache_invalidate
 
         NV_FOR_EACH_DMA_SUBMAP(dma_map, submap, i)
         {
-            dma_sync_sg_for_device(dma_dev->dev,
-                                   submap->sgt.sgl,
-                                   submap->sgt.orig_nents,
-                                   DMA_FROM_DEVICE);
+            if (dir & NV_OS_DMA_SYNC_TO_DEVICE)
+                dma_sync_sg_for_device(dma_dev->dev,
+                                       submap->sgt.sgl,
+                                       submap->sgt.orig_nents,
+                                       DMA_TO_DEVICE);
+
+            if (dir & NV_OS_DMA_SYNC_FROM_DEVICE)
+                dma_sync_sg_for_cpu(dma_dev->dev,
+                                    submap->sgt.sgl,
+                                    submap->sgt.orig_nents,
+                                    DMA_FROM_DEVICE);
         }
     }
 #endif
 }
 
+NvBool NV_API_CALL nv_dev_is_dma_coherent
+(
+    nv_dma_device_t *dma_dev
+)
+{
+#if defined(NV_DEV_IS_DMA_COHERENT_PRESENT)
+    return dev_is_dma_coherent(dma_dev->dev);
+#endif
+    return true;
+}
+
 #if defined(NV_DRM_AVAILABLE)
 
 static inline void
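[Reviewer illustration, not part of the patch] The unmap paths above keep DMA_ATTR_SKIP_CPU_SYNC for anything other than fully cached mappings; a condensed sketch of the reasoning, with placeholder arguments:

    /* After the one-time flush at map time, an uncached/WC alias leaves no
     * dirty or stale CPU lines behind, so the per-unmap CPU sync (a full
     * cache walk on some Arm cores) can be skipped safely. */
    dma_unmap_page_attrs(dev, dma_addr, size, DMA_BIDIRECTIONAL,
                         cached ? 0 : DMA_ATTR_SKIP_CPU_SYNC);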
diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild
index 43416d2527..fd0e40dc8b 100644
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@@ -125,7 +125,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += phys_to_dma
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_rebar_get_possible_sizes
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_backlight_device_by_name
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_direct_map_resource
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += flush_cache_all
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += dev_is_dma_coherent
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += jiffies_to_timespec
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64

diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c
index a03a3b88cf..1d4a29cd06 100644
--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@@ -1034,57 +1034,6 @@ void NV_API_CALL os_unmap_kernel_space(
     nv_iounmap(addr, size_bytes);
 }
 
-#if NVCPU_IS_AARCH64
-
-static inline void nv_flush_cache_cpu(void *info)
-{
-    if (!nvos_is_chipset_io_coherent())
-    {
-#if defined(NV_FLUSH_CACHE_ALL_PRESENT)
-        flush_cache_all();
-#else
-        WARN_ONCE(0, "kernel does not provide flush_cache_all()\n");
-#endif
-    }
-}
-
-// flush the cache of all cpus
-NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
-{
-    on_each_cpu(nv_flush_cache_cpu, NULL, 1);
-    return NV_OK;
-}
-
-NV_STATUS NV_API_CALL os_flush_user_cache(void)
-{
-    if (!NV_MAY_SLEEP())
-    {
-        return NV_ERR_NOT_SUPPORTED;
-    }
-
-    //
-    // The Linux kernel does not export an interface for flushing a range,
-    // although it is possible. For now, just flush the entire cache to be
-    // safe.
-    //
-    on_each_cpu(nv_flush_cache_cpu, NULL, 1);
-    return NV_OK;
-}
-
-#else // NVCPU_IS_AARCH64
-
-NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
-{
-    return NV_ERR_NOT_SUPPORTED;
-}
-
-NV_STATUS NV_API_CALL os_flush_user_cache(void)
-{
-    return NV_ERR_NOT_SUPPORTED;
-}
-
-#endif
-
 void NV_API_CALL os_flush_cpu_write_combine_buffer(void)
 {
 #if defined(NVCPU_X86_64)

diff --git a/src/nvidia/arch/nvalloc/common/inc/nvcst.h b/src/nvidia/arch/nvalloc/common/inc/nvcst.h
index 33bb8d4959..feb0f72693 100644
--- a/src/nvidia/arch/nvalloc/common/inc/nvcst.h
+++ b/src/nvidia/arch/nvalloc/common/inc/nvcst.h
@@ -97,7 +97,7 @@ CHIPSET_SETUP_FUNC(PLDA_XpressRichAXI_setupFunc)
 CHIPSET_SETUP_FUNC(Riscv_generic_setupFunc)
 CHIPSET_SETUP_FUNC(Intel_A70D_setupFunc)
 CHIPSET_SETUP_FUNC(AMD_14D8_setupFunc)
-
+CHIPSET_SETUP_FUNC(Generic_setupFunc)
 
 // Keep string length <=32 (including termination) to avoid string copy overflow
 CSINFO chipsetInfo[] =
@@ -276,8 +276,8 @@ CSINFO chipsetInfo[] =
     {PCI_VENDOR_ID_AMPERE,  0xE110, CS_AMPERE_ALTRA,          "Ampere Altra",               Ampere_Altra_setupFunc},
     {PCI_VENDOR_ID_ARM,     0x0100, CS_ARM_NEOVERSEN1,        "Arm Neoverse N1",            Arm_NeoverseN1_setupFunc},
     {PCI_VENDOR_ID_HYGON,   0x790E, CS_HYGON_C86,             "Hygon-C86-7151",             NULL},
-    {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN96XX, "Marvell Octeon CN96xx",      ARMV8_generic_setupFunc},
-    {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN98XX, "Marvell Octeon CN98xx",      ARMV8_generic_setupFunc},
+    {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN96XX, "Marvell Octeon CN96xx",      NULL},
+    {PCI_VENDOR_ID_MARVELL, 0xA02D, CS_MARVELL_OCTEON_CN98XX, "Marvell Octeon CN98xx",      NULL},
     {PCI_VENDOR_ID_SIFIVE,  0x0000, CS_SIFIVE_FU740_C000,     "SiFive FU740-000",           Riscv_generic_setupFunc},
     {PCI_VENDOR_ID_PLDA,    0x1111, CS_PLDA_XPRESSRICH_AXI_REF, "XpressRich-AXI Ref Design", PLDA_XpressRichAXI_setupFunc},
     {PCI_VENDOR_ID_AMPERE,  0xE200, CS_AMPERE_AMPEREONE160,   "Ampere AmpereOne-160",       Ampere_AmpereOne_setupFunc},
@@ -302,7 +302,7 @@ CSINFO chipsetInfo[] =
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
     // last element must have chipset CS_UNKNOWN (zero)
-    {0, 0, CS_UNKNOWN, "Unknown", NULL}
+    {0, 0, CS_UNKNOWN, "Unknown", Generic_setupFunc}
 };

diff --git a/src/nvidia/arch/nvalloc/unix/include/nv.h b/src/nvidia/arch/nvalloc/unix/include/nv.h
index 519c260cfb..493c5afb13 100644
--- a/src/nvidia/arch/nvalloc/unix/include/nv.h
+++ b/src/nvidia/arch/nvalloc/unix/include/nv.h
@@ -864,6 +864,9 @@ static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
 #define NV_ALIGN_DOWN(v,g) ((v) & ~((g) - 1))
 #endif
 
+#define NV_OS_DMA_SYNC_TO_DEVICE      NVBIT(0) // CPU flush
+#define NV_OS_DMA_SYNC_FROM_DEVICE    NVBIT(1) // CPU invalidate
+#define NV_OS_DMA_SYNC_TO_FROM_DEVICE (NV_OS_DMA_SYNC_TO_DEVICE | NV_OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate
 /*
  * driver internal interfaces
  */
@@ -921,9 +924,12 @@ void NV_API_CALL nv_dma_unmap_peer (nv_dma_device_t *, NvU64, NvU6
 NV_STATUS  NV_API_CALL nv_dma_map_mmio           (nv_dma_device_t *, NvU64, NvU64 *);
 void       NV_API_CALL nv_dma_unmap_mmio         (nv_dma_device_t *, NvU64, NvU64);
-void       NV_API_CALL nv_dma_cache_invalidate   (nv_dma_device_t *, void *);
+void       NV_API_CALL nv_dma_sync               (nv_dma_device_t *, void *, NvU32);
+
 NvBool     NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *, nv_dma_device_t *);
+NvBool     NV_API_CALL nv_dev_is_dma_coherent    (nv_dma_device_t *);
+
 NvS32      NV_API_CALL nv_start_rc_timer         (nv_state_t *);
 NvS32      NV_API_CALL nv_stop_rc_timer          (nv_state_t *);

diff --git
a/src/nvidia/arch/nvalloc/unix/include/os-interface.h b/src/nvidia/arch/nvalloc/unix/include/os-interface.h
index 6eb9559640..7fdb7be705 100644
--- a/src/nvidia/arch/nvalloc/unix/include/os-interface.h
+++ b/src/nvidia/arch/nvalloc/unix/include/os-interface.h
@@ -105,8 +105,6 @@ NvBool NV_API_CALL os_pci_remove_supported (void);
 void       NV_API_CALL os_pci_remove                (void *);
 void*      NV_API_CALL os_map_kernel_space          (NvU64, NvU64, NvU32);
 void       NV_API_CALL os_unmap_kernel_space        (void *, NvU64);
-NV_STATUS  NV_API_CALL os_flush_cpu_cache_all       (void);
-NV_STATUS  NV_API_CALL os_flush_user_cache          (void);
 void       NV_API_CALL os_flush_cpu_write_combine_buffer(void);
 NvU8       NV_API_CALL os_io_read_byte              (NvU32);
 NvU16      NV_API_CALL os_io_read_word              (NvU32);

diff --git a/src/nvidia/arch/nvalloc/unix/src/os.c b/src/nvidia/arch/nvalloc/unix/src/os.c
index 2c1a89c22c..912a7e7dfa 100644
--- a/src/nvidia/arch/nvalloc/unix/src/os.c
+++ b/src/nvidia/arch/nvalloc/unix/src/os.c
@@ -908,6 +908,11 @@ NV_STATUS osAllocPagesInternal(
     memdescSetAddress(pMemDesc, NvP64_NULL);
     memdescSetMemData(pMemDesc, NULL, NULL);
 
+    //
+    // XXX: Is this a workaround for hardware with broken NoSnoop?
+    // If so, consider checking PDB_PROP_CL_NOSNOOP_NOT_CAPABLE and
+    // moving this to memdescSetCpuCacheAttrib().
+    //
 #if (defined(NVCPU_AARCH64) && RMCFG_MODULE_CL)
     {
         OBJCL *pCl = SYS_GET_CL(pSys);
@@ -1665,9 +1670,53 @@ NV_STATUS osUserHandleToKernelPtr(NvHandle hClient, NvP64 hEvent, NvP64 *pEvent)
     return result;
 }
 
-NV_STATUS osFlushCpuCache(void)
+ct_assert(OS_DMA_SYNC_TO_DEVICE == NV_OS_DMA_SYNC_TO_DEVICE);
+ct_assert(OS_DMA_SYNC_FROM_DEVICE == NV_OS_DMA_SYNC_FROM_DEVICE);
+ct_assert(OS_DMA_SYNC_TO_FROM_DEVICE == NV_OS_DMA_SYNC_TO_FROM_DEVICE);
+
+NV_STATUS osDmaSyncMem
+(
+    MEMORY_DESCRIPTOR *pMemDesc,
+    NvU32 dir
+)
 {
-    return os_flush_cpu_cache_all();
+    OBJGPU *pGpu = pMemDesc->pGpu;
+    KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
+
+    if ((pKernelBif == NULL) ||
+        kbifIsSnoopDmaCapable(pGpu, pKernelBif) ||
+        (memdescGetCpuCacheAttrib(pMemDesc) != NV_MEMORY_CACHED))
+    {
+        return NV_OK;
+    }
+
+    nv_state_t *nv = NV_GET_NV_STATE(pGpu);
+    if (nv->iovaspace_id == NV_IOVA_DOMAIN_NONE)
+    {
+        return NV_ERR_INVALID_ARGUMENT;
+    }
+
+    PIOVAMAPPING pIovaMapping = memdescGetIommuMap(pMemDesc, nv->iovaspace_id);
+    //
+    // This should only be called for devices that map memory descriptors
+    // through the nv-dma library, where the memory descriptor data
+    // contains all the kernel-specific context we need for the
+    // cache maintenance.
+    //
+    // (These checks match those in osIovaUnmap() leading up to
+    // nv_dma_unmap_alloc()).
+    //
+    if (pIovaMapping == NULL ||
+        pIovaMapping->pOsData == NULL ||
+        memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_GUEST_ALLOCATED) ||
+        memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_PEER_IO_MEM))
+    {
+        return NV_ERR_INVALID_ARGUMENT;
+    }
+
+    nv_dma_sync(nv->dma_dev, pIovaMapping->pOsData, dir);
+
+    return NV_OK;
 }
 
 void osFlushCpuWriteCombineBuffer(void)
@@ -1706,6 +1755,16 @@ void osFlushGpuCoherentCpuCacheRange
     nv_flush_coherent_cpu_cache_range(pOsGpuInfo, cpuVirtual, size);
 }
 
+NvBool osDevIsDmaCoherent
+(
+    OBJGPU *pGpu
+)
+{
+    nv_state_t *nv = NV_GET_NV_STATE(pGpu);
+
+    return nv_dev_is_dma_coherent(nv->dma_dev);
+}
+
 void osErrorLogV(OBJGPU *pGpu, XidContext context, const char * pFormat, va_list arglist)
 {
     NV_STATUS rmStatus;
@@ -2063,7 +2122,7 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL
     Memory *pMemory;
     MEMORY_DESCRIPTOR *pMemDesc;
     NvU64 start, end;
-    NvBool bInvalidateOnly;
+    NvU32 syncDir;
 
     NV_CHECK_OK_OR_RETURN(LEVEL_SILENT,
                           memGetByHandle(RES_GET_CLIENT(pRmCliRes),
@@ -2091,13 +2150,16 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL
     switch(pAddressSpaceParams->cacheOps)
     {
-        case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH_INVALIDATE:
         case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH:
-            bInvalidateOnly = NV_FALSE;
+            syncDir = OS_DMA_SYNC_TO_DEVICE;
             break;
 
         case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_INVALIDATE:
-            bInvalidateOnly = NV_TRUE;
+            syncDir = OS_DMA_SYNC_FROM_DEVICE;
+            break;
+
+        case NV0000_CTRL_OS_UNIX_FLAGS_USER_CACHE_FLUSH_INVALIDATE:
+            syncDir = OS_DMA_SYNC_TO_FROM_DEVICE;
             break;
 
         default:
@@ -2113,54 +2175,7 @@ cliresCtrlCmdOsUnixFlushUserCache_IMPL
         return NV_ERR_INVALID_LIMIT;
     }
 
-    if (bInvalidateOnly)
-    {
-        //
-        // XXX: this seems fishy - I'm not sure if invalidating by the kernel
-        // VA only as nv_dma_cache_invalidate() does here is sufficient for
-        // this control call.
-        // pAddressSpaceParams->internalOnly is expected to be the RM client
-        // VA for this control call; if we wanted to invalidate the user VA we
-        // could do so using that.
-        //
-        // For I/O coherent platforms this won't actually do anything.
-        // On non-I/O-coherent platforms, there's no need to do a second
-        // invalidation after the full flush.
-        //
-        nv_state_t *nv = NV_GET_NV_STATE(pMemDesc->pGpu);
-        if (nv->iovaspace_id != NV_IOVA_DOMAIN_NONE)
-        {
-            PIOVAMAPPING pIovaMapping = memdescGetIommuMap(pMemDesc, nv->iovaspace_id);
-            //
-            // This should only be called for devices that map memory descriptors
-            // through the nv-dma library, where the memory descriptor data
-            // contains all the kernel-specific context we need for the
-            // invalidation.
-            //
-            // (These checks match those in osIovaUnmap() leading up to
-            // nv_dma_unmap_alloc()).
-            //
-            if (pIovaMapping == NULL ||
-                pIovaMapping->pOsData == NULL ||
-                memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_GUEST_ALLOCATED) ||
-                memdescGetFlag(pIovaMapping->pPhysMemDesc, MEMDESC_FLAGS_PEER_IO_MEM))
-            {
-                return NV_ERR_INVALID_ARGUMENT;
-            }
-
-            nv_dma_cache_invalidate(nv->dma_dev, pIovaMapping->pOsData);
-        }
-        else
-        {
-            return NV_ERR_INVALID_ARGUMENT;
-        }
-    }
-    else
-    {
-        return os_flush_user_cache();
-    }
-
-    return NV_OK;
+    return osDmaSyncMem(pMemDesc, syncDir);
 }
 
 static NV_STATUS
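[Reviewer illustration, not part of the patch] The early return in osDmaSyncMem() encodes when CPU cache maintenance is actually required; restated as a predicate for clarity:

    static NvBool exampleNeedsCpuSync(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc)
    {
        KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);

        return (pKernelBif != NULL) &&                                   // BIF state available
               !kbifIsSnoopDmaCapable(pGpu, pKernelBif) &&               // device cannot snoop CPU caches
               (memdescGetCpuCacheAttrib(pMemDesc) == NV_MEMORY_CACHED); // CPU lines can exist at all
    }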
diff --git a/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c b/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c
index 85866213b3..3d84012ca8 100644
--- a/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c
+++ b/src/nvidia/arch/nvalloc/unix/src/osmemdesc.c
@@ -593,7 +593,7 @@ osCreateOsDescriptorFromPhysAddr
     MEMORY_DESCRIPTOR *pMemDesc;
     NvU64 *pPteArray;
     NvU64 base = 0;
-    NvU32 cache_type = NV_MEMORY_CACHED;
+    NvU32 cache_type = NV_MEMORY_DEFAULT;
     NvU64 memdescFlags = MEMDESC_FLAGS_NONE;
     NvU64 *pPhys_addrs;
     NvU64 num_os_pages;
@@ -750,7 +750,7 @@ _createMemdescFromDmaBufSgtHelper
     }
     else if (!FLD_TEST_DRF(OS02, _FLAGS, _COHERENCY, _UNCACHED, flags))
     {
-        cacheType = NV_MEMORY_CACHED;
+        cacheType = NV_MEMORY_DEFAULT;
     }
 
     if (FLD_TEST_DRF(OS02, _FLAGS, _GPU_CACHEABLE, _YES, flags))

diff --git a/src/nvidia/generated/g_mem_desc_nvoc.h b/src/nvidia/generated/g_mem_desc_nvoc.h
index 0eb04d518e..cf67fa3c61 100644
--- a/src/nvidia/generated/g_mem_desc_nvoc.h
+++ b/src/nvidia/generated/g_mem_desc_nvoc.h
@@ -1511,7 +1511,6 @@ NV_STATUS memdescSendMemDescToGSP(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc, NvH
 
 // cache maintenance functions
 void memdescFlushGpuCaches(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc);
-void memdescFlushCpuCaches(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc);
 
 // Map memory descriptor for RM internal access
 void* memdescMapInternal(OBJGPU *pGpu, MEMORY_DESCRIPTOR *pMemDesc, NvU32 flags);

diff --git a/src/nvidia/generated/g_os_nvoc.h b/src/nvidia/generated/g_os_nvoc.h
index 08c3d1a3a9..f94dc0f3d2 100644
--- a/src/nvidia/generated/g_os_nvoc.h
+++ b/src/nvidia/generated/g_os_nvoc.h
@@ -649,12 +649,10 @@ NV_STATUS __nvoc_objCreate_OBJOS(OBJOS**, Dynamic*, NvU32);
 
 NV_STATUS addProbe(OBJGPU *, NvU32);
 
-typedef NV_STATUS OSFlushCpuCache(void);
 typedef void OSAddRecordForCrashLog(void *, NvU32);
 typedef void OSDeleteRecordForCrashLog(void *);
 
-OSFlushCpuCache osFlushCpuCache;
 OSAddRecordForCrashLog osAddRecordForCrashLog;
 OSDeleteRecordForCrashLog osDeleteRecordForCrashLog;
 
@@ -794,11 +792,20 @@ NV_STATUS rm_is_vgpu_supported_device(OS_GPU_INFO *pNv, NvU32 pmc_boot_1,
 NV_STATUS osLockPageableDataSection(RM_PAGEABLE_SECTION *pSection);
 NV_STATUS osUnlockPageableDataSection(RM_PAGEABLE_SECTION *pSection);
 
+#define OS_DMA_SYNC_TO_DEVICE      NVBIT(0) // CPU flush
+#define OS_DMA_SYNC_FROM_DEVICE    NVBIT(1) // CPU invalidate
+#define OS_DMA_SYNC_TO_FROM_DEVICE (OS_DMA_SYNC_TO_DEVICE | OS_DMA_SYNC_FROM_DEVICE) // CPU flush + invalidate
+
+NV_STATUS osDmaSyncMem(MEMORY_DESCRIPTOR *pMemDesc,
+                       NvU32 dir);
+
 void osFlushGpuCoherentCpuCacheRange(OS_GPU_INFO *pOsGpuInfo,
                                      NvU64 cpuVirtual,
                                      NvU64 size);
 
 NvBool osUidTokensEqual(PUID_TOKEN arg1, PUID_TOKEN arg2);
 
+NvBool osDevIsDmaCoherent(OBJGPU *pGpu);
+
 NV_STATUS osValidateClientTokens(PSECURITY_TOKEN arg1, PSECURITY_TOKEN arg2);
 
 PUID_TOKEN osGetCurrentUidToken(void);
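[Reviewer illustration, not part of the patch] The RM-side call pattern that the following files adopt, assuming a sysmem descriptor mapped through nv-dma (mirrors the spdm_gh100.c and kern_bus call sites below):

    // CPU produced data the device will consume: flush before kicking off DMA.
    portMemCopy(pCpuVa, size, pSrc, size);
    osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE);

    // Device produced data the CPU will consume: invalidate before reading.
    osDmaSyncMem(pMemDesc, OS_DMA_SYNC_FROM_DEVICE);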
diff --git a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
index d1e49249ef..7b2def3bb1 100644
--- a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
+++ b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
@@ -2954,7 +2954,7 @@ _kbusInternalBar1Unmap
         mapRemove(&pVaInfo->reverseMap, ppVaToType);
 
         // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH
-        memdescFlushCpuCaches(pGpu, pType->pMemDesc);
+        osDmaSyncMem(pType->pMemDesc, OS_DMA_SYNC_TO_DEVICE);
         dmaFreeMapping_HAL(pGpu, pDma, pVAS, virtRange.start, pType->pMemDesc, 0, NULL);
     }
@@ -3263,7 +3263,7 @@ kbusUnmapFbAperture_GM107
         OBJVASPACE *pVAS = pBar1VaInfo->pVAS;
 
         // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH
-        memdescFlushCpuCaches(pGpu, pMemDesc);
+        osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE);
         dmaFreeMapping_HAL(pGpu, pDma, pVAS, memArea.pRanges[0].start, pMemDesc, 0, NULL);
 
         goto done;

diff --git a/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c b/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c
index ce3346e1ef..2d82d28249 100644
--- a/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c
+++ b/src/nvidia/src/kernel/gpu/bus/arch/turing/kern_bus_tu102.c
@@ -939,7 +939,7 @@ NV_STATUS kbusDecreaseStaticBar1Refcount_TU102
                         NV_ERR_INVALID_STATE);
 
     // TODO: investigate whether the tegra wbinvd flush is really necessary, seems only useful for SYSMEM_COH
-    memdescFlushCpuCaches(pGpu, pMemDesc);
+    osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE);
 
     pRootMemDesc = memdescGetRootMemDesc(pMemDesc, NULL);

diff --git a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c
index c31d829d84..c942e07887 100644
--- a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c
+++ b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c
@@ -94,7 +94,7 @@ _kccuAllocMemory
     // Create a memory descriptor data structure for the shared buffer
     status = memdescCreate(&pKernelCcu->pMemDesc[idx], pGpu, shrBufSize, 0, NV_MEMORY_CONTIGUOUS,
-                           aperture, NV_MEMORY_CACHED, flags);
+                           aperture, NV_MEMORY_DEFAULT, flags);
     if (status != NV_OK)
     {
         NV_PRINTF(LEVEL_ERROR, "CCU memdescCreate failed for(%u) with status: 0x%x\n", idx, status);
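[Reviewer illustration, not part of the patch] This and the following call sites replace NV_MEMORY_CACHED with the new NV_MEMORY_DEFAULT. The attribute is resolved in memdescSetCpuCacheAttrib() (see the mem_desc.c hunk later in this patch); condensed, with illustrative flag names:

    if (cpuCacheAttrib == NV_MEMORY_DEFAULT)
    {
        // Cached only where coherency is guaranteed: CPU-only descriptors
        // or chipsets reporting I/O coherence; uncached otherwise.
        cpuCacheAttrib = (bCpuOnly || bChipsetIoCoherent) ? NV_MEMORY_CACHED
                                                          : NV_MEMORY_UNCACHED;
    }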
diff --git a/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c b/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c
index 0d962d0954..ed4a9a0c60 100644
--- a/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c
+++ b/src/nvidia/src/kernel/gpu/disp/inst_mem/disp_inst_mem.c
@@ -243,7 +243,7 @@ instmemInitMemDesc
         // dispInstMemAttr to NV_MEMORY_CACHED this needs to be set based on system configuration/registry parameter.
         //
         instmemSetMemory(pGpu, pInstMem,
-                         ADDR_SYSMEM, NV_MEMORY_CACHED,
+                         ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                          0 /* base */, instMemSize);
     }
     else if (IS_GSP_CLIENT(pGpu))

diff --git a/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c b/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c
index 95f0adcdbb..c4a66dff62 100644
--- a/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c
+++ b/src/nvidia/src/kernel/gpu/falcon/kernel_crashcat_engine.c
@@ -59,7 +59,7 @@ NV_STATUS kcrashcatEngineConfigure_IMPL
         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
             memdescCreate(&pKernelCrashCatEng->pQueueMemDesc, pKernelCrashCatEng->pGpu,
                           pEngConfig->allocQueueSize, CRASHCAT_QUEUE_ALIGNMENT, NV_TRUE,
-                          ADDR_SYSMEM, NV_MEMORY_CACHED, MEMDESC_FLAGS_NONE));
+                          ADDR_SYSMEM, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE));
 
         NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
                             memdescAlloc(pKernelCrashCatEng->pQueueMemDesc),
@@ -230,7 +230,7 @@ static MEMORY_DESCRIPTOR *_kcrashcatEngineCreateBufferMemDesc
     NV_ADDRESS_SPACE bufAddrSpace = _crashcatApertureToAddressSpace(pBufDesc->aperture);
 
     NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR,
         memdescCreate(&pMemDesc, pKernelCrashCatEng->pGpu, pBufDesc->size, 0,
-                      NV_TRUE, bufAddrSpace, NV_MEMORY_CACHED, MEMDESC_FLAGS_NONE),
+                      NV_TRUE, bufAddrSpace, NV_MEMORY_DEFAULT, MEMDESC_FLAGS_NONE),
         return NULL;);
 
     memdescDescribe(pMemDesc, bufAddrSpace, pBufDesc->physOffset, pBufDesc->size);

diff --git a/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c b/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c
index 4ade402e3a..570ee156e3 100644
--- a/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c
+++ b/src/nvidia/src/kernel/gpu/fifo/arch/volta/kernel_channel_group_gv100.c
@@ -47,7 +47,7 @@ kchangrpAllocFaultMethodBuffers_GV100
     NvU32      runQueues     = kfifoGetNumRunqueues_HAL(pGpu, pKernelFifo);
     NvU32      index         = 0;
     NvU32      faultBufApert = ADDR_SYSMEM;
-    NvU32      faultBufAttr  = NV_MEMORY_CACHED;
+    NvU32      faultBufAttr  = NV_MEMORY_DEFAULT;
     NvU64      memDescFlags  = MEMDESC_FLAGS_LOST_ON_SUSPEND;
     HW_ENG_FAULT_METHOD_BUFFER *pFaultMthdBuf = NULL;
     NvU32      gfid          = pKernelChannelGroup->gfid;
@@ -85,14 +85,12 @@ kchangrpAllocFaultMethodBuffers_GV100
         // host, force fault buffer aperture to vid mem.
         //
         faultBufApert = ADDR_FBMEM;
-        faultBufAttr  = NV_MEMORY_CACHED;
         memDescFlags |= MEMDESC_FLAGS_OWNED_BY_CURRENT_DEVICE;
     }
     else
     {
         // Get the right aperture/attribute
         faultBufApert = ADDR_SYSMEM;
-        faultBufAttr  = NV_MEMORY_CACHED;
         memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_3, _FAULT_METHOD_BUFFER, pGpu->instLocOverrides3),
                                "fault method buffer", &faultBufApert, &faultBufAttr);
         if (faultBufApert == ADDR_FBMEM)

diff --git a/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c b/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c
index 6cc75cb38d..94facb30b7 100644
--- a/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c
+++ b/src/nvidia/src/kernel/gpu/fsp/arch/hopper/kern_fsp_gh100.c
@@ -1036,7 +1036,7 @@ kfspSetupGspImages
 {
     NV_ASSERT(pKernelFsp->pGspFmcMemdesc == NULL); // If we assert the pointer becomes a zombie.
     status = memdescCreate(&pKernelFsp->pGspFmcMemdesc, pGpu, pGspImageMapSize,
-                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags);
+                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags);
     NV_ASSERT_OR_GOTO(status == NV_OK, failed);
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7,
@@ -1381,7 +1381,7 @@ kfspPrepareBootCommands_GH100
 {
     NV_ASSERT(pKernelFsp->pSysmemFrtsMemdesc == NULL); // If we assert the pointer becomes a zombie.
     status = memdescCreate(&pKernelFsp->pSysmemFrtsMemdesc, pGpu, frtsSize,
-                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags);
+                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags);
     NV_ASSERT_OR_GOTO(status == NV_OK, failed);
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_8,
diff --git a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
index 70e5d8d5b6..bf63f8d1cb 100644
--- a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
+++ b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
@@ -288,7 +288,14 @@ gpuCreateRusdMemory_IMPL
     if ((sysGetStaticConfig(SYS_GET_INSTANCE()))->bOsCCEnabled)
         return NV_OK;
 
-    // Create a kernel-side mapping for writing RUSD data
+    //
+    // Create a kernel-side mapping for writing RUSD data.
+    // This must be cached memory because it is accessed with atomic
+    // intrinsics, which some Arm platforms do not support on uncached memory.
+    //
+    // XXX: There might be coherency issues with this allocation, although
+    // statistics appear fine at a quick glance.
+    //
     NV_ASSERT_OK_OR_RETURN(memdescCreate(ppMemDesc, pGpu, sizeof(NV00DE_SHARED_DATA), 0, NV_TRUE,
                                          ADDR_SYSMEM, NV_MEMORY_CACHED,
                                          MEMDESC_FLAGS_USER_READ_ONLY));

diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
index f78f0f9ff3..55a16597fc 100644
--- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
+++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
@@ -179,7 +179,7 @@ kgraphicsConstructEngine_IMPL
     // FECS event buffer defaults to cached SYSMEM
     pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].pAllocList = ADDRLIST_SYSMEM_ONLY;
-    pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].cpuAttr = NV_MEMORY_CACHED;
+    pKernelGraphics->globalCtxBuffersInfo.globalCtxAttr[GR_GLOBALCTX_BUFFER_FECS_EVENT].cpuAttr = NV_MEMORY_DEFAULT;
 
     // Process instloc overrides
     {

diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
index c581db3b22..3f65e8a43a 100644
--- a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
+++ b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
@@ -162,7 +162,7 @@ kgspAllocBootArgs_GH100
     NV_ASSERT_OK_OR_GOTO(nvStatus,
                          memdescCreate(&pKernelGsp->pGspFmcArgumentsDescriptor,
                                        pGpu, sizeof(GSP_FMC_BOOT_PARAMS), 0x1000,
-                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                                        flags),
                          _kgspAllocBootArgs_exit_cleanup);

diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
index 39d9635b69..bd57883a9d 100644
--- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
+++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
@@ -118,7 +118,7 @@ kgspAllocBootArgs_TU102
     NV_ASSERT_OK_OR_GOTO(nvStatus,
                          memdescCreate(&pKernelGsp->pWprMetaDescriptor,
                                        pGpu, 0x1000, 0x1000,
-                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                                        flags),
                          _kgspAllocBootArgs_exit_cleanup);
@@ -174,7 +174,7 @@ kgspAllocBootArgs_TU102
     NV_ASSERT_OK_OR_GOTO(nvStatus,
                          memdescCreate(&pKernelGsp->pGspArgumentsDescriptor,
                                        pGpu, 0x1000, 0x1000,
-                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                                        flags),
                          _kgspAllocBootArgs_exit_cleanup);
diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
index e262e8fc3f..1565e752a7 100644
--- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
+++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
@@ -2638,7 +2638,7 @@ _setupLogBufferVgpu
                       pGpu,
                       logVgpuSetupParams.bufSize,
                       RM_PAGE_SIZE,
-                      NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED,
+                      NV_TRUE, ADDR_FBMEM, NV_MEMORY_DEFAULT,
                       MEMDESC_FLAGS_NONE),
         exit);
@@ -2942,7 +2942,7 @@ _setupLogBufferBaremetal
                      pGpu,
                      size,
                      RM_PAGE_SIZE,
-                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                      flags),
        exit);
@@ -3337,7 +3337,7 @@ _kgspSetupTaskRMCoverageStructure (
                      pGpu,
                      BULLSEYE_GSP_RM_COVERAGE_SIZE,
                      RM_PAGE_SIZE,
-                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                      MEMDESC_FLAGS_NONE),
        done);
    memdescTagAlloc(nvStatus,
@@ -4339,7 +4339,7 @@ kgspPrepareBootBinaryImage_IMPL
                      pGpu,
                      bufSizeAligned,
                      RM_PAGE_SIZE,
-                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                     NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                      flags),
        fail);
@@ -4424,7 +4424,7 @@ _kgspCreateSignatureMemdesc
     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
                           memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu,
                                         NV_ALIGN_UP(pGspFw->signatureSize, 256), 256,
-                                        NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags));
+                                        NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags));
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16,
                     pKernelGsp->pSignatureMemdesc);
@@ -4712,7 +4712,7 @@ kgspCreateRadix3_IMPL
                           LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE,
                           NV_MEMORY_NONCONTIGUOUS,
                           ADDR_SYSMEM,
-                          NV_MEMORY_CACHED,
+                          NV_MEMORY_DEFAULT,
                           flags),
         done);

diff --git a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c
index 80c7212c5a..66de4947e3 100644
--- a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c
+++ b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c
@@ -230,7 +230,7 @@ GspMsgQueuesInit
     //
     NV_ASSERT_OK_OR_GOTO(nvStatus,
         memdescCreate(&pMQCollection->pSharedMemDesc, pGpu, sharedBufSize,
-                      RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                      RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                       flags),
         done);

diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c
index a9505db36c..a1194f701c 100644
--- a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c
+++ b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/fbsr_gm107.c
@@ -202,7 +202,7 @@ fbsrInit_GM107(OBJGPU *pGpu, OBJFBSR *pFbsr)
     // to to use cached memory.
     //
     status = memdescCreate(&pFbsr->pSysMemDesc, pGpu, memSize,
-                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                            MEMDESC_FLAGS_NONE);
     if (status != NV_OK)
     {
@@ -371,7 +371,7 @@ fbsrBegin_GM107(OBJGPU *pGpu, OBJFBSR *pFbsr, FBSR_OP_TYPE op)
             // On Windows, pageable memory is also cacheable.
             status = memdescCreate(&pFbsr->pSysMemDesc, pGpu,
                                    pFbsr->length, 0, NV_FALSE,
-                                   ADDR_SYSMEM, NV_MEMORY_CACHED,
+                                   ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                                    MEMDESC_FLAGS_PAGED_SYSMEM);
         }
         if (status != NV_OK)
diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c
index a2a01ff0e7..1a723ffed9 100644
--- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c
+++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c
@@ -38,6 +38,7 @@
 #include "mem_mgr/virt_mem_mgr.h"
 #include "core/system.h"
 #include "vgpu/vgpu_util.h"
+#include "platform/chipset/chipset.h"
 #include "platform/sli/sli.h"
 #include "resserv/rs_client.h"
@@ -2107,29 +2108,6 @@ memdescFlushGpuCaches
     }
 }
 
-void
-memdescFlushCpuCaches
-(
-    OBJGPU            *pGpu,
-    MEMORY_DESCRIPTOR *pMemDesc
-)
-{
-    // Flush WC to get the data written to this mapping out to memory
-    osFlushCpuWriteCombineBuffer();
-
-    KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
-
-    // Special care is needed on SOC, where the GPU cannot snoop the CPU L2
-    if ((pKernelBif != NULL) &&
-        !kbifIsSnoopDmaCapable(pGpu, pKernelBif) &&
-        (memdescGetCpuCacheAttrib(pMemDesc) == NV_MEMORY_CACHED))
-    {
-        // Flush CPU L2 so that the GPU will see any changes the CPU made
-        osFlushCpuCache();
-    }
-}
-
-
 /*
  * @brief map memory descriptor for internal access
  *
@@ -2158,7 +2136,10 @@ memdescMapInternal
     // We need to flush & invalidate GPU L2 cache only for directed BAR mappings.
     // Reflected BAR mappings will access memory via GPU, and hence go through GPU L2 cache.
     if (mapType == MEMDESC_MAP_INTERNAL_TYPE_SYSMEM_DIRECT)
+    {
         memdescFlushGpuCaches(pGpu, pMemDesc);
+        osDmaSyncMem(pMemDesc, OS_DMA_SYNC_FROM_DEVICE);
+    }
 
     if (pMemDesc->_pInternalMapping != NULL)
     {
@@ -2234,7 +2215,7 @@ void memdescUnmapInternal
     if (mapType == MEMDESC_MAP_INTERNAL_TYPE_SYSMEM_DIRECT ||
         mapType == MEMDESC_MAP_INTERNAL_TYPE_BAR2)
     {
-        memdescFlushCpuCaches(pGpu, pMemDesc);
+        osDmaSyncMem(pMemDesc, OS_DMA_SYNC_TO_DEVICE);
     }
 
     if (--pMemDesc->_internalMappingRefCount == 0)
@@ -3660,6 +3641,27 @@ void memdescSetCpuCacheAttrib
     NvU32 cpuCacheAttrib
 )
 {
+    //
+    // Use NV_MEMORY_DEFAULT to get a reasonable default caching type for the
+    // given descriptor (i.e. one that is coherent with device DMA), unless
+    // explicit cache maintenance is done (for performance reasons) or there
+    // are specific memory requirements (e.g. atomics need NV_MEMORY_CACHED
+    // on Arm).
+    //
+    if (cpuCacheAttrib == NV_MEMORY_DEFAULT)
+    {
+        OBJCL *pCl = SYS_GET_CL(SYS_GET_INSTANCE());
+
+        if (memdescGetFlag(pMemDesc, MEMDESC_FLAGS_CPU_ONLY) ||
+            ((pCl != NULL) && pCl->getProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT)))
+        {
+            cpuCacheAttrib = NV_MEMORY_CACHED;
+        }
+        else
+        {
+            cpuCacheAttrib = NV_MEMORY_UNCACHED;
+        }
+    }
+
     //
     // When running 64-bit MODS on ARM v8, we need to force all CPU mappings as WC.
     // This seems to be an issue with glibc. See bug 1556221.
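[Reviewer illustration, not part of the patch] With memdescFlushCpuCaches() removed, internal mappings now pair directional syncs instead of a full-cache flush:

    void *va = memdescMapInternal(pGpu, pMemDesc, flags);
    // ^ direct sysmem maps invalidate first: OS_DMA_SYNC_FROM_DEVICE

    /* ... CPU reads/writes through va ... */

    memdescUnmapInternal(pGpu, pMemDesc, flags);
    // ^ flushes CPU writes back out: OS_DMA_SYNC_TO_DEVICE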
diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c
index c1318ad499..d9dc103b43 100644
--- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c
+++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_utils.c
@@ -162,7 +162,7 @@ _memmgrAllocAndMapSurface
     NV_ASSERT_OK_OR_RETURN(
         memdescCreate(ppMemDesc, pGpu, size, RM_PAGE_SIZE, NV_TRUE,
-                      ADDR_SYSMEM, NV_MEMORY_CACHED, flags));
+                      ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags));
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_77, (*ppMemDesc));

diff --git a/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c b/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c
index ee4f0c75d0..ab1a2072e9 100644
--- a/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c
+++ b/src/nvidia/src/kernel/gpu/mmu/kern_gmmu.c
@@ -999,14 +999,14 @@ kgmmuFaultBufferGetAddressSpace_IMPL
     if (index == NON_REPLAYABLE_FAULT_BUFFER)
     {
         faultBufferAddrSpace = bAllocInVidmem ? ADDR_FBMEM : ADDR_SYSMEM;
-        faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_CACHED;
+        faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_DEFAULT;
         memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_3, _UVM_FAULT_BUFFER_NONREPLAYABLE, pGpu->instLocOverrides3),
                                "UVM non-replayable fault", &faultBufferAddrSpace, &faultBufferAttr);
     }
     else if (index == REPLAYABLE_FAULT_BUFFER)
     {
         faultBufferAddrSpace = bAllocInVidmem ? ADDR_FBMEM : ADDR_SYSMEM;
-        faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_CACHED;
+        faultBufferAttr = bAllocInVidmem ? NV_MEMORY_UNCACHED : NV_MEMORY_DEFAULT;
         memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_4, _UVM_FAULT_BUFFER_REPLAYABLE, pGpu->instLocOverrides4),
                                "UVM replayable fault", &faultBufferAddrSpace, &faultBufferAttr);
     }
@@ -1493,7 +1493,7 @@ _kgmmuClientShadowFaultBufferQueueAllocate
     status = memdescCreate(&pQueueMemDesc, pGpu,
                            sizeof(GMMU_SHADOW_FAULT_BUF), RM_PAGE_SIZE,
-                           NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                           NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                            flags);
     if (status != NV_OK)
     {
@@ -1591,7 +1591,7 @@ _kgmmuClientShadowFaultBufferPagesAllocate
     status = memdescCreate(&pMemDesc, pGpu,
                            shadowFaultBufferSizeTotal, RM_PAGE_SIZE,
-                           NV_FALSE, ADDR_SYSMEM, NV_MEMORY_CACHED,
+                           NV_FALSE, ADDR_SYSMEM, NV_MEMORY_DEFAULT,
                            flags);
     if (status != NV_OK)
     {

diff --git a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c
index a4950c29d3..792c46b73a 100644
--- a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c
+++ b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb10b.c
@@ -485,7 +485,7 @@ ksec2SetupGspImages_GB10B
     pGspImageMapSize = NV_ALIGN_UP(pGspImageSize, 0x1000);
 
     status = memdescCreate(&pKernelSec2->pGspFmcMemdesc, pGpu, pGspImageMapSize,
-                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags);
+                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags);
     NV_ASSERT_OR_GOTO(status == NV_OK, failed);
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7,

diff --git a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c
index 466eae4e4a..633a73246a 100644
--- a/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c
+++ b/src/nvidia/src/kernel/gpu/sec2/arch/blackwell/kernel_sec2_gb20b.c
@@ -779,7 +779,7 @@ ksec2SetupGspImages_GB20B
     pGspImageMapSize = NV_ALIGN_UP(pGspImageSize, 0x1000);
 
     status = memdescCreate(&pKernelSec2->pGspFmcMemdesc, pGpu, pGspImageMapSize,
-                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags);
+
                           0, NV_TRUE, ADDR_SYSMEM, NV_MEMORY_DEFAULT, flags);
     NV_ASSERT_OR_GOTO(status == NV_OK, failed);
 
     memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_7,

diff --git a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c
index af0e35761f..f057c37558 100644
--- a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c
+++ b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c
@@ -848,7 +848,7 @@ spdmMessageProcess_GH100
 
     // First copy payload to shared buffer
     portMemCopy(pPayloadBuffer, requestSize, pRequest, requestSize);
-    memdescFlushCpuCaches(pGpu, pSpdm->pPayloadBufferMemDesc);
+    osDmaSyncMem(pSpdm->pPayloadBufferMemDesc, OS_DMA_SYNC_TO_DEVICE);
 
     // Trigger message pending value, then poll for response from GSP
     kflcnRegWrite_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX0, NV_SPDM_REQUESTER_MESSAGE_PENDING_TOKEN);

diff --git a/src/nvidia/src/kernel/gpu/spdm/spdm.c b/src/nvidia/src/kernel/gpu/spdm/spdm.c
index 3c35b78c31..1eb8cfa1c9 100644
--- a/src/nvidia/src/kernel/gpu/spdm/spdm.c
+++ b/src/nvidia/src/kernel/gpu/spdm/spdm.c
@@ -274,7 +274,7 @@ spdmSetupCommunicationBuffers_IMPL
     // Create memory descriptor for payload buffer
     status = memdescCreate(&pSpdm->pPayloadBufferMemDesc, pGpu, NV_SPDM_SYSMEM_SURFACE_SIZE_PAGE_ALIGNED,
                            NV_SPDM_SYSMEM_SURFACE_ALIGNMENT_IN_BYTES, NV_TRUE, ADDR_SYSMEM,
-                           NV_MEMORY_CACHED, MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY);
+                           NV_MEMORY_DEFAULT, MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY);
     if (status != NV_OK || pSpdm->pPayloadBufferMemDesc == NULL)
     {
         status = NV_ERR_INSUFFICIENT_RESOURCES;

diff --git a/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c b/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c
index 1ff46bab61..1661a8d213 100644
--- a/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c
+++ b/src/nvidia/src/kernel/gpu/uvm/arch/volta/uvm_gv100.c
@@ -292,7 +292,7 @@ uvmInitAccessCntrBuffer_GV100
     accessCntrBufferSize = uvmGetAccessCounterBufferSize_HAL(pGpu, pUvm, pAccessCounterBuffer->accessCounterIndex);
 
     accessCntrBufferAperture = ADDR_SYSMEM;
-    accessCntrBufferAttr = NV_MEMORY_CACHED;
+    accessCntrBufferAttr = NV_MEMORY_DEFAULT;
     memdescOverrideInstLoc(DRF_VAL(_REG_STR_RM, _INST_LOC_4, _UVM_FAULT_BUFFER_REPLAYABLE, pGpu->instLocOverrides4),
                            "UVM access counter", &accessCntrBufferAperture, &accessCntrBufferAttr);

diff --git a/src/nvidia/src/kernel/gpu/uvm/uvm.c b/src/nvidia/src/kernel/gpu/uvm/uvm.c
index 6565ff8684..7f164b08f8 100644
--- a/src/nvidia/src/kernel/gpu/uvm/uvm.c
+++ b/src/nvidia/src/kernel/gpu/uvm/uvm.c
@@ -242,7 +242,7 @@ uvmAccessCntrBufferRegister_IMPL
     NV_STATUS status;
     MEMORY_DESCRIPTOR *pMemDesc;
     NvU32 addrSpace = ADDR_SYSMEM;
-    NvU32 attr = NV_MEMORY_CACHED;
+    NvU32 attr = NV_MEMORY_DEFAULT;
 
     if (pUvm->pAccessCounterBuffers == NULL)
     {
diff --git a/src/nvidia/src/kernel/mem_mgr/mem.c b/src/nvidia/src/kernel/mem_mgr/mem.c
index 2a81684a5a..76e5f99f8c 100644
--- a/src/nvidia/src/kernel/mem_mgr/mem.c
+++ b/src/nvidia/src/kernel/mem_mgr/mem.c
@@ -1214,20 +1214,34 @@ void memSetSysmemCacheAttrib_IMPL
         gpuCacheAttrib = NV_MEMORY_UNCACHED;
     }
 
-    if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_UNCACHED)
-        cpuCacheAttrib = NV_MEMORY_UNCACHED;
-    else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_CACHED)
-        cpuCacheAttrib = NV_MEMORY_CACHED;
-    else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_COMBINE)
-        cpuCacheAttrib = NV_MEMORY_WRITECOMBINED;
-    else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_THROUGH)
-        cpuCacheAttrib = NV_MEMORY_CACHED;
-    else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_PROTECT)
-        cpuCacheAttrib = NV_MEMORY_CACHED;
-    else if (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr) == NVOS32_ATTR_COHERENCY_WRITE_BACK)
-        cpuCacheAttrib = NV_MEMORY_CACHED;
-    else
-        cpuCacheAttrib = 0;
+    switch (DRF_VAL(OS32, _ATTR, _COHERENCY, pAllocData->attr))
+    {
+        case NVOS32_ATTR_COHERENCY_UNCACHED:
+            cpuCacheAttrib = NV_MEMORY_UNCACHED;
+            break;
+        case NVOS32_ATTR_COHERENCY_WRITE_COMBINE:
+            cpuCacheAttrib = NV_MEMORY_WRITECOMBINED;
+            break;
+        case NVOS32_ATTR_COHERENCY_CACHED:
+        case NVOS32_ATTR_COHERENCY_WRITE_THROUGH:
+        case NVOS32_ATTR_COHERENCY_WRITE_PROTECT:
+        case NVOS32_ATTR_COHERENCY_WRITE_BACK:
+            //
+            // XXX: It's unclear in which cases the clients will perform their own
+            // CPU cache maintenance, but it only seems to happen when the GPU mapping
+            // is also cached (cliresCtrlCmdOsUnixFlushUserCache() will be called).
+            // This indicates that not all clients factor in hardware coherency support
+            // when requesting cached mappings, so it may be safer to just always use
+            // NV_MEMORY_DEFAULT, which only gives cached memory on coherent hardware.
+            //
+            cpuCacheAttrib = (gpuCacheAttrib == NV_MEMORY_CACHED) ? NV_MEMORY_CACHED :
+                                                                    NV_MEMORY_DEFAULT;
+            break;
+        default:
+            NV_ASSERT(0);
+            cpuCacheAttrib = NV_MEMORY_UNCACHED;
+            break;
+    }
 
     ct_assert(NVOS32_ATTR_COHERENCY_UNCACHED == NVOS02_FLAGS_COHERENCY_UNCACHED);
     ct_assert(NVOS32_ATTR_COHERENCY_CACHED == NVOS02_FLAGS_COHERENCY_CACHED);
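[Reviewer illustration, not part of the patch] A worked example of the reworked coherency switch: a client requesting NVOS32_ATTR_COHERENCY_WRITE_BACK on sysmem now resolves as

    cpuCacheAttrib = (gpuCacheAttrib == NV_MEMORY_CACHED) ? NV_MEMORY_CACHED   // client does its own maintenance
                                                          : NV_MEMORY_DEFAULT; // cached only on coherent HW

so only GPU-cached (self-maintaining) clients keep unconditional CPU caching.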
diff --git a/src/nvidia/src/kernel/platform/chipset/chipset.c b/src/nvidia/src/kernel/platform/chipset/chipset.c
index c76ef1028c..ad48145b1e 100644
--- a/src/nvidia/src/kernel/platform/chipset/chipset.c
+++ b/src/nvidia/src/kernel/platform/chipset/chipset.c
@@ -50,13 +50,6 @@ clConstruct_IMPL(OBJCL *pCl)
 
     pCl->pPcieConfigSpaceBase = NULL;
 
-    //
-    // We set this property by default.
-    // Chipset setup function can override this.
-    // Right now only Tegra chipsets overide this setting.
-    //
-    pCl->setProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT, NV_TRUE);
-
     return NV_OK;
 }

diff --git a/src/nvidia/src/kernel/platform/chipset/chipset_info.c b/src/nvidia/src/kernel/platform/chipset/chipset_info.c
index 9e546e62e2..10c82088d0 100644
--- a/src/nvidia/src/kernel/platform/chipset/chipset_info.c
+++ b/src/nvidia/src/kernel/platform/chipset/chipset_info.c
@@ -1179,6 +1179,14 @@ ARMV8_generic_setupFunc
     OBJCL *pCl
 )
 {
+    //
+    // Arm platforms have historically had issues (corruption, bus errors) with
+    // non-Device MMIO mappings. Unlike DMA coherency, there's no way to check
+    // for this at runtime. Therefore, in the absence of better chipset info,
+    // disable WC iomaps by default.
+    //
+    pCl->setProperty(pCl, PDB_PROP_CL_DISABLE_IOMAP_WC, NV_TRUE);
+
     return NV_OK;
 }
@@ -1351,6 +1359,19 @@ Ampere_AmpereOne_setupFunc
     return NV_OK;
 }
 
+// Generic setup function
+static NV_STATUS
+Generic_setupFunc
+(
+    OBJCL *pCl
+)
+{
+#if NVCPU_IS_FAMILY_ARM
+    return ARMV8_generic_setupFunc(pCl);
+#endif
+    return NV_OK;
+}
+
 void
 csGetInfoStrings
 (

diff --git a/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c b/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c
index d834ab4240..d057d81321 100644
--- a/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c
+++ b/src/nvidia/src/kernel/platform/chipset/chipset_pcie.c
@@ -971,6 +971,14 @@ clUpdatePcieConfig_IMPL(OBJGPU *pGpu, OBJCL *pCl)
 
     objClBuildPcieAtomicsAllowList(pGpu, pCl);
 
+    //
+    // Check if the GPU device is on a cache-coherent bus.
+    //
+    if (osDevIsDmaCoherent(pGpu))
+    {
+        pCl->setProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT, NV_TRUE);
+    }
+
     objClInitPcieChipset(pGpu, pCl);
 
     //
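[Reviewer illustration, not part of the patch] The end-to-end flow that replaces the old "assume I/O coherent" default from clConstruct_IMPL(); the functions are from this patch, with the glue condensed:

    // 1. During PCIe config update, ask the kernel about the GPU's bus:
    //    osDevIsDmaCoherent() -> nv_dev_is_dma_coherent() -> dev_is_dma_coherent()
    if (osDevIsDmaCoherent(pGpu))
        pCl->setProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT, NV_TRUE);

    // 2. NV_MEMORY_DEFAULT then resolves against that property in
    //    memdescSetCpuCacheAttrib(): cached on coherent chipsets, else uncached.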
diff --git a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c
index a4a95b045c..aff781ea1a 100644
--- a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c
+++ b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c
@@ -110,6 +110,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -4836,6 +4837,7 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device,
     NV_MEMORY_ALLOCATION_PARAMS memAllocParams = {0};
     NV_STATUS status = NV_OK;
     RM_API *pRmApi = rmapiGetInterface(RMAPI_EXTERNAL_KERNEL);
+    OBJGPU *pGpu = NULL;
 
     NvHandle physHandle = 0;
 
@@ -4843,6 +4845,9 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device,
     NV_ASSERT(device);
     NV_ASSERT(paOffset);
 
+    status = _nvGpuOpsGetGpuFromDevice(device, &pGpu);
+    NV_ASSERT_OR_RETURN((status == NV_OK) && (pGpu != NULL), NV_ERR_INVALID_ARGUMENT);
+
     // then allocate the physical memory in either sysmem or fb.
     memAllocParams.owner = HEAP_OWNER_RM_KERNEL_CLIENT;
@@ -4858,9 +4863,9 @@ static NV_STATUS nvGpuOpsAllocPhysical(struct gpuDevice *device,
                             DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) :
                             DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM);
 
-    // Always enable caching for System Memory as all the currently supported
-    // platforms are IO coherent.
-    NvBool bCached = isSystemMemory;
+    // Set CPU caching attribute
+    KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
+    NvBool bCached = isSystemMemory && kbifIsSnoopDmaCapable(pGpu, pKernelBif);
 
     memAllocParams.attr |= bCached ?
                            DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED):
                            DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
@@ -10295,7 +10300,7 @@ _shadowMemdescCreateFlcn(gpuRetainedChannel *retainedChannel,
         pCtxBufferInfo->alignment,
         pCtxBufferInfo->bIsContigous,
         pCtxBufferInfo->aperture,
-        NV_MEMORY_CACHED,
+        NV_MEMORY_DEFAULT,
         MEMDESC_FLAGS_NONE
     );
     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, status);
@@ -10395,7 +10400,7 @@ _shadowMemdescCreate(gpuRetainedChannel *retainedChannel,
         pCtxBufferInfo->alignment,
         pCtxBufferInfo->bIsContigous,
         pCtxBufferInfo->aperture,
-        NV_MEMORY_CACHED,
+        NV_MEMORY_DEFAULT,
         MEMDESC_FLAGS_NONE
     );
     if (status != NV_OK)

diff --git a/src/nvidia/src/kernel/vgpu/rpc.c b/src/nvidia/src/kernel/vgpu/rpc.c
index d2c5fe01ff..c983449578 100644
--- a/src/nvidia/src/kernel/vgpu/rpc.c
+++ b/src/nvidia/src/kernel/vgpu/rpc.c
@@ -260,7 +260,7 @@ _allocRpcMemDescSysmem(
                       0,
                       bContig,
                       ADDR_SYSMEM,
-                      NV_MEMORY_CACHED,
+                      NV_MEMORY_DEFAULT,
                       memdescFlag));
 
     memdescSetFlag(*ppMemDesc, MEMDESC_FLAGS_KERNEL_MODE, NV_TRUE);

diff --git a/src/nvidia/src/kernel/vgpu/vgpu_util.c b/src/nvidia/src/kernel/vgpu/vgpu_util.c
index 9db90f622b..5e1ec55516 100644
--- a/src/nvidia/src/kernel/vgpu/vgpu_util.c
+++ b/src/nvidia/src/kernel/vgpu/vgpu_util.c
@@ -143,7 +143,7 @@ NV_STATUS vgpuAllocSysmemPfnBitMapNode(OBJGPU *pGpu, VGPU_SYSMEM_PFN_BITMAP_NODE
                            0,
                            NV_MEMORY_NONCONTIGUOUS,
                            ADDR_SYSMEM,
-                           NV_MEMORY_CACHED,
+                           NV_MEMORY_DEFAULT,
                            memFlags);
     if (status != NV_OK)
     {