From 942bbfc9dccc2b229ff07ff01bd7abf78e93497c Mon Sep 17 00:00:00 2001
From: aendk
Date: Mon, 1 Dec 2025 11:39:46 +0100
Subject: [PATCH 1/2] Adds optimization to do fewer syncs between tokens in the CUDA backend

---
 ggml/src/ggml-backend-impl.h    | 13 +++++++------
 ggml/src/ggml-backend.cpp       | 31 ++++++++++++++++++++++++-------
 ggml/src/ggml-cpu/amx/amx.cpp   |  1 +
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 ++++++++++
 4 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986e8..e3fc45a622a 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -46,15 +46,16 @@ extern "C" {
         // (optional) initialize a tensor in the buffer (eg. add tensor extras)
         enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         // tensor data access
-        void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
-        void (*set_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*memset_tensor)            (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+        void (*set_tensor)               (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*set_tensor_sync_optional) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync);
+        void (*get_tensor)               (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
-        bool (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*cpy_tensor)               (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
         // clear the entire buffer
-        void (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
+        void (*clear)                    (ggml_backend_buffer_t buffer, uint8_t value);
         // (optional) reset any internal state due to tensor initialization, such as tensor extras
-        void (*reset)        (ggml_backend_buffer_t buffer);
+        void (*reset)                    (ggml_backend_buffer_t buffer);
     };
 
     struct ggml_backend_buffer {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index eeaf35c169f..fc713d6f801 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 
 #ifdef __APPLE__
 #include
@@ -284,7 +285,14 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    buf->iface.set_tensor(buf, tensor, data, offset, size);
+    static bool disable_sync_optimization = (getenv("GGML_CUDA_DISABLE_SYNC_OPTIMIZATION") != nullptr);
+
+    if (!disable_sync_optimization && buf->iface.set_tensor_sync_optional != NULL) {
+        buf->iface.set_tensor_sync_optional(buf, tensor, data, offset, size, false);
+    } else {
+        buf->iface.set_tensor(buf, tensor, data, offset, size);
+    }
+
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -602,6 +610,7 @@ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
     /* .init_tensor   = */ NULL,
     /* .memset_tensor = */ NULL,
     /* .set_tensor    = */ NULL,
+    /* .set_tensor_s_o = */ NULL,
     /* .get_tensor    = */ NULL,
     /* .cpy_tensor    = */ NULL,
     /* .clear         = */ ggml_backend_multi_buffer_clear,
@@ -1426,6 +1435,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
+        std::unordered_map<ggml_backend_t, bool> backends_to_sync;
         // copy the input tensors to the split backend
         for (int input_id = 0; input_id < split->n_inputs; input_id++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1437,7 +1447,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                 } else {
-                    ggml_backend_synchronize(split_backend);
+                    backends_to_sync[split_backend] = true;
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
@@ -1445,7 +1455,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
-                    ggml_backend_synchronize(split_backend);
+                    backends_to_sync[split_backend] = true;
                 }
 
                 // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
@@ -1460,7 +1470,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
                     const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
 
-                    ggml_backend_synchronize(input_backend);
+                    backends_to_sync[input_backend] = true;
 
                     // get the ids
                     ggml_tensor * ids_tensor = node->src[2];
@@ -1479,7 +1489,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     if (ids_tensor != prev_ids_tensor) {
                         ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
                         ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
-                        ggml_backend_synchronize(ids_backend);
+                        backends_to_sync[ids_backend] = true;
 
                         // find the used experts
                         used_ids.clear();
@@ -1537,11 +1547,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                 // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                 if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
-                    ggml_backend_synchronize(input_backend);
+                    backends_to_sync[input_backend] = true;
                     if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                         ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                     } else {
-                        ggml_backend_synchronize(split_backend);
+                        backends_to_sync[split_backend] = true;
                     }
                     ggml_backend_tensor_copy(input, input_cpy);
                 }
@@ -1549,6 +1559,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
 
+        // sync in bulk instead of between async copies
+        for (auto& elem : backends_to_sync) {
+            ggml_backend_synchronize(elem.first);
+        }
+
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
@@ -2118,6 +2133,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
     /* .init_tensor   = */ NULL, // no initialization required
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_s_o = */ NULL,
     /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_cpu_buffer_clear,
@@ -2130,6 +2146,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
     /* .init_tensor   = */ NULL, // no initialization required
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_s_o = */ NULL,
     /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_cpu_buffer_clear,
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
index 895a5713753..07c82bac93a 100644
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -110,6 +110,7 @@ static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
     /* .init_tensor   = */ ggml_backend_amx_buffer_init_tensor,
     /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
+    /* .set_tensor_s_o = */ nullptr,
     /* .get_tensor    = */ nullptr,
     /* .cpy_tensor    = */ nullptr,
     /* .clear         = */ ggml_backend_amx_buffer_clear,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aa6570765ad..5db937339eb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -625,6 +625,14 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
+static void ggml_backend_cuda_buffer_set_tensor_sync_optional(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    if (sync) CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+}
+
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
@@ -668,6 +676,7 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
     /* .init_tensor   = */ ggml_backend_cuda_buffer_init_tensor,
     /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .set_tensor_s_o = */ ggml_backend_cuda_buffer_set_tensor_sync_optional,
     /* .get_tensor    = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor    = */ ggml_backend_cuda_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_cuda_buffer_clear,
@@ -980,6 +989,7 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
     /* .init_tensor   = */ ggml_backend_cuda_split_buffer_init_tensor,
     /* .memset_tensor = */ NULL,
     /* .set_tensor    = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .set_tensor_s_o = */ NULL,
     /* .get_tensor    = */ ggml_backend_cuda_split_buffer_get_tensor,
     /* .cpy_tensor    = */ NULL,
     /* .clear         = */ ggml_backend_cuda_split_buffer_clear,
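A note on the scheduling change above: instead of calling ggml_backend_synchronize() after every input copy of a split, the loop now only records which backends were touched in backends_to_sync and synchronizes each of them once, right before the split's graph is launched. The standalone sketch below illustrates that deduplicated bulk-sync pattern; the types in it are hypothetical stand-ins for illustration only, not the real ggml structs.

    // bulk_sync_sketch.cpp -- illustrates the "collect backends, sync each once" pattern
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    struct fake_backend {                     // hypothetical stand-in for ggml_backend_t
        const char * name;
        void synchronize() const { std::printf("synchronize %s\n", name); }
    };

    int main() {
        fake_backend cpu {"CPU"}, cuda0 {"CUDA0"};
        // pretend these are the backends touched by the input copies of one split
        std::vector<fake_backend *> touched = { &cuda0, &cuda0, &cpu, &cuda0 };

        std::unordered_map<fake_backend *, bool> backends_to_sync;
        for (fake_backend * b : touched) {
            // the real code dispatches an asynchronous tensor copy here, then only marks the backend
            backends_to_sync[b] = true;
        }

        // one synchronization per distinct backend instead of one per copy
        for (auto & elem : backends_to_sync) {
            elem.first->synchronize();
        }
        return 0;
    }

With four copies touching two backends, this prints two "synchronize" lines instead of four, which is the saving the patch is after.
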
From f6b408d843c0b3332905f51da073d3cf6bf88f26 Mon Sep 17 00:00:00 2001
From: aendk
Date: Thu, 4 Dec 2025 18:03:12 +0100
Subject: [PATCH 2/2] Renamed new async function, fixed some whitespace

---
 ggml/src/ggml-backend-impl.h    | 14 +++----
 ggml/src/ggml-backend.cpp       | 65 +++++++++++++++++----------------
 ggml/src/ggml-cpu/amx/amx.cpp   | 20 +++++-----
 ggml/src/ggml-cuda/ggml-cuda.cu | 52 ++++++++++++--------------
 4 files changed, 74 insertions(+), 77 deletions(-)

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index e3fc45a622a..9a0ba7a54b2 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -46,16 +46,16 @@ extern "C" {
         // (optional) initialize a tensor in the buffer (eg. add tensor extras)
         enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         // tensor data access
-        void (*memset_tensor)            (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
-        void (*set_tensor)               (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*set_tensor_sync_optional) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync);
-        void (*get_tensor)               (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*memset_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+        void (*set_tensor)       (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*set_tensor_async) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor)       (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
-        bool (*cpy_tensor)               (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*cpy_tensor)       (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
         // clear the entire buffer
-        void (*clear)                    (ggml_backend_buffer_t buffer, uint8_t value);
+        void (*clear)            (ggml_backend_buffer_t buffer, uint8_t value);
         // (optional) reset any internal state due to tensor initialization, such as tensor extras
-        void (*reset)                    (ggml_backend_buffer_t buffer);
+        void (*reset)            (ggml_backend_buffer_t buffer);
     };
 
     struct ggml_backend_buffer {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fc713d6f801..088bdcee1d0 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -285,10 +285,11 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
+    // do not synchronize directly after dispatching async tensor copies
     static bool disable_sync_optimization = (getenv("GGML_CUDA_DISABLE_SYNC_OPTIMIZATION") != nullptr);
 
-    if (!disable_sync_optimization && buf->iface.set_tensor_sync_optional != NULL) {
-        buf->iface.set_tensor_sync_optional(buf, tensor, data, offset, size, false);
+    if (!disable_sync_optimization && buf->iface.set_tensor_async != NULL) {
+        buf->iface.set_tensor_async(buf, tensor, data, offset, size);
     } else {
         buf->iface.set_tensor(buf, tensor, data, offset, size);
     }
@@ -605,16 +606,16 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .free_buffer   = */ ggml_backend_multi_buffer_free_buffer,
-    /* .get_base      = */ NULL,
-    /* .init_tensor   = */ NULL,
-    /* .memset_tensor = */ NULL,
-    /* .set_tensor    = */ NULL,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor    = */ NULL,
-    /* .cpy_tensor    = */ NULL,
-    /* .clear         = */ ggml_backend_multi_buffer_clear,
-    /* .reset         = */ NULL,
+    /* .free_buffer      = */ ggml_backend_multi_buffer_free_buffer,
+    /* .get_base         = */ NULL,
+    /* .init_tensor      = */ NULL,
+    /* .memset_tensor    = */ NULL,
+    /* .set_tensor       = */ NULL,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ NULL,
+    /* .cpy_tensor       = */ NULL,
+    /* .clear            = */ ggml_backend_multi_buffer_clear,
+    /* .reset            = */ NULL,
 };
 
 ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
@@ -2128,29 +2129,29 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .free_buffer   = */ ggml_backend_cpu_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor   = */ NULL, // no initialization required
-    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear         = */ ggml_backend_cpu_buffer_clear,
-    /* .reset         = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor      = */ NULL, // no initialization required
+    /* .memset_tensor    = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cpu_buffer_clear,
+    /* .reset            = */ NULL,
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .free_buffer   = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
-    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor   = */ NULL, // no initialization required
-    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear         = */ ggml_backend_cpu_buffer_clear,
-    /* .reset         = */ NULL,
+    /* .free_buffer      = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base         = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor      = */ NULL, // no initialization required
+    /* .memset_tensor    = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cpu_buffer_clear,
+    /* .reset            = */ NULL,
 };
 
 // CPU backend buffer type
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
index 07c82bac93a..c6333d10052 100644
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -105,16 +105,16 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer   = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor   = */ ggml_backend_amx_buffer_init_tensor,
-    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
-    /* .set_tensor_s_o = */ nullptr,
-    /* .get_tensor    = */ nullptr,
-    /* .cpy_tensor    = */ nullptr,
-    /* .clear         = */ ggml_backend_amx_buffer_clear,
-    /* .reset         = */ nullptr,
+    /* .free_buffer      = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_amx_buffer_init_tensor,
+    /* .memset_tensor    = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_amx_buffer_set_tensor,
+    /* .set_tensor_async = */ nullptr,
+    /* .get_tensor       = */ nullptr,
+    /* .cpy_tensor       = */ nullptr,
+    /* .clear            = */ ggml_backend_amx_buffer_clear,
+    /* .reset            = */ nullptr,
 };
 
 static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5db937339eb..313f86f7464 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -617,20 +617,16 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_cuda_buffer_set_tensor_async(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-static void ggml_backend_cuda_buffer_set_tensor_sync_optional(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
-    if (sync) CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_set_tensor_async(buffer, tensor, data, offset, size);
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -671,16 +667,16 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .free_buffer   = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor   = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
-    /* .set_tensor    = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .set_tensor_s_o = */ ggml_backend_cuda_buffer_set_tensor_sync_optional,
-    /* .get_tensor    = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor    = */ ggml_backend_cuda_buffer_cpy_tensor,
-    /* .clear         = */ ggml_backend_cuda_buffer_clear,
-    /* .reset         = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .memset_tensor    = */ ggml_backend_cuda_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .set_tensor_async = */ ggml_backend_cuda_buffer_set_tensor_async,
+    /* .get_tensor       = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cuda_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cuda_buffer_clear,
+    /* .reset            = */ NULL,
 };
 
 // cuda buffer type
@@ -984,16 +980,16 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }
 
 static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .free_buffer   = */ ggml_backend_cuda_split_buffer_free_buffer,
-    /* .get_base      = */ ggml_backend_cuda_split_buffer_get_base,
-    /* .init_tensor   = */ ggml_backend_cuda_split_buffer_init_tensor,
-    /* .memset_tensor = */ NULL,
-    /* .set_tensor    = */ ggml_backend_cuda_split_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor    = */ ggml_backend_cuda_split_buffer_get_tensor,
-    /* .cpy_tensor    = */ NULL,
-    /* .clear         = */ ggml_backend_cuda_split_buffer_clear,
-    /* .reset         = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cuda_split_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cuda_split_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .memset_tensor    = */ NULL,
+    /* .set_tensor       = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cuda_split_buffer_get_tensor,
+    /* .cpy_tensor       = */ NULL,
+    /* .clear            = */ ggml_backend_cuda_split_buffer_clear,
+    /* .reset            = */ NULL,
 };
 
 // cuda split buffer type
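
The optimization can also be disabled at runtime: ggml_backend_tensor_set() reads GGML_CUDA_DISABLE_SYNC_OPTIMIZATION once, on its first call, and falls back to the fully synchronous set_tensor path whenever the variable is set. A minimal sketch of opting out programmatically follows; it assumes a POSIX environment for setenv, and exporting the variable in the shell before launching the process has the same effect.

    // opt_out_sketch.cpp -- force the old synchronous upload path (assumes POSIX setenv)
    #include <cstdlib>

    int main() {
        // must happen before the first ggml_backend_tensor_set call, because the flag
        // is cached in a function-local static on first use
        setenv("GGML_CUDA_DISABLE_SYNC_OPTIMIZATION", "1", /*overwrite=*/ 1);

        // ... ggml / llama.cpp initialization and inference would follow here ...
        return 0;
    }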