ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
- CUDA_CHECK(cudaDeviceSynchronize());
- CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
- CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {