Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
pixelpipe_cache_cl.c
Go to the documentation of this file.
1/*
2 This file is part of the Ansel project.
3 Copyright (C) 2026 Aurélien PIERRE.
4
5 Ansel is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 Ansel is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with Ansel. If not, see <http://www.gnu.org/licenses/>.
17*/
18
87#include "common/atomic.h"
88#include "common/darktable.h"
89#include "common/opencl.h"
90#include "develop/pixelpipe.h"
92
93#include <stddef.h>
94
95#ifdef HAVE_OPENCL
96
123static gboolean _cl_is_zero_copy_image(const int devid, cl_mem mem, void *host_ptr, const dt_iop_roi_t *roi,
124 const size_t bpp)
125{
126 if(devid < 0 || !mem || !host_ptr || !roi || roi->width <= 0 || roi->height <= 0 || bpp == 0) return FALSE;
127
128 void *mapped = dt_opencl_map_image(devid, mem, TRUE, CL_MAP_READ, roi->width, roi->height, (int)bpp);
129 if(!mapped) return FALSE;
130
131 const gboolean ptr_matches = (mapped == host_ptr);
132 const gboolean is_zero_copy = ptr_matches;
133 const cl_int unmap_err = dt_opencl_unmap_mem_object(devid, mem, mapped);
134 if(unmap_err != CL_SUCCESS) return FALSE;
135
136 // Use clFinish rather than event wait: some drivers disable event tracking, but we still need to guarantee
137 // the unmap (and implicit sync) is complete before touching host memory or unlocking the cache entry.
138 dt_opencl_finish(devid);
139
140 return is_zero_copy;
141}
142
162static void *_gpu_try_reuse_pinned_from_cache(dt_pixel_cache_entry_t *cache_entry, void *host_ptr, int devid,
163 const dt_iop_roi_t *roi, const size_t bpp, const int flags,
164 int *out_cst, gboolean *out_reused)
165{
166 if(out_reused) *out_reused = FALSE;
167 if(!cache_entry || !host_ptr || devid < 0) return NULL;
168
169 int cached_cst = IOP_CS_NONE;
170 void *mem = dt_pixel_cache_clmem_get(cache_entry, host_ptr, devid, roi->width, roi->height, (int)bpp, flags,
171 &cached_cst);
172 if(mem)
173 {
174 if(out_reused) *out_reused = TRUE;
175 if(out_cst && cached_cst != IOP_CS_NONE) *out_cst = cached_cst;
176 }
177
178 return mem;
179}
180
181static inline gboolean _is_gamma_rgba8_output(const dt_iop_module_t *module, const size_t bpp,
182 const char *message)
183{
184 return module && message && bpp == 4 * sizeof(uint8_t) && strcmp(module->op, "gamma") == 0
185 && strcmp(message, "output") == 0;
186}
187
// Forward declarations: both helpers are defined later in this file but are referenced
// by code above their definitions.
static void *_gpu_alloc_device_with_flush(int devid, const dt_iop_roi_t *roi, const size_t bpp,
                                          const dt_iop_module_t *module, const char *message,
                                          void *keep);
static void *_gpu_try_reuse_device_from_cache(dt_pixel_cache_entry_t *cache_entry, int devid,
                                              const dt_iop_roi_t *roi, const size_t bpp,
                                              gboolean *out_reused);
// Get a pinned (host-pointer backed) OpenCL image over `host_ptr`: reuse a cached cl_mem from the
// pixel cache entry when `reuse_pinned` is set, otherwise allocate one with CL_MEM_USE_HOST_PTR.
// For the gamma module's 8-bit RGBA "output" buffer, bpp is encoded via DT_OPENCL_BPP_ENCODE_RGBA8
// so the CL image is created with the matching channel type.
// `out_cst` (optional) receives the colorspace recorded with a reused buffer; `out_reused`
// (optional) is set TRUE when the buffer came from the cache. Returns NULL if both attempts fail.
static void *_gpu_get_pinned_or_alloc(int devid, void *host_ptr, const dt_iop_roi_t *roi, const size_t bpp,
                                      dt_pixel_cache_entry_t *cache_entry, const gboolean reuse_pinned,
                                      int *out_cst, gboolean *out_reused,
                                      const dt_iop_module_t *module, const char *message)
{
  // Read-write + host-pointer backed: color space conversions run in place on these images.
  const int flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR;
  void *mem = NULL;
  const gboolean gamma_rgba8 = _is_gamma_rgba8_output(module, bpp, message);
  const int cl_bpp = gamma_rgba8 ? DT_OPENCL_BPP_ENCODE_RGBA8((int)bpp) : (int)bpp;

  if(out_reused) *out_reused = FALSE;

  if(reuse_pinned)
    mem = _gpu_try_reuse_pinned_from_cache(cache_entry, host_ptr, devid, roi, bpp, flags, out_cst, out_reused);

  if(!mem)
    mem = dt_opencl_alloc_device_use_host_pointer(devid, roi->width, roi->height, cl_bpp, host_ptr, flags);

  if(!mem)
  {
    // Second chance after the first allocation failed.
    // NOTE(review): a statement appears to be missing here in this copy of the file — presumably
    // a vRAM flush (dt_dev_pixelpipe_cache_flush_clmem?) before retrying; confirm upstream.
    if(reuse_pinned)
      mem = _gpu_try_reuse_pinned_from_cache(cache_entry, host_ptr, devid, roi, bpp, flags, out_cst, out_reused);
    if(!mem)
      mem = dt_opencl_alloc_device_use_host_pointer(devid, roi->width, roi->height, cl_bpp, host_ptr, flags);
  }

  return mem;
}
237
// Allocate a device-only OpenCL image, retrying once if the first attempt fails.
// NOTE(review): `keep` is unused in the visible code — presumably forwarded to a vRAM flush
// (dt_dev_pixelpipe_cache_flush_clmem?) on a line missing from this copy; confirm upstream.
static void *_gpu_alloc_device_with_flush(int devid, const dt_iop_roi_t *roi, const size_t bpp,
                                          const dt_iop_module_t *module, const char *message,
                                          void *keep)
{
  // gamma's 8-bit RGBA output needs an encoded bpp so the CL image gets 8-bit channels.
  const gboolean gamma_rgba8 = _is_gamma_rgba8_output(module, bpp, message);
  const int cl_bpp = gamma_rgba8 ? DT_OPENCL_BPP_ENCODE_RGBA8((int)bpp) : (int)bpp;
  void *mem = dt_opencl_alloc_device(devid, roi->width, roi->height, cl_bpp);
  if(!mem)
  {
    // Retry once (after a presumed flush of cached buffers — see note above).
    mem = dt_opencl_alloc_device(devid, roi->width, roi->height, cl_bpp);
  }
  return mem;
}
261
262static void *_gpu_try_reuse_device_from_cache(dt_pixel_cache_entry_t *cache_entry, int devid,
263 const dt_iop_roi_t *roi, const size_t bpp,
264 gboolean *out_reused)
265{
266 if(out_reused) *out_reused = FALSE;
267 if(!cache_entry || devid < 0 || !roi || roi->width <= 0 || roi->height <= 0 || bpp == 0) return NULL;
268
269 // Device-only allocations are tracked with a NULL host_ptr key and a
270 // normalized READ_WRITE flag so we can reliably match across drivers.
271 void *mem = dt_pixel_cache_clmem_get(cache_entry, NULL, devid, roi->width, roi->height, (int)bpp,
272 CL_MEM_READ_WRITE, NULL);
273 if(mem && out_reused) *out_reused = TRUE;
274 return mem;
275}
276
// Count (and log) pinned-buffer cache reuse hits and misses.
// The counters are function-local static atomics, so they aggregate over the whole process
// lifetime across all pipes and threads.
static void _gpu_log_pinned_reuse(dt_iop_module_t *module, const gboolean reused_from_cache)
{
  static dt_atomic_int clmem_reuse_hits;
  static dt_atomic_int clmem_reuse_misses;

  if(reused_from_cache)
  {
    const int hits = dt_atomic_add_int(&clmem_reuse_hits, 1) + 1;
    const int misses = dt_atomic_get_int(&clmem_reuse_misses);
    // NOTE(review): the dt_print(DT_DEBUG_OPENCL, ...) opener for the message below appears to be
    // missing from this copy of the file — confirm against the original source.
      "[opencl_pixelpipe] %s reused pinned input from cache (hits=%d, misses=%d)\n",
      module ? module->name() : "unknown", hits, misses);
  }
  else
  {
    // Miss path only bumps the counter; no log line.
    (void)dt_atomic_add_int(&clmem_reuse_misses, 1);
  }
}
302
324static void *_gpu_init_buffer(int devid, void *const host_ptr, const dt_iop_roi_t *roi, const size_t bpp,
325 dt_iop_module_t *module, const char *message,
326 dt_pixel_cache_entry_t *cache_entry, const gboolean reuse_pinned,
327 const gboolean reuse_device,
328 int *out_cst, gboolean *out_reused, void *keep)
329{
330 // Need to use read-write mode because of in-place color space conversions.
331 void *cl_mem_input = NULL;
332 gboolean reused_from_cache = FALSE;
333 const gboolean allow_reuse_pinned = reuse_pinned;
334 const gboolean allow_reuse_device = reuse_device;
335
336 if(out_reused) *out_reused = FALSE;
337
338 if(host_ptr)
339 {
340 cl_mem_input = _gpu_get_pinned_or_alloc(devid, host_ptr, roi, bpp, cache_entry, allow_reuse_pinned,
341 out_cst, &reused_from_cache, module, message);
342 }
343 else
344 {
345 if(allow_reuse_device)
346 cl_mem_input = _gpu_try_reuse_device_from_cache(cache_entry, devid, roi, bpp, &reused_from_cache);
347
348 if(!cl_mem_input)
349 cl_mem_input = _gpu_alloc_device_with_flush(devid, roi, bpp, module, message, keep);
350 }
351
352 if(cl_mem_input == NULL)
353 {
354 dt_print(DT_DEBUG_OPENCL, "[opencl_pixelpipe] couldn't generate %s buffer for module %s\n", message,
355 module ? module->op : "unknown");
356 }
357 else if(allow_reuse_pinned && cache_entry && host_ptr)
358 {
359 if(out_reused) *out_reused = reused_from_cache;
360 _gpu_log_pinned_reuse(module, reused_from_cache);
361 }
362 else if(allow_reuse_device && cache_entry && !host_ptr && out_reused)
363 {
364 *out_reused = reused_from_cache;
365 }
366
367 return cl_mem_input;
368}
369
// Dispose of *cl_mem_buffer and reset the pointer to NULL.
// Pinned (host-pointer backed) buffers are handed back to the pixel cache keyed on host_ptr;
// device-only buffers are cached (keyed on a NULL host_ptr with a normalized READ_WRITE flag)
// only when `cache_device` requests it. Everything else is removed from cache tracking.
static void _gpu_clear_buffer(void **cl_mem_buffer, dt_pixel_cache_entry_t *cache_entry, void *host_ptr, int cst,
                              const gboolean cache_device)
{
  if(cl_mem_buffer && *cl_mem_buffer != NULL)
  {
    cl_mem mem = *cl_mem_buffer;
    const cl_mem_flags flags = dt_opencl_get_mem_flags(mem);
    // Pinned caching requires a cache entry, a host pointer, and a genuinely host-backed buffer.
    const gboolean can_cache_pinned = (cache_entry && host_ptr && (flags & CL_MEM_USE_HOST_PTR));
    // Device caching must be explicitly requested and the buffer must NOT be host-backed.
    const gboolean can_cache_device = (cache_entry && !host_ptr && cache_device && !(flags & CL_MEM_USE_HOST_PTR));
    const gboolean can_cache = (can_cache_pinned || can_cache_device);
    if(can_cache)
    {
      const int devid = dt_opencl_get_mem_context_id(mem);
      const int width = dt_opencl_get_image_width(mem);
      const int height = dt_opencl_get_image_height(mem);
      const int bpp = dt_opencl_get_image_element_size(mem);
      // Device-only buffers are tracked under a normalized READ_WRITE flag so later lookups match.
      const int tracked_flags = can_cache_device ? CL_MEM_READ_WRITE : (int)flags;
      dt_pixel_cache_clmem_put(cache_entry, host_ptr, devid, width, height, bpp, tracked_flags, cst, mem);
    }
    else
    {
      if(cache_entry) dt_pixel_cache_clmem_remove(cache_entry, mem);
      // NOTE(review): a release of `mem` (dt_opencl_release_mem_object?) appears to be missing
      // from this copy on the non-cached path — confirm against the original source.
    }
    *cl_mem_buffer = NULL;
  }
}
419
// Synchronize an image between host and device memory.
// cl_mode selects the direction: CL_MAP_WRITE = host -> device, CL_MAP_READ = device -> host.
// Zero-copy pinned images only need a map/unmap cycle; everything else falls back to explicit
// blocking transfers. Returns 0 on success, 1 on failure or an unsupported cl_mode.
static int _cl_pinned_memory_copy(const int devid, void *host_ptr, void *cl_mem_buffer, const dt_iop_roi_t *roi,
                                  int cl_mode, size_t bpp, dt_iop_module_t *module, const char *message)
{
  if(!host_ptr || !cl_mem_buffer) return 1;

  const cl_mem mem = (cl_mem)cl_mem_buffer;
  const cl_mem_flags flags = dt_opencl_get_mem_flags(mem);

  // Fast path for true zero-copy pinned images: map/unmap is enough to synchronize host<->device.
  if(flags & CL_MEM_USE_HOST_PTR)
  {
    void *mapped = dt_opencl_map_image(devid, mem, TRUE, cl_mode, roi->width, roi->height, (int)bpp);
    if(mapped)
    {
      const gboolean ptr_matches = (mapped == host_ptr);
      const cl_int unmap_err = dt_opencl_unmap_mem_object(devid, mem, mapped);
      if(unmap_err != CL_SUCCESS) return 1;

      // Ensure unmap (and any implicit sync) completed before we possibly enqueue explicit transfers.
      // When event tracking is disabled, clFinish is the only reliable barrier.
      dt_opencl_finish(devid);

      if(ptr_matches)
      {
        // NOTE(review): the dt_print(DT_DEBUG_OPENCL, ...) opener for the message below appears
        // to be missing from this copy of the file — confirm against the original source.
          "[opencl_pixelpipe] successfully synced image %s via map/unmap for module %s (%s)\n",
          (cl_mode == CL_MAP_WRITE) ? "host to device" : "device to host",
          (module) ? module->op : "base buffer", message);
        return 0;
      }
    }
  }

  // Fallback: explicit blocking transfers (safe on all drivers).
  cl_int err = CL_SUCCESS;
  if(cl_mode == CL_MAP_WRITE)
    err = dt_opencl_write_host_to_device(devid, host_ptr, mem, roi->width, roi->height, (int)bpp);
  else if(cl_mode == CL_MAP_READ)
    err = dt_opencl_read_host_from_device(devid, host_ptr, mem, roi->width, roi->height, (int)bpp);
  else
    return 1; // unknown transfer mode

  if(err != CL_SUCCESS)
  {
    dt_print(DT_DEBUG_OPENCL, "[opencl_pixelpipe] couldn't copy image %s for module %s (%s)\n",
             (cl_mode == CL_MAP_WRITE) ? "host to device" : "device to host",
             (module) ? module->op : "base buffer", message);
    return 1;
  }

  dt_print(DT_DEBUG_OPENCL, "[opencl_pixelpipe] successfully copied image %s for module %s (%s)\n",
           (cl_mode == CL_MAP_WRITE) ? "host to device" : "device to host",
           (module) ? module->op : "base buffer", message);
  return 0;
}
503
// Copy the (possibly colorspace-converted in place) GPU input buffer back into the CPU pixel
// cache so RAM and vRAM stay coherent, then update the cached colorspace tag.
// Always returns `input` unchanged; with no device buffer this is a no-op.
// NOTE(review): `input_entry` is unused in the visible code — lines appear to be missing from
// this copy (locking around the copy?); confirm against the original source.
static float *_resync_input_gpu_to_cache(dt_dev_pixelpipe_t *pipe, float *input, void *cl_mem_input,
                                         dt_iop_buffer_dsc_t *input_format, const dt_iop_roi_t *roi_in,
                                         dt_iop_module_t *module, dt_iop_colorspace_type_t input_cst_cl,
                                         const size_t in_bpp, dt_pixel_cache_entry_t *input_entry,
                                         const char *message)
{
  // Nothing to resync without a device buffer.
  if(!cl_mem_input) return input;

  int fail = _cl_pinned_memory_copy(pipe->devid, input, cl_mem_input, roi_in, CL_MAP_READ, in_bpp, module, message);

  // Color conversions happen inplace, so we need to ensure colorspace metadata are up-to-date.
  if(!fail) input_format->cst = input_cst_cl;

  // Enforce the OpenCL pipe to run in sync with CPU RAM cache so lock validity is guaranteed.
  dt_opencl_finish(pipe->devid);

  // Update colorspace tag (again, for safety).
  input_format->cst = input_cst_cl;
  return input;
}
541
// Prepare the OpenCL input buffer for `module`. Three cases:
//  1. *cl_mem_input already set: the previous module left its output in vRAM — use it directly.
//  2. No device buffer but a CPU `input`: wrap/copy it into a (possibly cached) pinned image.
//  3. No input at all: fail.
// When the resulting buffer is a true zero-copy pinned image, *locked_input_entry is set to
// `input_entry` to tell the caller the cache entry must stay read-locked until kernels complete.
// Returns 0 on success, 1 on failure.
static int _gpu_prepare_cl_input(dt_dev_pixelpipe_t *pipe, dt_iop_module_t *module,
                                 float *input, void **cl_mem_input, dt_iop_colorspace_type_t *input_cst_cl,
                                 const dt_iop_roi_t *roi_in, const size_t in_bpp,
                                 dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t **locked_input_entry,
                                 void *keep)
{
  if(!locked_input_entry) return 1;
  *locked_input_entry = NULL;

  if(*cl_mem_input != NULL)
  {
    // We passed the OpenCL memory buffer through directly on vRAM from previous module.
    // This is fast and efficient.
    // If it's a true zero-copy pinned image, keep the input cache entry read-locked until kernels complete,
    // otherwise another thread may overwrite host memory while the GPU is still reading it.
    dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s will use its input directly from vRAM\n", module->name());
    const cl_mem mem = (cl_mem)*cl_mem_input;
    const cl_mem_flags flags = dt_opencl_get_mem_flags(mem);
    if(flags & CL_MEM_USE_HOST_PTR)
      if(_cl_is_zero_copy_image(pipe->devid, mem, input, roi_in, in_bpp))
      {
        // NOTE(review): the statement that actually takes the read lock appears to be missing
        // from this copy of the file (dt_dev_pixelpipe_cache_rdlock_entry?) — confirm upstream.
        *locked_input_entry = input_entry;
      }
    return 0;
  }

  if(!input)
  {
    dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s has no input (cache)\n", module->name());
    return 1;
  }

  // Try to reuse a cached pinned buffer; otherwise allocate a new pinned image backed by `input`.
  gboolean input_reused_from_cache = FALSE;
  *cl_mem_input = _gpu_init_buffer(pipe->devid, input, roi_in, in_bpp, module, "input", input_entry,
                                   TRUE, FALSE, input_cst_cl, &input_reused_from_cache, keep);
  int fail = (*cl_mem_input == NULL);

  // If the input is true zero-copy, the GPU will access host memory asynchronously: keep the cache
  // entry read-locked until all kernels have completed. If not, drivers may use a device-side copy
  // which must be synchronized from the host before running kernels.
  gboolean keep_lock = FALSE;
  cl_mem mem = NULL;
  if(!fail && *cl_mem_input)
  {
    mem = (cl_mem)*cl_mem_input;
    const cl_mem_flags flags = dt_opencl_get_mem_flags(mem);
    if(flags & CL_MEM_USE_HOST_PTR)
      keep_lock = _cl_is_zero_copy_image(pipe->devid, mem, input, roi_in, in_bpp);
  }

  if(!fail && mem && !keep_lock)
  {
    // Not zero-copy: push the host pixels to the device-side copy explicitly.
    const cl_int err = dt_opencl_write_host_to_device(pipe->devid, input, mem, roi_in->width, roi_in->height,
                                                      (int)in_bpp);
    if(err != CL_SUCCESS)
    {
      dt_print(DT_DEBUG_OPENCL, "[opencl_pixelpipe] couldn't copy image host to device for module %s (%s)\n",
               (module) ? module->op : "base buffer", "cache to input");
      fail = TRUE;
    }
    else
    {
      dt_print(DT_DEBUG_OPENCL, "[opencl_pixelpipe] successfully copied image host to device for module %s (%s)\n",
               (module) ? module->op : "base buffer", "cache to input");
    }
  }

  // Enforce sync with the CPU/RAM cache so lock validity is guaranteed.
  // NOTE(review): the sync statement itself (dt_opencl_finish?) appears to be missing from this copy.

  if(keep_lock)
    *locked_input_entry = input_entry;
  else
  // NOTE(review): the else-branch statement appears to be missing from this copy — as written the
  // `else` binds the `return` below, so the keep_lock path falls off the end; confirm upstream.

  return fail ? 1 : 0;
}
649
650#else // HAVE_OPENCL
651
661static inline void _gpu_clear_buffer(void **cl_mem_buffer, dt_pixel_cache_entry_t *cache_entry, void *host_ptr, int cst,
662 const gboolean cache_device)
663{
664 (void)cache_entry;
665 (void)host_ptr;
666 (void)cst;
667 (void)cache_device;
668 if(cl_mem_buffer) *cl_mem_buffer = NULL;
669}
670
671#endif // HAVE_OPENCL
672
673// clang-format off
674// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
675// vim: shiftwidth=2 expandtab tabstop=2 cindent
676// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
677// clang-format on
#define TRUE
Definition ashift_lsd.c:162
#define FALSE
Definition ashift_lsd.c:158
int dt_atomic_get_int(dt_atomic_int *var)
Definition atomic.h:63
int dt_atomic_add_int(dt_atomic_int *var, int incr)
Definition atomic.h:66
atomic_int dt_atomic_int
Definition atomic.h:60
int width
Definition bilateral.h:1
int height
Definition bilateral.h:1
dt_iop_colorspace_type_t
Definition color_conversion.h:30
@ IOP_CS_NONE
Definition color_conversion.h:31
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
darktable_t darktable
Definition darktable.c:178
void dt_print(dt_debug_thread_t thread, const char *msg,...)
Definition darktable.c:1528
@ DT_DEBUG_OPENCL
Definition darktable.h:642
int bpp
Definition imageio/format/pdf.c:88
dt_mipmap_buffer_dsc_flags flags
Definition mipmap_cache.c:4
static unsigned long dt_opencl_get_mem_flags(void *mem)
Definition opencl.h:615
static gboolean dt_opencl_finish(const int devid)
Definition opencl.h:526
static void dt_opencl_release_mem_object(void *mem)
Definition opencl.h:619
static void dt_opencl_events_wait_for(const int devid)
Definition opencl.h:629
void dt_dev_pixelpipe_cache_rdlock_entry(dt_dev_pixelpipe_cache_t *cache, const uint64_t hash, gboolean lock, dt_pixel_cache_entry_t *cache_entry)
Lock or release the read lock on the entry.
Definition pixelpipe_cache.c:1501
void dt_pixel_cache_clmem_put(dt_pixel_cache_entry_t *entry, void *host_ptr, int devid, int width, int height, int bpp, int flags, int cst, void *mem)
Definition pixelpipe_cache.c:531
void dt_dev_pixelpipe_cache_wrlock_entry(dt_dev_pixelpipe_cache_t *cache, const uint64_t hash, gboolean lock, dt_pixel_cache_entry_t *cache_entry)
Lock or release the write lock on the entry.
Definition pixelpipe_cache.c:1482
void dt_dev_pixelpipe_cache_flush_clmem(dt_dev_pixelpipe_cache_t *cache, const int devid, void *keep)
Release cached OpenCL buffers for a device (-1 for all).
Definition pixelpipe_cache.c:379
void dt_pixel_cache_clmem_remove(dt_pixel_cache_entry_t *entry, void *mem)
Definition pixelpipe_cache.c:577
void * dt_pixel_cache_clmem_get(dt_pixel_cache_entry_t *entry, void *host_ptr, int devid, int width, int height, int bpp, int flags, int *out_cst)
Definition pixelpipe_cache.c:488
Pixelpipe cache for storing intermediate results in the pixelpipe.
#define DT_PIXELPIPE_CACHE_HASH_INVALID
Definition pixelpipe_cache.h:41
static void _gpu_clear_buffer(void **cl_mem_buffer, dt_pixel_cache_entry_t *cache_entry, void *host_ptr, int cst, const gboolean cache_device)
No-OpenCL stub for _gpu_clear_buffer().
Definition pixelpipe_cache_cl.c:661
struct dt_dev_pixelpipe_cache_t * pixelpipe_cache
Definition darktable.h:717
Definition pixelpipe_hb.h:179
int devid
Definition pixelpipe_hb.h:259
Definition develop/format.h:48
int cst
Definition develop/format.h:74
Definition imageop.h:217
GModule *dt_dev_operation_t op
Definition imageop.h:227
Definition imageop.h:67
int width
Definition imageop.h:68
int height
Definition imageop.h:68
Definition pixelpipe_cache.h:78