Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
pixelpipe_gpu.c
Go to the documentation of this file.
1/*
2 Private OpenCL pixelpipe backend.
3*/
4
5#include "common/darktable.h"
6#include "common/iop_order.h"
7#include "common/opencl.h"
8#include "develop/blend.h"
12
13#include <math.h>
14#include <stdio.h>
15
17 dt_pixel_cache_entry_t *cache_entry, const char *reason)
18{
/* Invalidates pinned OpenCL images tied to host_ptr after a host-side write.
 * The OpenCL branch only acts when pipe is non-NULL, is not realtime, has a
 * device id >= 0 and both host_ptr and cache_entry are non-NULL; in every other
 * case the function does nothing. reason is only used in the debug message and
 * is replaced by a default text when NULL.
 * Without HAVE_OPENCL all four arguments are explicitly discarded.
 * NOTE(review): original line 24 is absent from this listing. It holds the head of
 * the if() whose tail is on line 25, presumably the call to
 * dt_dev_pixelpipe_cache_flush_host_pinned_image() - confirm against the real file. */
19#ifdef HAVE_OPENCL
20 if(pipe && !pipe->realtime && pipe->devid >= 0 && host_ptr && cache_entry)
21 {
22 /* Non-realtime host writes invalidate reusable pinned images bound to the previous ROI/hash.
23 * Realtime keeps its pinned reuse untouched to avoid stalling the live draw path. */
25 pipe->devid))
26 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] flushed pinned OpenCL images after %s\n",
27 reason ? reason : "host write");
28 }
29#else
30 (void)pipe;
31 (void)host_ptr;
32 (void)cache_entry;
33 (void)reason;
34#endif
35}
36
37#ifdef HAVE_OPENCL
38
40{
/* Returns non-zero only when all three conditions hold: dt_opencl_is_inited()
 * reports true, the piece is flagged process_cl_ready, and the module provides a
 * process_cl callback. No pipe field is read in this body. */
41 return dt_opencl_is_inited() && piece->process_cl_ready && module->process_cl;
42}
43
45 float **input, void **cl_mem_input,
47 dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
48{
/* Makes sure a host-side input buffer exists and refreshes it from the OpenCL image.
 *  - If *input is NULL, a host buffer is allocated for input_entry and stored in *input.
 *  - *cl_mem_input is then synchronised into *input with CL_MAP_READ.
 * Returns 0 on success and 1 when either the allocation or the synchronisation fails.
 * output_entry is not referenced by any line visible in this listing.
 * NOTE(review): original lines 53, 55, 60, 66, 70 and 74 are missing here. They
 * presumably carry cache-entry locking and the dt_print() heads of the two log
 * messages - confirm against the real file.
 * NOTE(review): *cl_mem_input is handed to the sync helper without a visible NULL
 * check - verify that the helper or a missing line covers that case. */
49 dt_iop_module_t *module = piece->module;
50
51 if(IS_NULL_PTR(*input))
52 {
54 *input = dt_pixel_cache_alloc(darktable.pixelpipe_cache, input_entry);
56 }
57
/* Allocation may have failed: without a host buffer there is nothing to sync into. */
58 if(IS_NULL_PTR(*input))
59 {
61 "[dev_pixelpipe] %s CPU fallback has no input buffer (cache allocation failed?)\n",
62 module->name());
63 return 1;
64 }
65
/* CL_MAP_READ: the host side reads the content of the device image. */
67 const int fail = dt_dev_pixelpipe_cache_sync_cl_buffer(pipe->devid, *input, *cl_mem_input, &piece->roi_in, CL_MAP_READ,
68 piece->dsc_in.bpp, module,
69 "cpu fallback input copy to cache");
71
72 if(fail)
73 {
75 "[dev_pixelpipe] %s couldn't resync GPU input to cache for CPU fallback\n",
76 module->name());
77 return 1;
78 }
79 return 0;
80}
81
83 void **cl_mem_input,
84 gboolean *const borrowed_cl_mem_input,
85 const dt_dev_pixelpipe_iop_t *piece,
86 const dt_dev_pixelpipe_iop_t *previous_piece,
88 dt_pixelpipe_flow_t *pixelpipe_flow,
89 gboolean *const cache_output,
90 dt_pixel_cache_entry_t *input_entry,
91 dt_pixel_cache_entry_t *output_entry)
92{
/* CPU hand-over used when the GPU path is not attempted at all.
 * Steps, in order:
 *  1. Modules flagged IOP_FLAGS_TAKE_NO_INPUT go straight to pixelpipe_process_on_CPU().
 *  2. An existing host buffer in *input is reused as is.
 *  3. Otherwise, if an OpenCL image is available, a host buffer is allocated when needed
 *     and filled through dt_dev_pixelpipe_cache_restore_cl_buffer().
 *  4. With neither host nor device input the function returns 1.
 *  5. The device input is given back (borrowed payload) or released, then the module is
 *     processed on the CPU and that result is returned.
 * On the failure exits of step 3 the device input is also given back or released before
 * returning 1. *cl_mem_input and *borrowed_cl_mem_input are reset when a borrow is returned.
 * NOTE(review): this listing lacks several original lines (97-101, 111, 119, 121, 126,
 * 137, 146, 157, 163, 181), among them the dt_print() heads and the last argument of the
 * dt_dev_pixelpipe_cache_release_cl_buffer() calls - confirm against the real file. */
93 dt_iop_module_t *module = piece->module;
94
95 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s will run directly on CPU\n", module->name());
96
102 if(module->flags() & IOP_FLAGS_TAKE_NO_INPUT)
103 return pixelpipe_process_on_CPU(pipe, piece, previous_piece, tiling, pixelpipe_flow,
104 cache_output, input_entry, output_entry);
105
106 /* CPU fallback only needs a valid host buffer. If `input` already exists here, the upstream
107 * hand-off has already materialized authoritative RAM and re-reading the same pixels back out
108 * of the cached OpenCL image is redundant. */
109 if(input && !IS_NULL_PTR(*input))
110 {
112 "[dev_pixelpipe] %s CPU fallback will reuse host input\n",
113 module->name());
114 }
115 else if(cl_mem_input && !IS_NULL_PTR(*cl_mem_input))
116 {
/* Only a device copy exists: get a host buffer first, then pull the pixels back. */
117 if(input && IS_NULL_PTR(*input))
118 {
120 *input = dt_pixel_cache_alloc(darktable.pixelpipe_cache, input_entry);
122 }
123
124 if(IS_NULL_PTR(input) || IS_NULL_PTR(*input))
125 {
127 "[dev_pixelpipe] %s CPU fallback has no input buffer (cache allocation failed?)\n",
128 module->name());
129 if(borrowed_cl_mem_input && *borrowed_cl_mem_input)
130 {
131 dt_dev_pixelpipe_cache_return_cl_payload(input_entry, *cl_mem_input);
132 *cl_mem_input = NULL;
133 *borrowed_cl_mem_input = FALSE;
134 }
135 else
136 dt_dev_pixelpipe_cache_release_cl_buffer(cl_mem_input, input_entry, NULL,
138 return 1;
139 }
140
/* The helper returns the host pointer to use from now on, or NULL on failure. */
141 *input = dt_dev_pixelpipe_cache_restore_cl_buffer(pipe, *input, *cl_mem_input, &piece->roi_in, module,
142 piece->dsc_in.bpp, input_entry,
143 "cpu fallback input copy to cache");
144 if(IS_NULL_PTR(*input))
145 {
147 "[dev_pixelpipe] %s couldn't resync GPU input to cache for CPU fallback\n",
148 module->name());
149 if(borrowed_cl_mem_input && *borrowed_cl_mem_input)
150 {
151 dt_dev_pixelpipe_cache_return_cl_payload(input_entry, *cl_mem_input);
152 *cl_mem_input = NULL;
153 *borrowed_cl_mem_input = FALSE;
154 }
155 else
156 dt_dev_pixelpipe_cache_release_cl_buffer(cl_mem_input, input_entry, NULL,
158 return 1;
159 }
160 }
161 else if(!input || IS_NULL_PTR(*input))
162 {
164 "[dev_pixelpipe] %s CPU fallback has no input buffer (cache allocation failed?)\n",
165 module->name());
166 return 1;
167 }
168
169 if(borrowed_cl_mem_input && *borrowed_cl_mem_input)
170 {
171 /* Device-only inputs borrowed from the cache stay owned by the cache entry.
172 * CPU fallback only needs to drop the temporary borrow after the device->host
173 * sync, otherwise releasing the cl_mem here leaves a stale cache-side pointer
174 * that later thumbnail runs may reopen as corrupted input. */
175 dt_dev_pixelpipe_cache_return_cl_payload(input_entry, *cl_mem_input);
176 *cl_mem_input = NULL;
177 *borrowed_cl_mem_input = FALSE;
178 }
179 else
180 dt_dev_pixelpipe_cache_release_cl_buffer(cl_mem_input, input_entry, *input,
182
183 return pixelpipe_process_on_CPU(pipe, piece, previous_piece, tiling, pixelpipe_flow,
184 cache_output, input_entry, output_entry);
185}
186
188 const dt_dev_pixelpipe_iop_t *previous_piece,
190 dt_pixelpipe_flow_t *pixelpipe_flow,
191 gboolean *const cache_output,
192 dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
193{
/* Processes one module on the OpenCL device, with CPU fallback.
 *
 * Overview of the visible control flow:
 *  - Unless the module is flagged IOP_FLAGS_TAKE_NO_INPUT, a cached vRAM payload is
 *    borrowed from input_entry; with neither host nor device input the function returns 1.
 *  - If OpenCL is unsupported or disabled, or the pipe has no device, the call is
 *    delegated to _gpu_early_cpu_fallback_if_unsupported().
 *  - Whole-image path (fits_on_device): input, output and optional colorspace
 *    temporaries live on the device; the module runs through process_cl() and blending
 *    through dt_develop_blend_process_cl().
 *  - Tiled path (process_tiling_ready and a host input): the module runs through
 *    process_tiling_cl() on host buffers and blending runs on the CPU.
 *  - Any failure jumps to the error label, which frees device resources and retries the
 *    module with pixelpipe_process_on_CPU().
 *
 * Returns 0 after a successful GPU run, 1 when no usable input could be obtained, and
 * otherwise whatever the CPU fallback returns.
 * *cache_output is forced to TRUE whenever the image does not fit on the device or the
 * GPU path is ruled out; *pixelpipe_flow records where processing and blending happened.
 *
 * NOTE(review): this listing lacks many original lines (for instance 189, 243, 256, 259,
 * 352, 368, 473, 483, 488, 510, 515, 531, 596, 612, 625, 649). Among them are the tiling
 * parameter, the work profile lookup, the declaration of blend_cst, the lock and unlock
 * calls guarded by input_locked and locked_input_entry, and the last argument of several
 * dt_dev_pixelpipe_cache_release_cl_buffer() calls - confirm against the real file. */
194 dt_iop_module_t *module = piece->module;
195 float *input = input_entry ? dt_pixel_cache_entry_get_data(input_entry) : NULL;
196 void *output = dt_pixel_cache_entry_get_data(output_entry);
197 void *cl_mem_input = NULL;
198 void *cl_mem_output = NULL;
199 void *cl_mem_process_input = NULL;
200 void *cl_mem_blend_input = NULL;
201 void *cl_mem_blend_output = NULL;
202 void *cl_mem_process_input_temp = NULL;
203 void *cl_mem_blend_input_temp = NULL;
204 void *cl_mem_blend_output_temp = NULL;
205 dt_pixel_cache_entry_t *cpu_input_entry = input_entry;
206 dt_pixel_cache_entry_t *locked_input_entry = NULL;
207 gboolean borrowed_cl_mem_input = FALSE;
208 const dt_iop_buffer_dsc_t actual_input_dsc = previous_piece ? previous_piece->dsc_out : pipe->dev->image_storage.dsc;
209 dt_iop_buffer_dsc_t process_input_dsc = actual_input_dsc;
210 dt_iop_buffer_dsc_t blend_input_dsc = actual_input_dsc;
211 dt_iop_buffer_dsc_t blend_output_dsc = piece->dsc_out;
212
213 // Try to reuse the cached vRAM buffer for the input entry if available
214 // except for basebuffer module which takes no input
215 if(!(piece->module->flags() & IOP_FLAGS_TAKE_NO_INPUT))
216 {
217 cl_mem_input = dt_dev_pixelpipe_cache_borrow_cl_payload(input_entry, pipe->devid,
218 piece->roi_in.width, piece->roi_in.height,
219 actual_input_dsc.bpp);
220 borrowed_cl_mem_input = (!IS_NULL_PTR(cl_mem_input));
221 if(IS_NULL_PTR(cl_mem_input))
222 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s could not get a cached vRAM input buffer.\n", module->name());
223
224 // Note: if that fails, we will attempt resync from RAM cache later
225
226 if(IS_NULL_PTR(input) && IS_NULL_PTR(cl_mem_input))
227 {
228 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s has no RAM nor vRAM input... aborting.\n", module->name());
229 return 1;
230 }
231 }
232
/* No GPU attempt at all: the helper takes over input hand-off and CPU processing. */
233 if(!_is_opencl_supported(pipe, piece, module) || !pipe->opencl_enabled || !(pipe->devid >= 0))
234 {
235 return _gpu_early_cpu_fallback_if_unsupported(pipe, &input, &cl_mem_input,
236 &borrowed_cl_mem_input, piece, previous_piece, tiling,
237 pixelpipe_flow, cache_output,
238 input_entry, output_entry);
239 }
240
241 const dt_iop_order_iccprofile_info_t *const work_profile
242 = (process_input_dsc.cst != IOP_CS_RAW || piece->dsc_in.cst != IOP_CS_RAW)
244 : NULL;
245
/* With a device input already at hand the tiling factor is reduced by one buffer,
 * presumably because that image is already resident; the factor never drops below 1. */
246 const float required_factor_cl
247 = fmaxf(1.0f, (!IS_NULL_PTR(cl_mem_input)) ? tiling->factor_cl - 1.0f : tiling->factor_cl);
248
249 const size_t precheck_width = ROUNDUPDWD(MAX(piece->roi_in.width, piece->roi_out.width), pipe->devid);
250 const size_t precheck_height = ROUNDUPDHT(MAX(piece->roi_in.height, piece->roi_out.height), pipe->devid);
251 gboolean fits_on_device = dt_opencl_image_fits_device(pipe->devid, precheck_width, precheck_height,
252 MAX(piece->dsc_in.bpp, piece->dsc_out.bpp),
253 required_factor_cl, tiling->overhead);
254 if(!fits_on_device)
255 {
257 "[dev_pixelpipe] %s pre-check didn't fit on device, flushing cached pinned buffers and retrying\n",
258 module->name());
260 fits_on_device = dt_opencl_image_fits_device(pipe->devid, precheck_width, precheck_height,
261 MAX(piece->dsc_in.bpp, piece->dsc_out.bpp),
262 required_factor_cl, tiling->overhead);
263 }
264
/* The GPU is ruled out for preview pipes running a module flagged
 * IOP_FLAGS_PREVIEW_NON_OPENCL, and when the image neither fits nor can be tiled. */
265 gboolean possible_cl = !(pipe->type == DT_DEV_PIXELPIPE_PREVIEW
266 && (module->flags() & IOP_FLAGS_PREVIEW_NON_OPENCL))
267 && (fits_on_device || piece->process_tiling_ready);
268
/* Outside the whole-image path the output has to exist in host memory. */
269 if(!possible_cl || !fits_on_device) *cache_output = TRUE;
270 if(*cache_output && IS_NULL_PTR(output))
271 {
272 output = dt_pixel_cache_alloc(darktable.pixelpipe_cache, output_entry);
273 if(IS_NULL_PTR(output)) goto error;
274 }
275
276 if(possible_cl && !fits_on_device)
277 {
278 // Prepare the input buffer for tiling
279 const float cl_px = dt_opencl_get_device_available(pipe->devid)
280 / (sizeof(float) * MAX(piece->dsc_in.bpp, piece->dsc_out.bpp)
281 * ceilf(required_factor_cl));
282 const float dx = MAX(piece->roi_in.width, piece->roi_out.width);
283 const float dy = MAX(piece->roi_in.height, piece->roi_out.height);
284 const float border = tiling->overlap + 1;
285 const gboolean possible = (cl_px > dx * border) || (cl_px > dy * border) || (cl_px > border * border);
286 if(!possible)
287 {
289 "[dt_dev_pixelpipe_process_rec] CL: tiling impossible in module `%s'. avail=%.1fM, requ=%.1fM (%ix%i). overlap=%i\n",
290 module->name(), cl_px / 1e6f, dx * dy / 1e6f, (int)dx, (int)dy, (int)tiling->overlap);
291 goto error;
292 }
293
294 // Ensure the input image is present on RAM cache,
295 // tiling on OpenCL will only copy tiles from it to GPU.
296 if(_gpu_init_input(pipe, &input, &cl_mem_input, piece, tiling,
297 input_entry, output_entry))
298 goto error;
299 }
300
301 if(!possible_cl) goto error;
302
/* Whole-image path: every buffer used by the module and by blending is a device image. */
303 if(fits_on_device)
304 {
305 // Alloc input GPU buffer if we didn't already borrow it
306 if(!(piece->module->flags() & IOP_FLAGS_TAKE_NO_INPUT))
307 if(dt_dev_pixelpipe_cache_prepare_cl_input(pipe, module, input, &cl_mem_input,
308 &piece->roi_in, piece->dsc_in.bpp, input_entry,
309 &locked_input_entry, NULL))
310 goto error;
311
312 cl_mem_process_input = cl_mem_input;
313
314 // Alloc output GPU buffer - non-optional
315 cl_mem_output = dt_dev_pixelpipe_cache_get_cl_buffer(pipe->devid, output, &piece->roi_out, piece->dsc_out.bpp, module,
316 "output", output_entry,
317 NULL, cl_mem_input);
318 if(IS_NULL_PTR(cl_mem_output)) goto error;
319
/* Convert the input into the colorspace the module expects. RGB-to-RGB mismatches
 * skip the conversion and only relabel the descriptor. */
320 const int cst_before_cl = process_input_dsc.cst;
321 if(process_input_dsc.cst != piece->dsc_in.cst
322 && !(dt_iop_colorspace_is_rgb(process_input_dsc.cst) && dt_iop_colorspace_is_rgb(piece->dsc_in.cst)))
323 {
324 cl_mem_process_input_temp = dt_dev_pixelpipe_cache_alloc_cl_device_buffer(pipe->devid, &piece->roi_in, piece->dsc_in.bpp,
325 module, "module input colorspace temp",
326 cl_mem_input);
327 if(IS_NULL_PTR(cl_mem_process_input_temp))
328 goto error;
329
330 if(!dt_ioppr_transform_image_colorspace_cl(module, pipe->devid, cl_mem_input, cl_mem_process_input_temp,
331 piece->roi_in.width, piece->roi_in.height,
332 process_input_dsc.cst, piece->dsc_in.cst,
333 &process_input_dsc.cst, work_profile))
334 goto error;
335 cl_mem_process_input = cl_mem_process_input_temp;
336 }
337 else if(process_input_dsc.cst != piece->dsc_in.cst)
338 {
339 process_input_dsc.cst = piece->dsc_in.cst;
340 }
341 const int cst_after_cl = process_input_dsc.cst;
342
343 dt_dev_pixelpipe_debug_dump_module_io(pipe, module, "pre", TRUE, &piece->dsc_in, &piece->dsc_out,
344 &piece->roi_in, &piece->roi_out,
345 process_input_dsc.bpp, piece->dsc_out.bpp,
346 cst_before_cl, cst_after_cl);
347
/* A zero return from process_cl() is treated as failure. */
348 if(!module->process_cl(module, pipe, piece, cl_mem_process_input, cl_mem_output))
349 goto error;
350
351 *pixelpipe_flow |= PIXELPIPE_FLOW_PROCESSED_ON_GPU;
353
354 if(module->flags() & IOP_FLAGS_SUPPORTS_BLENDING)
355 {
/* Mask display is only honoured for the GUI module of the main pipe of an attached GUI. */
356 const dt_dev_pixelpipe_display_mask_t request_mask_display
357 = (module->dev->gui_attached && (module == module->dev->gui_module) && (pipe == module->dev->pipe))
358 ? module->request_mask_display
359 : DT_DEV_PIXELPIPE_DISPLAY_NONE;
360 const dt_pixelpipe_blend_transform_t blend_transforms
361 = dt_dev_pixelpipe_transform_for_blend(module, piece, &piece->dsc_out);
362 cl_mem_blend_input = cl_mem_process_input;
363 cl_mem_blend_output = cl_mem_output;
364 blend_input_dsc = process_input_dsc;
365 blend_output_dsc = piece->dsc_out;
366 if(blend_transforms != DT_DEV_PIXELPIPE_BLEND_TRANSFORM_NONE)
367 {
/* Both conversions accumulate into success, which is checked once after the dumps. */
369 int success = 1;
370 const int blend_in_before = blend_input_dsc.cst;
371 if(blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_INPUT)
372 {
373 cl_mem_blend_input_temp = dt_dev_pixelpipe_cache_alloc_cl_device_buffer(pipe->devid, &piece->roi_in, piece->dsc_in.bpp,
374 module, "blend input colorspace temp",
375 cl_mem_process_input);
376 if(IS_NULL_PTR(cl_mem_blend_input_temp))
377 goto error;
378
379 success &= dt_ioppr_transform_image_colorspace_cl(module, pipe->devid,
380 cl_mem_process_input, cl_mem_blend_input_temp,
381 piece->roi_in.width, piece->roi_in.height,
382 blend_input_dsc.cst, blend_cst,
383 &blend_input_dsc.cst, work_profile);
384 cl_mem_blend_input = cl_mem_blend_input_temp;
385 }
386 const int blend_in_after = blend_input_dsc.cst;
387 dt_dev_pixelpipe_debug_dump_module_io(pipe, module, "blend-in", TRUE,
388 &process_input_dsc, &blend_input_dsc,
389 &piece->roi_in, &piece->roi_in,
390 process_input_dsc.bpp, blend_input_dsc.bpp,
391 blend_in_before, blend_in_after);
392 const int blend_out_before = blend_output_dsc.cst;
393 if(blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT)
394 {
395 cl_mem_blend_output_temp = dt_dev_pixelpipe_cache_alloc_cl_device_buffer(pipe->devid, &piece->roi_out,
396 piece->dsc_out.bpp, module,
397 "blend output colorspace temp", cl_mem_output);
398 if(IS_NULL_PTR(cl_mem_blend_output_temp))
399 goto error;
400
401 success &= dt_ioppr_transform_image_colorspace_cl(module, pipe->devid, cl_mem_output,
402 cl_mem_blend_output_temp, piece->roi_out.width,
403 piece->roi_out.height, blend_output_dsc.cst, blend_cst,
404 &blend_output_dsc.cst, work_profile);
405 cl_mem_blend_output = cl_mem_blend_output_temp;
406 }
407 const int blend_out_after = blend_output_dsc.cst;
408 dt_dev_pixelpipe_debug_dump_module_io(pipe, module, "blend-out", TRUE,
409 &piece->dsc_out, &blend_output_dsc,
410 &piece->roi_out, &piece->roi_out,
411 piece->dsc_out.bpp, blend_output_dsc.bpp,
412 blend_out_before, blend_out_after);
413
414 if(!success)
415 {
416 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] couldn't transform blending colorspace for module %s\n",
417 module->name());
418 goto error;
419 }
420 }
421
/* Unlike process_cl(), a non-zero return from the blend call is the failure case. */
422 if(dt_develop_blend_process_cl(module, pipe, piece, cl_mem_blend_input, cl_mem_blend_output))
423 goto error;
424
/* When blending used a temporary output: copy it verbatim while a mask is displayed,
 * otherwise convert it back into the module output colorspace. */
425 if((blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT)
426 && request_mask_display & DT_DEV_PIXELPIPE_DISPLAY_ANY)
427 {
428 size_t origin[] = { 0, 0, 0 };
429 size_t region[] = { piece->roi_out.width, piece->roi_out.height, 1 };
430 if(dt_opencl_enqueue_copy_image(pipe->devid, cl_mem_blend_output, cl_mem_output, origin, origin,
431 region) != CL_SUCCESS)
432 goto error;
433 }
434 else if((blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT)
435 && !dt_ioppr_transform_image_colorspace_cl(module, pipe->devid, cl_mem_blend_output,
436 cl_mem_output, piece->roi_out.width,
437 piece->roi_out.height, blend_output_dsc.cst,
438 piece->dsc_out.cst, &blend_output_dsc.cst,
439 work_profile))
440 goto error;
441
442 *pixelpipe_flow |= PIXELPIPE_FLOW_BLENDED_ON_GPU;
443 *pixelpipe_flow &= ~(PIXELPIPE_FLOW_BLENDED_ON_CPU);
444 }
445
446 if(*cache_output)
447 {
448 if(dt_dev_pixelpipe_cache_sync_cl_buffer(pipe->devid, output, cl_mem_output, &piece->roi_out, CL_MAP_READ,
449 piece->dsc_out.bpp, module,
450 "output to cache"))
451 goto error;
452 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] output memory was copied to cache for %s\n", module->name());
453 }
454 }
455 else if(piece->process_tiling_ready && !IS_NULL_PTR(input))
456 {
/* Tiled path: the module works from host buffers through process_tiling_cl(),
 * and blending is done afterwards on the CPU. */
457 // FIXME: we don't cover the case (piece->module->flags() & IOP_FLAGS_TAKE_NO_INPUT)
458 // in tiling path
459 const float *module_input = input;
460 const float *blend_input = input;
461 float *module_input_temp = NULL;
462 float *blend_input_temp = NULL;
463 gboolean input_locked = FALSE;
464
/* The device copy of the input is not used by this path: give it back or release it. */
465 if(borrowed_cl_mem_input)
466 {
467 dt_dev_pixelpipe_cache_return_cl_payload(input_entry, cl_mem_input);
468 cl_mem_input = NULL;
469 borrowed_cl_mem_input = FALSE;
470 }
471 else
472 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_input, input_entry, input,
474
475 if(process_input_dsc.cst != piece->dsc_in.cst
476 && !(dt_iop_colorspace_is_rgb(process_input_dsc.cst) && dt_iop_colorspace_is_rgb(piece->dsc_in.cst)))
477 {
478 module_input_temp
479 = dt_pixelpipe_cache_alloc_align_float((size_t)piece->roi_in.width * piece->roi_in.height * 4, pipe);
480 if(IS_NULL_PTR(module_input_temp))
481 goto error;
482
484 input_locked = TRUE;
485 dt_ioppr_transform_image_colorspace(module, input, module_input_temp, piece->roi_in.width,
486 piece->roi_in.height, process_input_dsc.cst, piece->dsc_in.cst,
487 &process_input_dsc.cst, work_profile);
489 input_locked = FALSE;
490 module_input = module_input_temp;
491 }
492 else if(process_input_dsc.cst != piece->dsc_in.cst)
493 {
494 process_input_dsc.cst = piece->dsc_in.cst;
496 input_locked = TRUE;
497 }
498 else
499 {
501 input_locked = TRUE;
502 }
503
504 int fail = !module->process_tiling_cl(module, pipe, piece, module_input, output, piece->dsc_in.bpp);
505 dt_opencl_finish(pipe->devid);
506
507 if(fail)
508 {
509 if(input_locked)
511 dt_pixelpipe_cache_free_align(module_input_temp);
512 goto error;
513 }
514
516 *pixelpipe_flow &= ~(PIXELPIPE_FLOW_PROCESSED_ON_CPU);
517
518 blend_input = module_input;
519 blend_input_dsc = process_input_dsc;
520 void *blend_output = output;
521 blend_output_dsc = piece->dsc_out;
522
523 const dt_dev_pixelpipe_display_mask_t request_mask_display
524 = (module->dev->gui_attached && (module == module->dev->gui_module) && (pipe == module->dev->pipe))
525 ? module->request_mask_display
526 : DT_DEV_PIXELPIPE_DISPLAY_NONE;
527 const dt_pixelpipe_blend_transform_t blend_transforms
528 = dt_dev_pixelpipe_transform_for_blend(module, piece, &piece->dsc_out);
529 if(blend_transforms != DT_DEV_PIXELPIPE_BLEND_TRANSFORM_NONE)
530 {
532
533 if(blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_INPUT)
534 {
535 blend_input_temp
536 = dt_pixelpipe_cache_alloc_align_float((size_t)piece->roi_in.width * piece->roi_in.height * 4, pipe);
537 if(IS_NULL_PTR(blend_input_temp))
538 {
539 if(input_locked)
541 dt_pixelpipe_cache_free_align(module_input_temp);
542 goto error;
543 }
544
545 dt_ioppr_transform_image_colorspace(module, module_input, blend_input_temp, piece->roi_in.width,
546 piece->roi_in.height, blend_input_dsc.cst, blend_cst,
547 &blend_input_dsc.cst, work_profile);
548 blend_input = blend_input_temp;
549 if(input_locked)
550 {
552 input_locked = FALSE;
553 }
554 }
555
556 if(blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT)
557 {
558 float *blend_output_temp
559 = dt_pixelpipe_cache_alloc_align_float((size_t)piece->roi_out.width * piece->roi_out.height * 4, pipe);
560 if(IS_NULL_PTR(blend_output_temp))
561 {
562 if(input_locked)
564 dt_pixelpipe_cache_free_align(blend_input_temp);
565 dt_pixelpipe_cache_free_align(module_input_temp);
566 goto error;
567 }
568
569 dt_ioppr_transform_image_colorspace(module, output, blend_output_temp, piece->roi_out.width,
570 piece->roi_out.height, blend_output_dsc.cst, blend_cst,
571 &blend_output_dsc.cst, work_profile);
572 blend_output = blend_output_temp;
573 }
574 }
575
576 dt_develop_blend_process(module, pipe, piece, blend_input, blend_output);
577 *pixelpipe_flow |= PIXELPIPE_FLOW_BLENDED_ON_CPU;
578 *pixelpipe_flow &= ~(PIXELPIPE_FLOW_BLENDED_ON_GPU);
579
/* Same rule as the device path: raw copy while a mask is displayed, else convert back. */
580 if((blend_transforms & DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT))
581 {
582 if(request_mask_display & DT_DEV_PIXELPIPE_DISPLAY_ANY)
583 {
584 memcpy(output, blend_output,
585 (size_t)piece->roi_out.width * piece->roi_out.height * piece->dsc_out.bpp);
586 }
587 else
588 {
589 dt_ioppr_transform_image_colorspace(module, blend_output, output, piece->roi_out.width,
590 piece->roi_out.height, blend_output_dsc.cst, piece->dsc_out.cst,
591 &blend_output_dsc.cst, work_profile);
592 }
593 }
594
595 if(input_locked)
597 if(blend_output != output)
598 dt_pixelpipe_cache_free_align(blend_output);
599 dt_pixelpipe_cache_free_align(blend_input_temp);
600 dt_pixelpipe_cache_free_align(module_input_temp);
601 }
602 else
603 {
604 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] could not run module '%s' on gpu. falling back to cpu path\n",
605 module->name());
606 goto error;
607 }
608
609 dt_opencl_finish(pipe->devid);
610
611 if(locked_input_entry)
613
614 /* Borrowed vRAM inputs must stay protected until the current queue has completed, otherwise
615 * another pipe can flush or recycle the shared device buffer while the queued kernels
616 * are still reading it. */
617 if(borrowed_cl_mem_input)
618 {
619 dt_dev_pixelpipe_cache_return_cl_payload(input_entry, cl_mem_input);
620 cl_mem_input = NULL;
621 borrowed_cl_mem_input = FALSE;
622 }
623 else
624 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_input, input_entry, input,
626
627 /* The backend now owns the authoritative module output payload until publish time.
628 * When the output stayed GPU-only, the recursion no longer carries `cl_mem_output`
629 * back explicitly, so we must cache it here before returning. Otherwise
630 * the caller publishes a cacheline with metadata only and no recoverable payload. */
631 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_output, output_entry, output, TRUE);
632
633 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_blend_output_temp, NULL, NULL, FALSE);
634 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_blend_input_temp, NULL, NULL, FALSE);
635 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_process_input_temp, NULL, NULL, FALSE);
636
637 return 0;
638
/* Common failure exit: wait for the queue, drop the temporary device buffers and the
 * output image, then run the module on the CPU from a host copy of the input. */
639error:
640 dt_print(DT_DEBUG_OPENCL, "[dev_pixelpipe] %s couldn't process on GPU\n", module->name());
641
642 dt_opencl_finish(pipe->devid);
643
644 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_blend_output_temp, NULL, NULL, FALSE);
645 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_blend_input_temp, NULL, NULL, FALSE);
646 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_process_input_temp, NULL, NULL, FALSE);
647
648 if(locked_input_entry)
650
651 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_output, output_entry, NULL, FALSE);
652
653 if(module->flags() & IOP_FLAGS_TAKE_NO_INPUT)
654 {
655 /* Root modules build their own input from external storage. If the OpenCL pre-check
656 * rejects the required allocation, CPU fallback must keep the same no-input contract
657 * instead of looking for an upstream cacheline that cannot exist. */
658 const size_t required_mib
659 = ((size_t)precheck_width * precheck_height * MAX(piece->dsc_in.bpp, piece->dsc_out.bpp))
660 / (1024 * 1024);
661 const size_t max_alloc_mib = (size_t)dt_opencl_get_device_memalloc(pipe->devid) / (1024 * 1024);
662 dt_control_log(_("OpenCL failed for module `%s`: image buffer needs %" G_GSIZE_FORMAT
663 " MiB but device limit is %" G_GSIZE_FORMAT " MiB; falling back to CPU"),
664 module->name(), required_mib, max_alloc_mib);
665 return pixelpipe_process_on_CPU(pipe, piece, previous_piece, tiling, pixelpipe_flow,
666 cache_output, cpu_input_entry, output_entry);
667 }
668
/* Pick the host input for the CPU run: reuse it when present, otherwise rebuild it
 * from the device image; with neither available the function gives up with 1. */
669 if(!IS_NULL_PTR(input))
670 {
672 "[dev_pixelpipe] %s GPU error fallback will reuse host input\n",
673 module->name());
674 }
675 else if(!IS_NULL_PTR(cl_mem_input))
676 {
677 if(_gpu_init_input(pipe, &input, &cl_mem_input, piece, tiling,
678 cpu_input_entry, output_entry))
679 {
680 if(borrowed_cl_mem_input)
681 {
682 dt_dev_pixelpipe_cache_return_cl_payload(cpu_input_entry, cl_mem_input);
683 cl_mem_input = NULL;
684 borrowed_cl_mem_input = FALSE;
685 }
686 else
687 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_input, cpu_input_entry, NULL,
688 dt_dev_pixelpipe_cache_gpu_device_buffer(pipe, cpu_input_entry));
689 return 1;
690 }
691 }
692 else if(IS_NULL_PTR(input))
693 {
695 "[dev_pixelpipe] %s CPU fallback has no input buffer (cache allocation failed?)\n",
696 module->name());
697 return 1;
698 }
699
700 if(borrowed_cl_mem_input)
701 {
702 dt_dev_pixelpipe_cache_return_cl_payload(cpu_input_entry, cl_mem_input);
703 cl_mem_input = NULL;
704 }
705 else
706 dt_dev_pixelpipe_cache_release_cl_buffer(&cl_mem_input, cpu_input_entry, input,
707 dt_dev_pixelpipe_cache_gpu_device_buffer(pipe, cpu_input_entry));
708
709 return pixelpipe_process_on_CPU(pipe, piece, previous_piece, tiling, pixelpipe_flow,
710 cache_output, cpu_input_entry, output_entry);
711}
712
713#else
714
716 const dt_dev_pixelpipe_iop_t *previous_piece,
718 dt_pixelpipe_flow_t *pixelpipe_flow,
719 gboolean *const cache_output,
720 dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
721{
/* Variant compiled when HAVE_OPENCL is not defined: there is no GPU path, so every
 * argument is forwarded unchanged to pixelpipe_process_on_CPU() and its result is
 * returned as is.
 * NOTE(review): original lines 715 and 717 (start of the signature and the tiling
 * parameter) are missing from this listing. */
722 return pixelpipe_process_on_CPU(pipe, piece, previous_piece, tiling, pixelpipe_flow,
723 cache_output, input_entry, output_entry);
724}
725
726#endif
static void error(char *msg)
Definition ashift_lsd.c:202
#define TRUE
Definition ashift_lsd.c:162
#define FALSE
Definition ashift_lsd.c:158
int dt_develop_blend_process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out)
Definition blend.c:1087
dt_iop_colorspace_type_t dt_develop_blend_colorspace(const dt_dev_pixelpipe_iop_t *const piece, dt_iop_colorspace_type_t cst)
Definition blend.c:179
int dt_develop_blend_process(struct dt_iop_module_t *self, dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid)
Definition blend.c:641
dt_iop_colorspace_type_t
@ IOP_CS_RAW
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
void dt_control_log(const char *msg,...)
Definition control.c:761
darktable_t darktable
Definition darktable.c:181
void dt_print(dt_debug_thread_t thread, const char *msg,...)
Definition darktable.c:1542
@ DT_DEBUG_OPENCL
Definition darktable.h:722
@ DT_DEBUG_TILING
Definition darktable.h:739
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453
#define dt_pixelpipe_cache_alloc_align_float(pixels, pipe)
Definition darktable.h:442
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281
dt_dev_pixelpipe_display_mask_t
Definition develop.h:116
@ DT_DEV_PIXELPIPE_DISPLAY_ANY
Definition develop.h:138
static gboolean dt_iop_colorspace_is_rgb(const dt_iop_colorspace_type_t cst)
Definition imageop.h:213
@ IOP_FLAGS_SUPPORTS_BLENDING
Definition imageop.h:167
@ IOP_FLAGS_TAKE_NO_INPUT
Definition imageop.h:176
void dt_ioppr_transform_image_colorspace(struct dt_iop_module_t *self, const float *const image_in, float *const image_out, const int width, const int height, const int cst_from, const int cst_to, int *converted_cst, const dt_iop_order_iccprofile_info_t *const profile_info)
dt_iop_order_iccprofile_info_t * dt_ioppr_get_pipe_work_profile_info(const struct dt_dev_pixelpipe_t *pipe)
int dt_ioppr_transform_image_colorspace_cl(struct dt_iop_module_t *self, const int devid, cl_mem dev_img_in, cl_mem dev_img_out, const int width, const int height, const int cst_from, const int cst_to, int *converted_cst, const dt_iop_order_iccprofile_info_t *const profile_info)
cl_ulong dt_opencl_get_device_available(const int devid)
Definition opencl.c:2664
int dt_opencl_is_inited(void)
Definition opencl.c:2730
int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst, size_t *region)
Definition opencl.c:2261
gboolean dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height, const unsigned bpp, const float factor, const size_t overhead)
Definition opencl.c:2683
cl_ulong dt_opencl_get_device_memalloc(const int devid)
Definition opencl.c:2677
gboolean dt_opencl_finish(const int devid)
Definition opencl.c:1347
#define ROUNDUPDHT(a, b)
Definition opencl.h:82
#define ROUNDUPDWD(a, b)
Definition opencl.h:81
@ DT_DEV_PIXELPIPE_PREVIEW
Definition pixelpipe.h:40
void * dt_dev_pixelpipe_cache_borrow_cl_payload(dt_pixel_cache_entry_t *entry, int devid, int width, int height, int bpp)
Borrow a cached OpenCL payload attached to a cache entry.
void * dt_pixel_cache_entry_get_data(dt_pixel_cache_entry_t *entry)
void * dt_pixel_cache_alloc(dt_dev_pixelpipe_cache_t *cache, dt_pixel_cache_entry_t *cache_entry)
Actually allocate the memory buffer attached to the cache entry once you create it with dt_dev_pixelp...
void dt_dev_pixelpipe_cache_release_cl_buffer(void **cl_mem_buffer, dt_pixel_cache_entry_t *cache_entry, void *host_ptr, const gboolean cache_device)
Release or cache an OpenCL image associated with a host cache line.
void * dt_dev_pixelpipe_cache_get_cl_buffer(int devid, void *const host_ptr, const dt_iop_roi_t *roi, const size_t bpp, dt_iop_module_t *module, const char *message, dt_pixel_cache_entry_t *cache_entry, gboolean *out_reused, void *keep)
void * dt_dev_pixelpipe_cache_alloc_cl_device_buffer(int devid, const dt_iop_roi_t *roi, const size_t bpp, const dt_iop_module_t *module, const char *message, void *keep)
int dt_dev_pixelpipe_cache_prepare_cl_input(dt_dev_pixelpipe_t *pipe, dt_iop_module_t *module, float *input, void **cl_mem_input, const dt_iop_roi_t *roi_in, const size_t in_bpp, dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t **locked_input_entry, void *keep)
Prepare/obtain the OpenCL input image for a module.
void dt_dev_pixelpipe_cache_wrlock_entry(dt_dev_pixelpipe_cache_t *cache, gboolean lock, dt_pixel_cache_entry_t *cache_entry)
Lock or release the write lock on the entry.
int dt_dev_pixelpipe_cache_sync_cl_buffer(const int devid, void *host_ptr, void *cl_mem_buffer, const dt_iop_roi_t *roi, int cl_mode, size_t bpp, dt_iop_module_t *module, const char *message)
Synchronize between host memory and a pinned OpenCL image.
void dt_dev_pixelpipe_cache_return_cl_payload(dt_pixel_cache_entry_t *entry, void *mem)
Return a borrowed cached OpenCL payload to its cache entry.
gboolean dt_dev_pixelpipe_cache_flush_host_pinned_image(dt_dev_pixelpipe_cache_t *cache, void *host_ptr, dt_pixel_cache_entry_t *entry_hint, int devid)
Drop cached pinned OpenCL images associated with a given host buffer.
void dt_dev_pixelpipe_cache_rdlock_entry(dt_dev_pixelpipe_cache_t *cache, gboolean lock, dt_pixel_cache_entry_t *cache_entry)
Lock or release the read lock on the entry.
void dt_dev_pixelpipe_cache_flush_clmem(dt_dev_pixelpipe_cache_t *cache, const int devid)
Release cached OpenCL buffers for a single device.
float * dt_dev_pixelpipe_cache_restore_cl_buffer(dt_dev_pixelpipe_t *pipe, float *input, void *cl_mem_input, const dt_iop_roi_t *roi_in, dt_iop_module_t *module, const size_t in_bpp, dt_pixel_cache_entry_t *input_entry, const char *message)
Force device → host resynchronization of the pixelpipe input cache line.
Pixelpipe cache for storing intermediate results in the pixelpipe.
int pixelpipe_process_on_CPU(dt_dev_pixelpipe_t *pipe, const dt_dev_pixelpipe_iop_t *piece, const dt_dev_pixelpipe_iop_t *previous_piece, dt_develop_tiling_t *tiling, dt_pixelpipe_flow_t *pixelpipe_flow, gboolean *const cache_output, dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
int pixelpipe_process_on_GPU(dt_dev_pixelpipe_t *pipe, const dt_dev_pixelpipe_iop_t *piece, const dt_dev_pixelpipe_iop_t *previous_piece, dt_develop_tiling_t *tiling, dt_pixelpipe_flow_t *pixelpipe_flow, gboolean *const cache_output, dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
static int _is_opencl_supported(dt_dev_pixelpipe_t *pipe, const dt_dev_pixelpipe_iop_t *piece, dt_iop_module_t *module)
static int _gpu_early_cpu_fallback_if_unsupported(dt_dev_pixelpipe_t *pipe, float **input, void **cl_mem_input, gboolean *const borrowed_cl_mem_input, const dt_dev_pixelpipe_iop_t *piece, const dt_dev_pixelpipe_iop_t *previous_piece, dt_develop_tiling_t *tiling, dt_pixelpipe_flow_t *pixelpipe_flow, gboolean *const cache_output, dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
void dt_dev_pixelpipe_gpu_flush_host_pinned_images(dt_dev_pixelpipe_t *pipe, void *host_ptr, dt_pixel_cache_entry_t *cache_entry, const char *reason)
static int _gpu_init_input(dt_dev_pixelpipe_t *pipe, float **input, void **cl_mem_input, const dt_dev_pixelpipe_iop_t *piece, dt_develop_tiling_t *tiling, dt_pixel_cache_entry_t *input_entry, dt_pixel_cache_entry_t *output_entry)
dt_pixelpipe_blend_transform_t dt_dev_pixelpipe_transform_for_blend(const dt_iop_module_t *const self, const dt_dev_pixelpipe_iop_t *const piece, const dt_iop_buffer_dsc_t *const output_dsc)
gboolean dt_dev_pixelpipe_cache_gpu_device_buffer(const dt_dev_pixelpipe_t *pipe, const dt_pixel_cache_entry_t *cache_entry)
void dt_dev_pixelpipe_debug_dump_module_io(dt_dev_pixelpipe_t *pipe, dt_iop_module_t *module, const char *stage, const gboolean is_cl, const dt_iop_buffer_dsc_t *in_dsc, const dt_iop_buffer_dsc_t *out_dsc, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out, const size_t in_bpp, const size_t out_bpp, const int cst_before, const int cst_after)
dt_pixelpipe_flow_t
@ PIXELPIPE_FLOW_PROCESSED_ON_CPU
@ PIXELPIPE_FLOW_PROCESSED_WITH_TILING
@ PIXELPIPE_FLOW_PROCESSED_ON_GPU
@ PIXELPIPE_FLOW_BLENDED_ON_CPU
@ PIXELPIPE_FLOW_BLENDED_ON_GPU
dt_pixelpipe_blend_transform_t
@ DT_DEV_PIXELPIPE_BLEND_TRANSFORM_INPUT
@ DT_DEV_PIXELPIPE_BLEND_TRANSFORM_NONE
@ DT_DEV_PIXELPIPE_BLEND_TRANSFORM_OUTPUT
struct dt_dev_pixelpipe_cache_t * pixelpipe_cache
Definition darktable.h:790
dt_iop_buffer_dsc_t dsc_out
dt_iop_buffer_dsc_t dsc_in
dt_atomic_int realtime
dt_dev_pixelpipe_type_t type
struct dt_develop_t * dev
dt_image_t image_storage
Definition develop.h:259
dt_iop_buffer_dsc_t dsc
Definition image.h:337
#define MAX(a, b)
Definition thinplate.c:29