Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
tiling.c
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2011-2014, 2016-2017 Ulrich Pegelow.
4 Copyright (C) 2012 Richard Wonka.
5 Copyright (C) 2012-2014, 2016, 2018 Tobias Ellinghaus.
6 Copyright (C) 2013-2014, 2016 Roman Lebedev.
7 Copyright (C) 2013 Simon Spannagel.
8 Copyright (C) 2014 Bruce Guenter.
9 Copyright (C) 2016 Pedro Côrte-Real.
10 Copyright (C) 2018 Edgardo Hoszowski.
11 Copyright (C) 2019 Andreas Schneider.
12 Copyright (C) 2020-2021 Hubert Kowalski.
13 Copyright (C) 2020-2021 Pascal Obry.
14 Copyright (C) 2020-2021 Ralf Brown.
15 Copyright (C) 2021, 2023, 2025-2026 Aurélien PIERRE.
16 Copyright (C) 2021-2022 Hanno Schwalm.
17 Copyright (C) 2022 Martin Bařinka.
18 Copyright (C) 2024 Alynx Zhou.
19
20 darktable is free software: you can redistribute it and/or modify
21 it under the terms of the GNU General Public License as published by
22 the Free Software Foundation, either version 3 of the License, or
23 (at your option) any later version.
24
25 darktable is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 GNU General Public License for more details.
29
30 You should have received a copy of the GNU General Public License
31 along with darktable. If not, see <http://www.gnu.org/licenses/>.
32*/
33
34
35#include "common/darktable.h"
36#include "develop/tiling.h"
37#include "common/opencl.h"
38#include "control/control.h"
39#include "develop/blend.h"
40#include "develop/pixelpipe.h"
41
42#include <assert.h>
43#include <math.h>
44#include <stdlib.h>
45#include <string.h>
46#include <strings.h>
47#include <unistd.h>
48
/* Clamp a to the closed range [mn, mx].
   This is a function-like macro: each argument can be evaluated more than once,
   so do not pass expressions with side effects. */
49#define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
50
51
52/* this defines an additional alignment requirement for opencl image width.
53 It can have strong effects on processing speed. Reasonable values are a
54 power of 2. set to 1 for no effect. */
55#define CL_ALIGNMENT ((piece->dsc_in.filters != 9u) ? 4 : 1)
56
57/* parameter RESERVE for extended roi_in sizes due to inaccuracies when doing
58 roi_out -> roi_in estimations.
59 Needs to be increased if tiling fails due to insufficient buffer sizes. */
60#define RESERVE 5
61
/* Greatest common divisor of a and b, computed with Euclid's algorithm.
   The result is never 0: if both inputs are 0 the function returns 1,
   which keeps it usable as a divisor. */
static unsigned _gcd(unsigned a, unsigned b)
{
  while(b != 0)
  {
    const unsigned rem = a % b;
    a = b;
    b = rem;
  }

  /* a is now the gcd, or 0 when both inputs were 0 */
  return (a == 0) ? 1 : a;
}
74
/* Least common multiple of a and b.
   Returns 0 when either input is 0.

   The division by the gcd is done before the multiplication: the gcd always
   divides a exactly, so the result is unchanged, but the intermediate value no
   longer depends on `unsigned long` being wider than `unsigned` (it is not on
   LLP64 platforms, where the widened product could wrap before the division). */
static unsigned _lcm(unsigned a, unsigned b)
{
  return (a / _gcd(a, b)) * b;
}
80
81
/* Smaller of two ints. */
static inline int _min(int a, int b)
{
  if(b < a) return b;
  return a;
}
86
/* Larger of two ints. */
static inline int _max(int a, int b)
{
  if(a < b) return b;
  return a;
}
91
92
/* Round n up to the next multiple of a (a > 0).
   A value that is already a multiple of a is returned unchanged; previously it
   was bumped by a full extra step of a, which inflated overlaps and tile sizes.
   Negative values are rounded towards zero, i.e. also upwards. */
static inline int _align_up(int n, int a)
{
  const int rem = n % a;

  /* rem > 0: step up to the next multiple.
     rem <= 0: n is aligned, or negative, where dropping the remainder rounds up. */
  return (rem > 0) ? n + (a - rem) : n - rem;
}
/* Drop the remainder of n / a, i.e. move n to the multiple of a that lies
   towards zero (a must be non-zero). */
static inline int _align_down(int n, int a)
{
  const int whole_steps = n / a;
  return whole_steps * a;
}
/* Move n to a nearby multiple of a: upwards when the remainder is strictly
   larger than a/2 (integer division), otherwise down by the remainder. */
static inline int _align_close(int n, int a)
{
  const int rem = n % a;

  if(rem > a / 2) return n + (a - rem);

  return n - rem;
}
107
/*
  Upper bound on the number of tiles a module may be split into before the
  tiling code gives up.
  The value is completely arbitrary... make that a pref ?
*/
static inline int _maximum_number_tiles()
{
  enum { tile_limit = 10000 };
  return tile_limit;
}
115
/* Write one region of interest to stderr on a single line:
   x, y, right edge (x + width), bottom edge (y + height), (width x height),
   scale, followed by the caller-supplied label.
   NOTE(review): line 118 of the listing is absent here; it may hold a
   debug-flag guard around the fprintf -- confirm against the real file. */
116static inline void _print_roi(const dt_iop_roi_t *roi, const char *label)
117{
119 fprintf(stderr," {%5d %5d ->%5d %5d (%5dx%5d) %.6f } %s\n",
120 roi->x, roi->y, roi->x + roi->width, roi->y + roi->height, roi->width, roi->height, roi->scale, label);
121}
122
123
#if 0
/* Currently compiled out. Constraint hook with the signature _simplex() expects:
   replaces each of the first four coordinates by its absolute value and, when
   that exceeds 1.0, by (1.0 - value). The n argument is not used. */
static void
_nm_constraints(double x[], int n)
{
  for(int k = 0; k < 4; k++)
  {
    const double magnitude = fabs(x[k]);
    x[k] = (magnitude > 1.0) ? 1.0 - magnitude : magnitude;
  }
}
#endif
140
141static double _nm_fitness(double x[], void *rest[])
142{
143 struct dt_iop_module_t *self = (struct dt_iop_module_t *)rest[0];
144 const struct dt_dev_pixelpipe_iop_t *piece = (const struct dt_dev_pixelpipe_iop_t *)rest[1];
145 struct dt_iop_roi_t *iroi = (struct dt_iop_roi_t *)rest[2];
146 struct dt_iop_roi_t *oroi = (struct dt_iop_roi_t *)rest[3];
147 const struct dt_dev_pixelpipe_t *pipe = (const struct dt_dev_pixelpipe_t *)rest[4];
148
149 dt_iop_roi_t oroi_test = *oroi;
150 oroi_test.x = x[0] * piece->iwidth;
151 oroi_test.y = x[1] * piece->iheight;
152 oroi_test.width = x[2] * piece->iwidth;
153 oroi_test.height = x[3] * piece->iheight;
154
155 dt_iop_roi_t iroi_probe = *iroi;
156 dt_dev_pixelpipe_iop_t piece_copy = *piece;
157 self->modify_roi_in(self, pipe, &piece_copy, &oroi_test, &iroi_probe);
158
159 double fitness = 0.0;
160
161 fitness += (double)(iroi_probe.x - iroi->x) * (iroi_probe.x - iroi->x);
162 fitness += (double)(iroi_probe.y - iroi->y) * (iroi_probe.y - iroi->y);
163 fitness += (double)(iroi_probe.width - iroi->width) * (iroi_probe.width - iroi->width);
164 fitness += (double)(iroi_probe.height - iroi->height) * (iroi_probe.height - iroi->height);
165
166 return fitness;
167}
168
169
170/* We use a Nelder-Mead simplex algorithm based on an implementation of Michael F. Hutt.
171 It is covered by the following copyright notice: */
172/*
173 * Program: nmsimplex.c
174 * Author : Michael F. Hutt
175 * http://www.mikehutt.com
176 * 11/3/97
177 *
178 * An implementation of the Nelder-Mead simplex method.
179 *
180 * Copyright (c) 1997-2011 <Michael F. Hutt>
181 *
182 * Permission is hereby granted, free of charge, to any person obtaining
183 * a copy of this software and associated documentation files (the
184 * "Software"), to deal in the Software without restriction, including
185 * without limitation the rights to use, copy, modify, merge, publish,
186 * distribute, sublicense, and/or sell copies of the Software, and to
187 * permit persons to whom the Software is furnished to do so, subject to
188 * the following conditions:
189 *
190 * The above copyright notice and this permission notice shall be
191 * included in all copies or substantial portions of the Software.
192 *
193 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
194 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
195 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
196 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
197 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
198 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
199 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
200 *
201 */
202
203#define MAX_IT 1000 /* maximum number of iterations */
204#define ALPHA 1.0 /* reflection coefficient */
205#define BETA 0.5 /* contraction coefficient */
206#define GAMMA 2.0 /* expansion coefficient */
207
208static int _simplex(double (*objfunc)(double[], void *[]), double start[], int n, double EPSILON,
209 double scale, int maxiter, void (*constrain)(double[], int n), void *rest[])
210{
211
212 int vs; /* vertex with smallest value */
213 int vh; /* vertex with next smallest value */
214 int vg; /* vertex with largest value */
215
216 int i, j = 0, m, row;
217 int itr; /* track the number of iterations */
218
219 double **v; /* holds vertices of simplex */
220 double pn, qn; /* values used to create initial simplex */
221 double *f; /* value of function at each vertex */
222 double fr; /* value of function at reflection point */
223 double fe; /* value of function at expansion point */
224 double fc; /* value of function at contraction point */
225 double *vr; /* reflection - coordinates */
226 double *ve; /* expansion - coordinates */
227 double *vc; /* contraction - coordinates */
228 double *vm; /* centroid - coordinates */
229
230 double fsum, favg, s, cent;
231
232 /* dynamically allocate arrays */
233
234 /* allocate the rows of the arrays */
235 v = (double **)malloc(sizeof(double *) * (n + 1));
236 f = (double *)malloc(sizeof(double) * (n + 1));
237 vr = (double *)malloc(sizeof(double) * n);
238 ve = (double *)malloc(sizeof(double) * n);
239 vc = (double *)malloc(sizeof(double) * n);
240 vm = (double *)malloc(sizeof(double) * n);
241
242 /* allocate the columns of the arrays */
243 for(i = 0; i <= n; i++)
244 {
245 v[i] = (double *)malloc(sizeof(double) * n);
246 }
247
248 /* create the initial simplex */
249 /* assume one of the vertices is 0,0 */
250
251 pn = scale * (sqrt(n + 1) - 1 + n) / (n * sqrt(2));
252 qn = scale * (sqrt(n + 1) - 1) / (n * sqrt(2));
253
254 for(i = 0; i < n; i++)
255 {
256 v[0][i] = start[i];
257 }
258
259 for(i = 1; i <= n; i++)
260 {
261 for(j = 0; j < n; j++)
262 {
263 if(i - 1 == j)
264 {
265 v[i][j] = pn + start[j];
266 }
267 else
268 {
269 v[i][j] = qn + start[j];
270 }
271 }
272 }
273
274 if(!IS_NULL_PTR(constrain))
275 {
276 constrain(v[j], n);
277 }
278 /* find the initial function values */
279 for(j = 0; j <= n; j++)
280 {
281 f[j] = objfunc(v[j], rest);
282 }
283
284#if 0
285 /* print out the initial values */
286 printf ("Initial Values\n");
287 for (j = 0; j <= n; j++)
288 {
289 for (i = 0; i < n; i++)
290 {
291 printf ("%f %f\n", v[j][i], f[j]);
292 }
293 }
294#endif
295
296 /* begin the main loop of the minimization */
297 for(itr = 1; itr <= maxiter; itr++)
298 {
299 /* find the index of the largest value */
300 vg = 0;
301 for(j = 0; j <= n; j++)
302 {
303 if(f[j] > f[vg])
304 {
305 vg = j;
306 }
307 }
308
309 /* find the index of the smallest value */
310 vs = 0;
311 for(j = 0; j <= n; j++)
312 {
313 if(f[j] < f[vs])
314 {
315 vs = j;
316 }
317 }
318
319 /* find the index of the second largest value */
320 vh = vs;
321 for(j = 0; j <= n; j++)
322 {
323 if(f[j] > f[vh] && f[j] < f[vg])
324 {
325 vh = j;
326 }
327 }
328
329 /* calculate the centroid */
330 for(j = 0; j <= n - 1; j++)
331 {
332 cent = 0.0;
333 for(m = 0; m <= n; m++)
334 {
335 if(m != vg)
336 {
337 cent += v[m][j];
338 }
339 }
340 vm[j] = cent / n;
341 }
342
343 /* reflect vg to new vertex vr */
344 for(j = 0; j <= n - 1; j++)
345 {
346 /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j]; */
347 vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
348 }
349 if(!IS_NULL_PTR(constrain))
350 {
351 constrain(vr, n);
352 }
353 fr = objfunc(vr, rest);
354
355 if(fr < f[vh] && fr >= f[vs])
356 {
357 for(j = 0; j <= n - 1; j++)
358 {
359 v[vg][j] = vr[j];
360 }
361 f[vg] = fr;
362 }
363
364 /* investigate a step further in this direction */
365 if(fr < f[vs])
366 {
367 for(j = 0; j <= n - 1; j++)
368 {
369 /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j]; */
370 ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
371 }
372 if(!IS_NULL_PTR(constrain))
373 {
374 constrain(ve, n);
375 }
376 fe = objfunc(ve, rest);
377
378 /* by making fe < fr as opposed to fe < f[vs],
379 Rosenbrocks function takes 63 iterations as opposed
380 to 64 when using double variables. */
381
382 if(fe < fr)
383 {
384 for(j = 0; j <= n - 1; j++)
385 {
386 v[vg][j] = ve[j];
387 }
388 f[vg] = fe;
389 }
390 else
391 {
392 for(j = 0; j <= n - 1; j++)
393 {
394 v[vg][j] = vr[j];
395 }
396 f[vg] = fr;
397 }
398 }
399
400 /* check to see if a contraction is necessary */
401 if(fr >= f[vh])
402 {
403 if(fr < f[vg] && fr >= f[vh])
404 {
405 /* perform outside contraction */
406 for(j = 0; j <= n - 1; j++)
407 {
408 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
409 vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
410 }
411 if(!IS_NULL_PTR(constrain))
412 {
413 constrain(vc, n);
414 }
415 fc = objfunc(vc, rest);
416 }
417 else
418 {
419 /* perform inside contraction */
420 for(j = 0; j <= n - 1; j++)
421 {
422 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
423 vc[j] = vm[j] - BETA * (vm[j] - v[vg][j]);
424 }
425 if(!IS_NULL_PTR(constrain))
426 {
427 constrain(vc, n);
428 }
429 fc = objfunc(vc, rest);
430 }
431
432
433 if(fc < f[vg])
434 {
435 for(j = 0; j <= n - 1; j++)
436 {
437 v[vg][j] = vc[j];
438 }
439 f[vg] = fc;
440 }
441 /* at this point the contraction is not successful,
442 we must halve the distance from vs to all the
443 vertices of the simplex and then continue.
444 10/31/97 - modified to account for ALL vertices.
445 */
446 else
447 {
448 for(row = 0; row <= n; row++)
449 {
450 if(row != vs)
451 {
452 for(j = 0; j <= n - 1; j++)
453 {
454 v[row][j] = v[vs][j] + (v[row][j] - v[vs][j]) / 2.0;
455 }
456 }
457 }
458 if(!IS_NULL_PTR(constrain))
459 {
460 constrain(v[vg], n);
461 }
462 f[vg] = objfunc(v[vg], rest);
463 if(!IS_NULL_PTR(constrain))
464 {
465 constrain(v[vh], n);
466 }
467 f[vh] = objfunc(v[vh], rest);
468 }
469 }
470
471#if 0
472 /* print out the value at each iteration */
473 printf ("Iteration %d\n", itr);
474 for (j = 0; j <= n; j++)
475 {
476 for (i = 0; i < n; i++)
477 {
478 printf ("%f %f\n", v[j][i], f[j]);
479 }
480 }
481#endif
482
483 /* test for convergence */
484 fsum = 0.0;
485 for(j = 0; j <= n; j++)
486 {
487 fsum += f[j];
488 }
489 favg = fsum / (n + 1);
490 s = 0.0;
491 for(j = 0; j <= n; j++)
492 {
493 s += pow((f[j] - favg), 2.0) / (n);
494 }
495 s = sqrt(s);
496 if(s < EPSILON) break;
497 }
498 /* end main loop of the minimization */
499
500 /* find the index of the smallest value */
501 vs = 0;
502 for(j = 0; j <= n; j++)
503 {
504 if(f[j] < f[vs])
505 {
506 vs = j;
507 }
508 }
509
510#if 0
511 printf ("The minimum was found at\n");
512 for (j = 0; j < n; j++)
513 {
514 printf ("%e\n", v[vs][j]);
515 start[j] = v[vs][j];
516 }
517 double min = objfunc (v[vs], rest);
518 printf ("Function value at minimum %f\n", min);
519 k++;
520 printf ("%d Function Evaluations\n", k);
521 printf ("%d Iterations through program\n", itr);
522#endif
523
524 dt_free(f);
525 dt_free(vr);
526 dt_free(ve);
527 dt_free(vc);
528 dt_free(vm);
529 for(i = 0; i <= n; i++)
530 {
531 dt_free(v[i]);
532 }
533 dt_free(v);
534 return itr;
535}
536
537
538static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
539 const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi,
540 dt_iop_roi_t *oroi, int delta)
541{
542 void *rest[5] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi, (void *)pipe };
543 double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
544 (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
545 double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
546 int maxiter = 1000;
547
548 int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
549
550 dt_vprint(DT_DEBUG_TILING, "[_nm_fit_output_to_input_roi] _simplex: %d, delta: %d, epsilon: %f\n", iter, delta, epsilon);
551
552 oroi->x = start[0] * piece->iwidth;
553 oroi->y = start[1] * piece->iheight;
554 oroi->width = start[2] * piece->iwidth;
555 oroi->height = start[3] * piece->iheight;
556
557 return (iter <= maxiter);
558}
559
560
561
562/* find a matching oroi_full by probing start value of oroi and get corresponding input roi into iroi_probe.
563 We search in two steps. first by a simplicistic iterative search which will succeed in most cases.
564 If this does not converge, we do a downhill simplex (nelder-mead) fitting */
565static int _fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
566 const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi,
567 dt_iop_roi_t *oroi, int delta, int iter)
568{
569 dt_iop_roi_t iroi_probe = *iroi;
570 dt_iop_roi_t save_oroi = *oroi;
571 dt_dev_pixelpipe_iop_t piece_copy = *piece;
572
573 // try to go the easy way. this works in many cases where output is
574 // just like input, only scaled down
575 self->modify_roi_in(self, pipe, &piece_copy, oroi, &iroi_probe);
576 while((abs((int)iroi_probe.x - (int)iroi->x) > delta || abs((int)iroi_probe.y - (int)iroi->y) > delta
577 || abs((int)iroi_probe.width - (int)iroi->width) > delta
578 || abs((int)iroi_probe.height - (int)iroi->height) > delta) && iter > 0)
579 {
580 _print_roi(&iroi_probe, "tile iroi_probe");
581 _print_roi(oroi, "tile oroi old");
582
583 oroi->x += (iroi->x - iroi_probe.x) * oroi->scale / iroi->scale;
584 oroi->y += (iroi->y - iroi_probe.y) * oroi->scale / iroi->scale;
585 oroi->width += (iroi->width - iroi_probe.width) * oroi->scale / iroi->scale;
586 oroi->height += (iroi->height - iroi_probe.height) * oroi->scale / iroi->scale;
587
588 _print_roi(oroi, "tile oroi new");
589
590 piece_copy = *piece;
591 self->modify_roi_in(self, pipe, &piece_copy, oroi, &iroi_probe);
592 iter--;
593 }
594
595 if(iter > 0) return TRUE;
596
597 *oroi = save_oroi;
598
599 // simplicistic approach did not converge.
600 // try simplex downhill fitting now.
601 // it's crucial that we have a good starting point in oroi, else this
602 // will not converge as well.
603 int fit = _nm_fit_output_to_input_roi(self, pipe, piece, iroi, oroi, delta);
604 return fit;
605}
606
607
/* Tiled host-memory processing for modules whose input and output roi have the
   same geometry (pixel to pixel).

   ivoid/ovoid are the full input/output buffers, in_bpp the bytes per input
   pixel; bytes per output pixel come from piece->dsc_out.bpp.

   Outline:
   - ask the module for its tiling requirements (tiling_callback),
   - skip tiling when no memory saving is to be expected (-> fallback),
   - derive a tile size from the available memory, then adjust it for overlap
     and alignment,
   - for each tile: copy the input region into a tile buffer, run the module's
     process() with tile-sized rois, copy the part not covered by overlap back.

   Returns 0 after all tiles were processed, the module's error code when
   process() fails on a tile, or the result of a plain process() call on the
   full buffers when tiling is skipped or cannot be set up. */
608/* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
609static int _default_process_tiling_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
610 const struct dt_dev_pixelpipe_iop_t *piece,
611 const void *const ivoid, void *const ovoid,
612 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
613 const int in_bpp)
614{
/* the pipe is received as const but its tiling flag is written below */
615 dt_dev_pixelpipe_t *const mutable_pipe = (dt_dev_pixelpipe_t *)pipe;
616 void *input = NULL;
617 void *output = NULL;
618 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] **** tiling module '%s' for image with size %dx%d --> %dx%d\n",
619 self->op, roi_in->width, roi_in->height, roi_out->width, roi_out->height);
620 const int out_bpp = piece->dsc_out.bpp;
621
/* row strides of the full buffers, in bytes */
622 const int ipitch = roi_in->width * in_bpp;
623 const int opitch = roi_out->width * out_bpp;
624 const int max_bpp = _max(in_bpp, out_bpp);
625
626 /* get tiling requirements of module */
/* NOTE(review): the declaration of `tiling` (listing line 627) is not visible
   here; below it is used as a struct with the members factor, overhead,
   maxbuf, overlap, xalign and yalign. */
628 self->tiling_callback(self, pipe, piece, &tiling);
629
630 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
631 */
632 if((tiling.factor < 2.2f)
633 && (tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp))
634 {
635 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] no need to use tiling for module '%s' as no real "
636 "memory saving to be expected\n", self->op);
637 goto fallback;
638 }
639
640 /* calculate optimal size of tiles */
641 float available = dt_get_available_mem();
642 assert(available >= 500.0f * 1024.0f * 1024.0f);
643 /* correct for size of ivoid and ovoid which are needed on top of tiling */
644 available = fmaxf(available - ((float)roi_out->width * roi_out->height * out_bpp)
645 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
646 0);
647
648 /* Size the tile from the memory left in the host cache.
649 Using the generic singlebuffer floor here can oversize tiles for modules whose
650 scratch buffers scale with tiling.factor, which defeats tiling and makes the
651 tile-local allocations fail later on. */
652 const float factor = fmaxf(tiling.factor, 1.0f);
653 const float maxbuf = fmaxf(tiling.maxbuf, 1.0f);
654 const float singlebuffer = available / factor;
655
/* start with one tile covering the whole image and shrink from there */
656 int width = roi_in->width;
657 int height = roi_in->height;
658
659 /* shrink tile size in case it would exceed singlebuffer size */
660 if((float)width * height * max_bpp * maxbuf > singlebuffer)
661 {
662 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
663
/* moderate shrink (scale >= 0.333): reduce only the longer side;
   otherwise scale both sides by sqrt(scale) */
664 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
665 if(width < height && scale >= 0.333f)
666 {
667 height = floorf(height * scale);
668 }
669 else if(height <= width && scale >= 0.333f)
670 {
671 width = floorf(width * scale);
672 }
673 else
674 {
675 width = floorf(width * sqrtf(scale));
676 height = floorf(height * sqrtf(scale));
677 }
678 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_ptp] buffer exceeds singlebuffer, corrected to %dx%d\n",
679 width, height);
680 }
681
682 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
683 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
684 {
685 width = height = floorf(sqrtf((float)width * height));
686 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_roi] use squares because of overlap, corrected to %dx%d\n",
687 width, height);
688 }
689
690 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
691 Modules will report alignment requirements via xalign and yalign within tiling_callback().
692 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
693 direction.
694 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
695 that is identical to image width/height no special alignment is needed. */
696
697 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
698
699 assert(xyalign != 0);
700
701 /* properly align tile width and height by making them smaller if needed */
702 if(width < roi_in->width) width = (width / xyalign) * xyalign;
703 if(height < roi_in->height) height = (height / xyalign) * xyalign;
704
705 /* also make sure that overlap follows alignment rules by making it wider when needed */
706 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
707 : tiling.overlap;
708
/* tile_wd/tile_ht are the step between neighbouring tile origins: the tile
   size minus the overlap on both sides, but at least 1 */
709 /* calculate effective tile size */
710 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
711 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
712
713 /* calculate number of tiles */
714 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
715 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
716
717 /* sanity check: don't run wild on too many tiles */
718 if(tiles_x * tiles_y > _maximum_number_tiles())
719 {
720 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
721 self->op, tiles_x, tiles_y);
722 goto error;
723 }
724
725 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] (%dx%d) tiles with max dimensions %dx%d and overlap %d\n",
726 tiles_x, tiles_y, width, height, overlap);
727
728 /* reserve input and output buffers for tiles */
/* NOTE(review): the calls that assign `input` and `output` (listing lines 729
   and 738) are not visible here; only their trailing arguments are. */
730 (size_t)width * height * in_bpp,
731 pipe->type);
732 if(IS_NULL_PTR(input))
733 {
734 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] could not alloc input buffer for module '%s'\n",
735 self->op);
736 goto error;
737 }
739 (size_t)width * height * out_bpp,
740 pipe->type);
741 if(IS_NULL_PTR(output))
742 {
743 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] could not alloc output buffer for module '%s'\n",
744 self->op);
745 goto error;
746 }
747
748 /* iterate over tiles */
749 for(size_t tx = 0; tx < tiles_x; tx++)
750 {
/* tiles at the right/bottom border are cut to what is left of the image */
751 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
752 for(size_t ty = 0; ty < tiles_y; ty++)
753 {
754 mutable_pipe->tiling = 1;
755
756 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
757
758 /* no need to process end-tiles that are smaller than the total overlap area */
759 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
760
761 /* origin and region of effective part of tile, which we want to store later */
762 size_t origin[] = { 0, 0, 0 };
763 size_t region[] = { wd, ht, 1 };
764
765 /* roi_in and roi_out for process_cl on subbuffer */
766 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
767 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
768
769 /* offsets of tile into ivoid and ovoid */
770 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
771 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
772
773 dt_print(DT_DEBUG_TILING, "[default_process_tiling_ptp] tile (%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT ") with %" G_GSIZE_FORMAT "x%" G_GSIZE_FORMAT " at origin [%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT "]\n",
774 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
775
/* rows are copied one by one: the tile buffer is packed (stride wd * in_bpp)
   while the source uses the full image stride ipitch */
776/* prepare input tile buffer */
778 for(size_t j = 0; j < ht; j++)
779 memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
780
/* the module sees a private copy of the piece carrying the tile-sized rois */
781 /* call process() of module */
782 dt_dev_pixelpipe_iop_t piece_tile = *piece;
783 piece_tile.roi_in = iroi;
784 piece_tile.roi_out = oroi;
785 int err = self->process(self, pipe, &piece_tile, input, output);
786 if(err)
787 {
/* NOTE(review): listing lines 788-789 are not visible here; from the visible
   lines alone the tile buffers are not released on this path -- confirm. */
790 mutable_pipe->tiling = 0;
791 return err;
792 }
793
794 /* correct origin and region of tile for overlap.
795 make sure that we only copy back the "good" part. */
796 if(tx > 0)
797 {
798 origin[0] += overlap;
799 region[0] -= overlap;
800 ooffs += (size_t)overlap * out_bpp;
801 }
802 if(ty > 0)
803 {
804 origin[1] += overlap;
805 region[1] -= overlap;
806 ooffs += (size_t)overlap * opitch;
807 }
808
809/* copy "good" part of tile to output buffer */
811 for(size_t j = 0; j < region[1]; j++)
812 memcpy((char *)ovoid + ooffs + j * opitch,
813 (char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (size_t)region[0] * out_bpp);
814 }
815 }
816
/* NOTE(review): listing lines 817-818 are not visible here. */
819 mutable_pipe->tiling = 0;
820 return 0;
821
/* tiling could not be set up: tell the user, then process untiled */
822error:
823 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
824// fall through
825
826fallback:
/* NOTE(review): listing lines 827-828 are not visible here. */
829 mutable_pipe->tiling = 0;
835
836
837
838/* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
839 more tiles and larger overlap */
840static int _default_process_tiling_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
841 const struct dt_dev_pixelpipe_iop_t *piece,
842 const void *const ivoid, void *const ovoid,
843 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
844 const int in_bpp)
845{
846 dt_dev_pixelpipe_t *const mutable_pipe = (dt_dev_pixelpipe_t *)pipe;
847 void *input = NULL;
848 void *output = NULL;
849
850 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] **** tiling module '%s' for image input size %dx%d --> %dx%d\n",
851 self->op, roi_in->width, roi_in->height, roi_out->width, roi_out->height);
852 _print_roi(roi_in, "module roi_in");
853 _print_roi(roi_out, "module roi_out");
854
855 const int out_bpp = piece->dsc_out.bpp;
856
857 const int ipitch = roi_in->width * in_bpp;
858 const int opitch = roi_out->width * out_bpp;
859 const int max_bpp = _max(in_bpp, out_bpp);
860
861 float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
862 / ((float)roi_out->width * roi_out->height)));
863
864 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
865 const int delta = ceilf(fullscale);
866
867 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
868 const int inacc = RESERVE * delta;
869
870 /* get tiling requirements of module */
872 self->tiling_callback(self, pipe, piece, &tiling);
873
874 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
875 */
876 if((tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp))
877 {
878 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] no need to use tiling for module '%s' as no memory saving is expected\n",
879 self->op);
880 goto fallback;
881 }
882
883 /* calculate optimal size of tiles */
884 float available = dt_get_available_mem();
885 assert(available >= 500.0f * 1024.0f * 1024.0f);
886 /* correct for size of ivoid and ovoid which are needed on top of tiling */
887 available = fmaxf(available - ((float)roi_out->width * roi_out->height * out_bpp)
888 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
889 0);
890
891 /* Size the tile from the memory left in the host cache.
892 Using the generic singlebuffer floor here can oversize tiles for modules whose
893 scratch buffers scale with tiling.factor, which defeats tiling and makes the
894 tile-local allocations fail later on. */
895 const float factor = fmaxf(tiling.factor, 1.0f);
896 const float maxbuf = fmaxf(tiling.maxbuf, 1.0f);
897 const float singlebuffer = available / factor;
898
899 int width = _max(roi_in->width, roi_out->width);
900 int height = _max(roi_in->height, roi_out->height);
901
902 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
903 Modules will report alignment requirements via xalign and yalign within tiling_callback().
904 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
905 direction. */
906
907 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
908 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
909
910 assert(xyalign != 0);
911
912 /* shrink tile size in case it would exceed singlebuffer size */
913 if((float)width * height * max_bpp * maxbuf > singlebuffer)
914 {
915 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
916
917 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
918 if(width < height && scale >= 0.333f)
919 {
920 height = _align_down((int)floorf(height * scale), xyalign);
921 }
922 else if(height <= width && scale >= 0.333f)
923 {
924 width = _align_down((int)floorf(width * scale), xyalign);
925 }
926 else
927 {
928 width = _align_down((int)floorf(width * sqrtf(scale)), xyalign);
929 height = _align_down((int)floorf(height * sqrtf(scale)), xyalign);
930 }
931 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_roi] buffer exceeds singlebuffer, corrected to %dx%d\n",
932 width, height);
933 }
934
935 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
936 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
937 {
938 width = height = _align_down((int)floorf(sqrtf((float)width * height)), xyalign);
939 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_roi] use squares because of overlap, corrected to %dx%d\n",
940 width, height);
941 }
942
943 /* make sure that overlap follows alignment rules by making it wider when needed.
944 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
945 const int overlap_in = _align_up(tiling.overlap, xyalign);
946 const int overlap_out = ceilf((float)overlap_in / fullscale);
947
948 int tiles_x = 1, tiles_y = 1;
949
950 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
951 normally it is roi_in > roi_out; but let's be prepared */
952 if(roi_in->width > roi_out->width)
953 tiles_x = width < roi_in->width
954 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
955 : 1;
956 else
957 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
958 : 1;
959
960 if(roi_in->height > roi_out->height)
961 tiles_y = height < roi_in->height
962 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
963 : 1;
964 else
965 tiles_y = height < roi_out->height
966 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
967 : 1;
968
969 /* sanity check: don't run wild on too many tiles */
970 if(tiles_x * tiles_y > _maximum_number_tiles())
971 {
972 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
973 self->op, tiles_x, tiles_y);
974 goto error;
975 }
976
977
978 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
979 values are important for all following processing steps. */
980 const int tile_wd = _align_up(
981 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
982 const int tile_ht = _align_up(
983 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
984
985 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] (%dx%d) tiles with max dimensions %dx%d, good %dx%d, overlap %d->%d\n",
986 tiles_x, tiles_y, width, height, tile_wd, tile_ht, overlap_in, overlap_out);
987
988 /* iterate over tiles */
989 for(size_t tx = 0; tx < tiles_x; tx++)
990 for(size_t ty = 0; ty < tiles_y; ty++)
991 {
992 mutable_pipe->tiling = 1;
993
994 /* the output dimensions of the good part of this specific tile */
995 const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
996 const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
997
998 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
999 iroi_good is calculated by modify_roi_in() of respective module */
1000 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1001 dt_iop_roi_t oroi_good = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1002
1003 dt_dev_pixelpipe_iop_t piece_copy = *piece;
1004 self->modify_roi_in(self, pipe, &piece_copy, &oroi_good, &iroi_good);
1005
1006 /* clamp iroi_good to not exceed roi_in */
1007 iroi_good.x = _max(iroi_good.x, roi_in->x);
1008 iroi_good.y = _max(iroi_good.y, roi_in->y);
1009 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1010 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1011
1012 _print_roi(&iroi_good, "tile iroi_good");
1013 _print_roi(&oroi_good, "tile oroi_good");
1014
1015 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1016 requirements
1017 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1018 -> generates first estimate of iroi_full */
1019 const int x_in = iroi_good.x;
1020 const int y_in = iroi_good.y;
1021 const int width_in = iroi_good.width;
1022 const int height_in = iroi_good.height;
1023 const int new_x_in = _max(_align_close(x_in - overlap_in - delta, xyalign), roi_in->x);
1024 const int new_y_in = _max(_align_close(y_in - overlap_in - delta, xyalign), roi_in->y);
1025 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1026 roi_in->width + roi_in->x - new_x_in);
1027 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1028 roi_in->height + roi_in->y - new_y_in);
1029
1030 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1031 * following iterative search */
1032 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1033 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1034
1035 _print_roi(&iroi_full, "tile iroi_full before optimization");
1036 _print_roi(&oroi_full, "tile oroi_full before optimization");
1037
1038 /* try to find a matching oroi_full */
1039 if(!_fit_output_to_input_roi(self, pipe, piece, &iroi_full, &oroi_full, delta, 10))
1040 {
1041 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] can not handle requested roi's. tiling for "
1042 "module '%s' not possible.\n",
1043 self->op);
1044 goto error;
1045 }
1046
1047 _print_roi(&iroi_full, "tile iroi_full after optimization");
1048 _print_roi(&oroi_full, "tile oroi_full after optimization");
1049
1050 /* make sure that oroi_full at least covers the range of oroi_good.
1051 this step is needed due to the possibility of rounding errors */
1052 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1053 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1054 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1055 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1056
1057 /* clamp oroi_full to not exceed roi_out */
1058 oroi_full.x = _max(oroi_full.x, roi_out->x);
1059 oroi_full.y = _max(oroi_full.y, roi_out->y);
1060 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1061 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1062
1063 /* calculate final iroi_full */
1064 dt_dev_pixelpipe_iop_t piece_full = *piece;
1065 self->modify_roi_in(self, pipe, &piece_full, &oroi_full, &iroi_full);
1066
1067 /* clamp iroi_full to not exceed roi_in */
1068 iroi_full.x = _max(iroi_full.x, roi_in->x);
1069 iroi_full.y = _max(iroi_full.y, roi_in->y);
1070 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1071 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1072
1073 _print_roi(&iroi_full, "tile iroi_full final");
1074 _print_roi(&oroi_full, "tile oroi_full final");
1075
1076 /* offsets of tile into ivoid and ovoid */
1077 const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1078 size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1079
1080 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] process tile (%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT ") size %dx%d at origin [%d,%d]\n",
1081 tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1082
1083 /* prepare input tile buffer */
1085 (size_t)iroi_full.width * iroi_full.height * in_bpp,
1086 pipe->type);
1087 if(IS_NULL_PTR(input))
1088 {
1089 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] could not alloc input buffer for module '%s'\n",
1090 self->op);
1091 goto error;
1092 }
1094 (size_t)oroi_full.width * oroi_full.height * out_bpp,
1095 pipe->type);
1096 if(IS_NULL_PTR(output))
1097 {
1098 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] could not alloc output buffer for module '%s'\n",
1099 self->op);
1100 goto error;
1101 }
1103 for(size_t j = 0; j < iroi_full.height; j++)
1104 memcpy((char *)input + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1105 (size_t)iroi_full.width * in_bpp);
1106
1107 /* call process() of module */
1108 dt_dev_pixelpipe_iop_t piece_tile = *piece;
1109 piece_tile.roi_in = iroi_full;
1110 piece_tile.roi_out = oroi_full;
1111 int err = self->process(self, pipe, &piece_tile, input, output);
1112 if(err)
1113 {
1116 mutable_pipe->tiling = 0;
1117 return err;
1118 }
1119
1120 /* copy "good" part of tile to output buffer */
1121 const int origin_x = oroi_good.x - oroi_full.x;
1122 const int origin_y = oroi_good.y - oroi_full.y;
1124 for(size_t j = 0; j < oroi_good.height; j++)
1125 memcpy((char *)ovoid + ooffs + j * opitch,
1126 (char *)output + ((j + origin_y) * oroi_full.width + origin_x) * out_bpp,
1127 (size_t)oroi_good.width * out_bpp);
1128
1131 input = output = NULL;
1132 }
1133
1136 mutable_pipe->tiling = 0;
1137 return 0;
1138
1139error:
1140 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
1141// fall through
1142
1143fallback:
1146 mutable_pipe->tiling = 0;
1147 dt_print(DT_DEBUG_TILING, "[default_process_tiling_roi] fall back to standard processing for module '%s'\n",
1148 self->op);
1149 int err = self->process(self, pipe, piece, ivoid, ovoid);
1150 return err;
1151}
1152
1153
1154
1155/* if a module does not implement process_tiling() by itself, this function is called instead.
1156 _default_process_tiling_ptp() is able to handle standard cases where pixels do not change their places.
1157 _default_process_tiling_roi() takes care of all other cases where image gets distorted and for module
1158 "clipping",
1159 "flip" as this may flip or mirror the image. */
1160int default_process_tiling(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
1161 const struct dt_dev_pixelpipe_iop_t *piece,
1162 const void *const ivoid, void *const ovoid, const int in_bpp)
1163{
1164 const dt_iop_roi_t *const roi_in = &piece->roi_in;
1165 const dt_iop_roi_t *const roi_out = &piece->roi_out;
1166 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1167 return _default_process_tiling_roi(self, pipe, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1168 else
1169 return _default_process_tiling_ptp(self, pipe, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1170}
1171
1172
1173
1174#ifdef HAVE_OPENCL
1175/* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
/* Overview of the steps performed below:
   - query the module's tiling requirements through self->tiling_callback();
   - derive a maximum tile width/height from the per-buffer budget `singlebuffer`, the module
     alignment (xalign/yalign, plus CL_ALIGNMENT for the width) and the rounded-up device image size;
   - for every tile: allocate device input/output buffers, upload the tile from ivoid, run
     self->process_cl() with roi_in/roi_out narrowed to the tile, then read the tile back into
     ovoid without its leading (left/top) overlap.
   Returns TRUE once all tiles are processed. Returns FALSE when the tile count exceeds
   _maximum_number_tiles() or when an allocation, a transfer or process_cl() fails.
   pipe->tiling is set to 1 at the start of each tile and reset to 0 on the regular and on the
   error exit.
   NOTE(review): this listing has gaps (the numbering jumps, e.g. 1197 -> 1199, 1204 -> 1206 -> 1209).
   The declarations of `tiling`, `width` and `height`, the buffer release calls and the start of
   the final error print are among the lines that are not shown here. */
1176static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
1177 const struct dt_dev_pixelpipe_iop_t *piece,
1178 const void *const ivoid, void *const ovoid,
1179 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1180 const int in_bpp)
1181{
/* pipe is received as const; the cast is only used to toggle pipe->tiling below */
1182 dt_dev_pixelpipe_t *const mutable_pipe = (dt_dev_pixelpipe_t *)pipe;
/* initial placeholder; in the visible code err is only overwritten by the two blocking transfers */
1183 cl_int err = -999;
1184 cl_mem input = NULL;
1185 cl_mem output = NULL;
1186
1187 dt_print(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] **** tiling module '%s' for image with size %dx%d --> %dx%d\n",
1188 self->op, roi_in->width, roi_in->height, roi_out->width, roi_out->height);
1189
1190 const int out_bpp = piece->dsc_out.bpp;
1191
1192 const int devid = pipe->devid;
/* row strides of the full host buffers: width times bpp, used as byte offsets into ivoid/ovoid */
1193 const int ipitch = roi_in->width * in_bpp;
1194 const int opitch = roi_out->width * out_bpp;
1195 const int max_bpp = _max(in_bpp, out_bpp);
1196
1197 /* get tiling requirements of module */
1199 self->tiling_callback(self, pipe, piece, &tiling);
1200
1201 // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1202 const float available = (float)dt_opencl_get_device_available(devid);
1203 const float factor = fmaxf(tiling.factor_cl, 1.0f);
1204 const float singlebuffer = fminf(fmaxf((available - tiling.overhead) / factor, 0.0f),
1206 const float maxbuf = fmaxf(tiling.maxbuf_cl, 1.0f);
1209
1210 /* shrink tile size in case it would exceed singlebuffer size */
1211 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1212 {
1213 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1214
/* if the required reduction is at most about a factor of 3, shrink only the longer dimension;
   otherwise shrink both dimensions by sqrt(scale) */
1215 if(width < height && scale >= 0.333f)
1216 {
1217 height = floorf(height * scale);
1218 }
1219 else if(height <= width && scale >= 0.333f)
1220 {
1221 width = floorf(width * scale);
1222 }
1223 else
1224 {
1225 width = floorf(width * sqrtf(scale));
1226 height = floorf(height * sqrtf(scale));
1227 }
1228 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] buffer exceeds singlebuffer, corrected to %dx%d\n",
1229 width, height);
1230 }
1231
1232 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1233 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1234 {
1235 width = height = floorf(sqrtf((float)width * height));
1236 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] use squares because of overlap, corrected to %dx%d\n",
1237 width, height);
1238 }
1239
1240 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1241 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1242 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1243 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT.
1244 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
1245 that is identical to image width/height no special alignment is done. */
1246
1247 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1248 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1249
1250 /* determining alignment requirement for tile width/height.
1251 in case of tile width also align according to definition of CL_ALIGNMENT */
1252 const unsigned int walign = _lcm(xyalign, CL_ALIGNMENT);
1253 const unsigned int halign = xyalign;
1254
1255 assert(xyalign != 0 && walign != 0 && halign != 0);
1256
1257 /* properly align tile width and height by making them smaller if needed */
1258 if(width < roi_in->width) width = (width / walign) * walign;
1259 if(height < roi_in->height) height = (height / halign) * halign;
1260
1261 /* OpenCL image allocations are backed by device-specific row/height strides.
1262 The generic full-frame pre-check already reasons on rounded dimensions, so
1263 tiling needs to use the same planning rule or it may pick a tile that fits
1264 mathematically in width*height*bpp but still fails once the driver rounds it
1265 up internally. Shrink the candidate tile until the rounded image footprint
1266 fits the per-buffer budget. */
1267 while((float)ROUNDUPDWD(width, devid) * ROUNDUPDHT(height, devid) * max_bpp * maxbuf > singlebuffer)
1268 {
/* stop once both dimensions are down to one alignment step; otherwise reduce the larger
   dimension by one alignment step per iteration */
1269 if(width <= (int)walign && height <= (int)halign) break;
1270 if(width < height && height > (int)halign)
1271 height -= halign;
1272 else if(width > (int)walign)
1273 width -= walign;
1274 else
1275 height -= halign;
1276 }
1277
1278 /* also make sure that overlap follows alignment rules by making it wider when needed */
1279 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
1280 : tiling.overlap;
1281
1282
1283 /* calculate effective tile size */
/* the step between tile origins: the tile size minus an overlap on both sides, at least 1 */
1284 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
1285 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
1286
1287
1288 /* calculate number of tiles */
1289 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
1290 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
1291
1292 /* sanity check: don't run wild on too many tiles */
/* pipe->tiling has not been touched yet at this point, so nothing needs to be reset here */
1293 if(tiles_x * tiles_y > _maximum_number_tiles())
1294 {
1295 dt_print(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1296 self->op, tiles_x, tiles_y);
1297 return FALSE;
1298 }
1299
1300 dt_print(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] (%dx%d) tiles with max dimensions %dx%d, good %dx%d and overlap %d\n",
1301 tiles_x, tiles_y, width, height, tile_wd, tile_ht, overlap);
1302
1303 /* iterate over tiles */
1304 for(size_t tx = 0; tx < tiles_x; tx++)
1305 for(size_t ty = 0; ty < tiles_y; ty++)
1306 {
1307 mutable_pipe->tiling = 1;
1308
/* actual size of this tile: the full tile size, cut down where it would reach past the image */
1309 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
1310 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
1311
1312 /* no need to process (end)tiles that are smaller than the total overlap area */
1313 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
1314
1315 /* origin and region of effective part of tile, which we want to store later */
1316 size_t origin[] = { 0, 0, 0 };
1317 size_t region[] = { wd, ht, 1 };
1318
1319 /* roi_in and roi_out for process_cl on subbuffer */
1320 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1321 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1322
1323
1324 /* offsets of tile into ivoid and ovoid */
1325 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1326 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1327
1328
1329 dt_print(DT_DEBUG_TILING, "[default_process_tiling_cl_ptp] tile (%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT ") size %" G_GSIZE_FORMAT "x%" G_GSIZE_FORMAT " at origin [%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT "]\n",
1330 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
1331
1332 /* get input and output buffers */
1333 input = dt_opencl_alloc_device(devid, wd, ht, in_bpp);
1334 if(IS_NULL_PTR(input)) goto error;
1335 output = dt_opencl_alloc_device(devid, wd, ht, out_bpp);
1336 if(IS_NULL_PTR(output)) goto error;
1337
1338 /* blocking direct memory transfer: host input image -> opencl/device tile */
1339 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, origin, region, ipitch,
1340 CL_TRUE);
1341 if(err != CL_SUCCESS) goto error;
1342
1343 /* call process_cl of module */
/* the module works on a private copy of the piece whose ROIs describe this tile only */
1344 dt_dev_pixelpipe_iop_t piece_tile = *piece;
1345 piece_tile.roi_in = iroi;
1346 piece_tile.roi_out = oroi;
1347 if(!self->process_cl(self, pipe, &piece_tile, input, output)) goto error;
1348
1349 /* correct origin and region of tile for overlap.
1350 makes sure that we only copy back the "good" part. */
/* only the leading (left/top) overlap is trimmed here.
   NOTE(review): the trailing overlap is presumably overwritten by the following tiles - confirm */
1351 if(tx > 0)
1352 {
1353 origin[0] += overlap;
1354 region[0] -= overlap;
1355 ooffs += (size_t)overlap * out_bpp;
1356 }
1357 if(ty > 0)
1358 {
1359 origin[1] += overlap;
1360 region[1] -= overlap;
1361 ooffs += (size_t)overlap * opitch;
1362 }
1363
1364 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1365 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, origin, region,
1366 opitch, CL_TRUE);
1367 if(err != CL_SUCCESS) goto error;
1368
1369 /* release input and output buffers */
/* NOTE(review): the release calls themselves (listing lines 1370 and 1372) are not visible here */
1371 input = NULL;
1373 output = NULL;
1374
1375 /* block until opencl queue has finished to free all used event handlers */
1377 }
1378
1381 mutable_pipe->tiling = 0;
1382 return TRUE;
1383
/* common failure exit: reset the tiling flag and report the last value of err.
   NOTE(review): the log prefix below says "opencl_ptp" while every other message of this function
   uses "cl_ptp"; any buffer cleanup on this path is in lines that are not visible in this listing */
1384error:
1387 mutable_pipe->tiling = 0;
1389 "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %i\n",
1390 self->op, err);
1391 return FALSE;
1392}
1393
1394
1395/* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
1396 more tiles and larger overlap */
/* Overview of the steps performed below:
   - query the module's tiling requirements through self->tiling_callback();
   - derive a maximum tile width/height from the per-buffer budget `singlebuffer`, a single
     alignment value (lcm of xalign, yalign and CL_ALIGNMENT) and the rounded-up device image size;
   - split roi_out into a grid of "good" output tiles; for each one compute the matching input
     region with self->modify_roi_in(), grow it by overlap and delta, and search a matching full
     output region with _fit_output_to_input_roi();
   - upload the full input tile, run self->process_cl() on it and read only the good part of the
     output tile back into ovoid.
   Returns TRUE once all tiles are processed. Returns FALSE when the tile count exceeds
   _maximum_number_tiles(), when no matching output region is found, or when an allocation,
   a transfer or process_cl() fails.
   pipe->tiling is set to 1 at the start of each tile and reset to 0 on the regular and on the
   error exit.
   NOTE(review): this listing has gaps (the numbering jumps, e.g. 1407 -> 1409, 1430 -> 1432).
   The declaration of `tiling`, the start of several print calls and the buffer release calls are
   among the lines that are not shown here. */
1397static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
1398 const struct dt_dev_pixelpipe_iop_t *piece,
1399 const void *const ivoid, void *const ovoid,
1400 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1401 const int in_bpp)
1402{
/* pipe is received as const; the cast is only used to toggle pipe->tiling below */
1403 dt_dev_pixelpipe_t *const mutable_pipe = (dt_dev_pixelpipe_t *)pipe;
/* initial placeholder; in the visible code err is only overwritten by the two blocking transfers */
1404 cl_int err = -999;
1405 cl_mem input = NULL;
1406 cl_mem output = NULL;
1407
1409 "[default_process_tiling_cl_roi] **** tiling module '%s' for image with input size %dx%d --> %dx%d\n",
1410 self->op, roi_in->width, roi_in->height, roi_out->width, roi_out->height);
1411 _print_roi(roi_in, "module roi_in");
1412 _print_roi(roi_out, "module roi_out");
1413
1414 const int out_bpp = piece->dsc_out.bpp;
1415
1416 const int devid = pipe->devid;
/* row strides of the full host buffers: width times bpp, used as byte offsets into ivoid/ovoid */
1417 const int ipitch = roi_in->width * in_bpp;
1418 const int opitch = roi_out->width * out_bpp;
1419 const int max_bpp = _max(in_bpp, out_bpp);
1420
/* input-to-output size ratio: the larger of the ROI scale ratio and the square root of the
   pixel-count ratio */
1421 const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
1422 / ((float)roi_out->width * roi_out->height)));
1423
1424 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
1425 const int delta = ceilf(fullscale);
1426
1427 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
1428 const int inacc = RESERVE * delta;
1429
1430 /* get tiling requirements of module */
1432 self->tiling_callback(self, pipe, piece, &tiling);
1433
1434 // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1435 const float available = (float)dt_opencl_get_device_available(devid);
1436 const float factor = fmaxf(tiling.factor_cl, 1.0f);
1437 const float singlebuffer = fminf(fmaxf((available - tiling.overhead) / factor, 0.0f),
1439 const float maxbuf = fmaxf(tiling.maxbuf_cl, 1.0f);
1440
/* start from the larger of the input/output dimensions, limited to the device's maximum image size */
1441 int width = _min(_max(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
1442 int height = _min(_max(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
1443
1444 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1445 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1446 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1447 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT. */
1448
1449 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1450 unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1451 xyalign = _lcm(xyalign, CL_ALIGNMENT);
1452
1453 assert(xyalign != 0);
1454
1455 /* shrink tile size in case it would exceed singlebuffer size */
1456 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1457 {
1458 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1459
/* if the required reduction is at most about a factor of 3, shrink only the longer dimension;
   otherwise shrink both dimensions by sqrt(scale). Results are aligned down to xyalign. */
1460 if(width < height && scale >= 0.333f)
1461 {
1462 height = _align_down((int)floorf(height * scale), xyalign);
1463 }
1464 else if(height <= width && scale >= 0.333f)
1465 {
1466 width = _align_down((int)floorf(width * scale), xyalign);
1467 }
1468 else
1469 {
1470 width = _align_down((int)floorf(width * sqrtf(scale)), xyalign);
1471 height = _align_down((int)floorf(height * sqrtf(scale)), xyalign);
1472 }
1473 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_cl_roi] buffer exceeds singlebuffer, corrected to %dx%d\n",
1474 width, height);
1475 }
1476
1477 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1478 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1479 {
1480 width = height = _align_down((int)floorf(sqrtf((float)width * height)), xyalign);
1481 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_cl_roi] use squares because of overlap, corrected to %dx%d\n",
1482 width, height);
1483 }
1484
1485 /* make sure that overlap follows alignment rules by making it wider when needed.
1486 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
1487 const int overlap_in = _align_up(tiling.overlap, xyalign);
1488 const int overlap_out = ceilf((float)overlap_in / fullscale);
1489
1490 /* As in the pixel-perfect tiler above, keep planning conservative with the
1491 same rounded OpenCL dimensions used by the non-tiling GPU fit checks. The
1492 ROI path can otherwise accept a tile whose raw area fits `singlebuffer`
1493 even though the driver-backed image allocation for the tile does not. */
1494 while((float)ROUNDUPDWD(width, devid) * ROUNDUPDHT(height, devid) * max_bpp * maxbuf > singlebuffer)
1495 {
/* stop once both dimensions are down to one alignment step; otherwise reduce the larger
   dimension by one alignment step per iteration */
1496 if(width <= (int)xyalign && height <= (int)xyalign) break;
1497 if(width < height && height > (int)xyalign)
1498 height -= xyalign;
1499 else if(width > (int)xyalign)
1500 width -= xyalign;
1501 else
1502 height -= xyalign;
1503 }
1504
1505 int tiles_x = 1, tiles_y = 1;
1506
1507 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
1508 normally it is roi_in > roi_out; but let's be prepared */
1509 if(roi_in->width > roi_out->width)
1510 tiles_x = width < roi_in->width
1511 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
1512 : 1;
1513 else
1514 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
1515 : 1;
1516
1517 if(roi_in->height > roi_out->height)
1518 tiles_y = height < roi_in->height
1519 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
1520 : 1;
1521 else
1522 tiles_y = height < roi_out->height
1523 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
1524 : 1;
1525
1526 /* sanity check: don't run wild on too many tiles */
/* pipe->tiling has not been touched yet at this point, so nothing needs to be reset here */
1527 if(tiles_x * tiles_y > _maximum_number_tiles())
1528 {
1530 "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %dx%d\n",
1531 self->op, tiles_x, tiles_y);
1532 return FALSE;
1533 }
1534
1535 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
1536 important for all following processing steps. */
/* ceiling division of the output size by the tile count, then aligned up to xyalign */
1537 const int tile_wd = _align_up(
1538 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
1539 const int tile_ht = _align_up(
1540 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
1541
1543 "[default_process_tiling_cl_roi] (%dx%d) tiles with max input dimensions %dx%d, good %ix%i\n",
1544 tiles_x, tiles_y, width, height, tile_wd, tile_ht);
1545
1546 /* iterate over tiles */
1547 for(size_t tx = 0; tx < tiles_x; tx++)
1548 for(size_t ty = 0; ty < tiles_y; ty++)
1549 {
1550 mutable_pipe->tiling = 1;
1551
1552 /* the output dimensions of the good part of this specific tile */
1553 const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
1554 const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
1555
1556 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1557 iroi_good is calculated by modify_roi_in() of respective module */
1558 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1559 dt_iop_roi_t oroi_good = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1560
/* modify_roi_in() is called with a private copy of the piece, so the caller's piece is not modified */
1561 dt_dev_pixelpipe_iop_t piece_copy = *piece;
1562 self->modify_roi_in(self, pipe, &piece_copy, &oroi_good, &iroi_good);
1563
1564 /* clamp iroi_good to not exceed roi_in */
1565 iroi_good.x = _max(iroi_good.x, roi_in->x);
1566 iroi_good.y = _max(iroi_good.y, roi_in->y);
1567 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1568 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1569
1570 _print_roi(&iroi_good, "tile iroi_good");
1571 _print_roi(&oroi_good, "tile oroi_good");
1572
1573 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1574 requirements
1575 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1576 -> generates first estimate of iroi_full */
1577 const int x_in = iroi_good.x;
1578 const int y_in = iroi_good.y;
1579 const int width_in = iroi_good.width;
1580 const int height_in = iroi_good.height;
1581 const int new_x_in = _max(_align_close(x_in - overlap_in - delta, xyalign), roi_in->x);
1582 const int new_y_in = _max(_align_close(y_in - overlap_in - delta, xyalign), roi_in->y);
1583 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1584 roi_in->width + roi_in->x - new_x_in);
1585 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1586 roi_in->height + roi_in->y - new_y_in);
1587
1588 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1589 * following iterative search */
1590 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1591 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1592
1593 _print_roi(&iroi_full, "tile iroi_full before optimization");
1594 _print_roi(&oroi_full, "tile oroi_full before optimization");
1595
1596 /* try to find a matching oroi_full */
1597 if(!_fit_output_to_input_roi(self, pipe, piece, &iroi_full, &oroi_full, delta, 10))
1598 {
1599 dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING, "[default_process_tiling_cl_roi] can not handle requested roi's tiling "
1600 "for module '%s' not possible.\n",
1601 self->op);
1602 goto error;
1603 }
1604
1605
1606 /* make sure that oroi_full at least covers the range of oroi_good.
1607 this step is needed due to the possibility of rounding errors */
1608 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1609 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1610 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1611 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1612
1613 /* clamp oroi_full to not exceed roi_out */
1614 oroi_full.x = _max(oroi_full.x, roi_out->x);
1615 oroi_full.y = _max(oroi_full.y, roi_out->y);
1616 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1617 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1618
1619
1620 /* calculate final iroi_full */
1621 dt_dev_pixelpipe_iop_t piece_full = *piece;
1622 self->modify_roi_in(self, pipe, &piece_full, &oroi_full, &iroi_full);
1623
1624 /* clamp iroi_full to not exceed roi_in */
1625 iroi_full.x = _max(iroi_full.x, roi_in->x);
1626 iroi_full.y = _max(iroi_full.y, roi_in->y);
1627 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1628 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1629
1630 _print_roi(&iroi_full, "tile iroi_full");
1631 _print_roi(&oroi_full, "tile oroi_full");
1632
1633 /* offsets of tile into ivoid and ovoid */
1634 const int in_dx = iroi_full.x - roi_in->x;
1635 const int in_dy = iroi_full.y - roi_in->y;
1636 const int out_dx = oroi_good.x - roi_out->x;
1637 const int out_dy = oroi_good.y - roi_out->y;
/* NOTE(review): the products in_dy * ipitch and out_dy * opitch are evaluated in int and only
   the result is cast to size_t, so they can overflow int when the host buffer is larger than
   INT_MAX bytes. The CPU ROI tiler casts the row offset to size_t before multiplying; consider
   doing the same here, e.g. (size_t)in_dy * ipitch + (size_t)in_dx * in_bpp. */
1638 const size_t ioffs = (size_t)(in_dy * ipitch) + (size_t)(in_dx * in_bpp);
1639 const size_t ooffs = (size_t)(out_dy * opitch) + (size_t)(out_dx * out_bpp);
1640
1641 /* origin and region of full input tile */
1642 size_t iorigin[] = { 0, 0, 0 };
1643 size_t iregion[] = { iroi_full.width, iroi_full.height, 1 };
1644
1645 /* origin and region of good part of output tile */
1646 size_t oorigin[] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y, 0 };
1647 size_t oregion[] = { oroi_good.width, oroi_good.height, 1 };
1648
1649 dt_print(DT_DEBUG_TILING, "[default_process_tiling_cl_roi] process tile (%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT ") size %dx%d at origin [%d,%d]\n",
1650 tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1651 dt_vprint(DT_DEBUG_TILING, "[default_process_tiling_cl_roi] dest [%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT "] at [%" G_GSIZE_FORMAT ",%" G_GSIZE_FORMAT "], offsets [%i,%i] -> [%i,%i], delta=%i\n\n",
1652 oregion[0], oregion[1], oorigin[0], oorigin[1], in_dx, in_dy, out_dx, out_dy, delta);
1653
1654 /* get opencl input and output buffers */
1655 input = dt_opencl_alloc_device(devid, iroi_full.width, iroi_full.height, in_bpp);
1656 if(IS_NULL_PTR(input)) goto error;
1657
1658 output = dt_opencl_alloc_device(devid, oroi_full.width, oroi_full.height, out_bpp);
1659 if(IS_NULL_PTR(output)) goto error;
1660
1661 /* blocking direct memory transfer: host input image -> opencl/device tile */
1662 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, iorigin, iregion,
1663 ipitch, CL_TRUE);
1664 if(err != CL_SUCCESS) goto error;
1665
1666 /* call process_cl of module */
/* the module works on a private copy of the piece whose ROIs describe the full tile */
1667 dt_dev_pixelpipe_iop_t piece_tile = *piece;
1668 piece_tile.roi_in = iroi_full;
1669 piece_tile.roi_out = oroi_full;
1670 if(!self->process_cl(self, pipe, &piece_tile, input, output)) goto error;
1671
1672 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1673 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, oorigin, oregion,
1674 opitch, CL_TRUE);
1675 if(err != CL_SUCCESS) goto error;
1676
1677 /* release input and output buffers */
/* NOTE(review): the release calls themselves (listing lines 1678 and 1680) are not visible here */
1679 input = NULL;
1681 output = NULL;
1682
1683 /* block until opencl queue has finished to free all used event handlers */
1685 }
1686
1689 mutable_pipe->tiling = 0;
1690 return TRUE;
1691
/* common failure exit: reset the tiling flag and report the last value of err.
   NOTE(review): the log prefix below says "opencl_roi" while every other message of this function
   uses "cl_roi"; any buffer cleanup on this path is in lines that are not visible in this listing */
1692error:
1695 mutable_pipe->tiling = 0;
1697 "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %i\n",
1698 self->op, err);
1699 return FALSE;
1700}
1701
1702
1703
1704/* if a module does not implement process_tiling_cl() by itself, this function is called instead.
1705 _default_process_tiling_cl_ptp() is able to handle standard cases where pixels do not change their places.
1706 _default_process_tiling_cl_roi() takes care of all other cases where image gets distorted. */
/* The ROI tiler is also chosen when the module sets IOP_FLAGS_TILING_FULL_ROI, even if the input
   and output regions are identical. The return value of the selected tiler is passed through.
   NOTE(review): the first line of the signature (listing line 1707) is not visible in this listing;
   it presumably matches the non-OpenCL variant of default_process_tiling_cl() - confirm. */
1708 const struct dt_dev_pixelpipe_iop_t *piece,
1709 const void *const ivoid, void *const ovoid, const int in_bpp)
1710{
1711 const dt_iop_roi_t *const roi_in = &piece->roi_in;
1712 const dt_iop_roi_t *const roi_out = &piece->roi_out;
/* byte-wise comparison of the two regions; flags() is only queried when they are identical */
1713 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1714 return _default_process_tiling_cl_roi(self, pipe, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1715 else
1716 return _default_process_tiling_cl_ptp(self, pipe, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1717}
1718
1719#else
1720int default_process_tiling_cl(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
1721 const struct dt_dev_pixelpipe_iop_t *piece,
1722 const void *const ivoid, void *const ovoid, const int in_bpp)
1723{
1724 (void)pipe;
1725 return FALSE;
1726}
1727#endif
1728
1729
1730/* If a module does not implement tiling_callback() by itself, this function is called instead.
1731 Default is an image size factor of 1 + (output area / input area), i.e. 2 when roi_in and roi_out
1732 have the same size (input + output buffer needed), no overhead (1), no overlap between tiles, and a
1733 pixel alignment of 1 in x and y direction, i.e. no special alignment required. Simple pixel to
1734 pixel modules (take tonecurve as an example) can happily live with that.
1735 (1) Small overhead like look-up-tables in tonecurve can be ignored safely. */
/* Two adjustments are made to these defaults:
   - modules that set IOP_FLAGS_TILING_FULL_ROI get an overlap of 4;
   - modules whose iop_order is not above that of "demosaic" and whose input descriptor reports a
     filter pattern get an x/y alignment of 3 (filters == 9u, X-Trans) or 2 (any other pattern).
   The same values are used for the CPU and the OpenCL fields (factor/factor_cl, maxbuf/maxbuf_cl).
   NOTE(review): the last line of the parameter list (listing line 1738, which declares `tiling`)
   is not visible in this listing. */
1736void default_tiling_callback(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe,
1737 const struct dt_dev_pixelpipe_iop_t *piece,
1739{
1740 const dt_iop_roi_t *const roi_in = &piece->roi_in;
1741 const dt_iop_roi_t *const roi_out = &piece->roi_out;
/* ratio of output pixel count to input pixel count.
   NOTE(review): an empty roi_in (zero width or height) makes this a division by zero in float
   arithmetic - confirm that callers never pass such a region */
1742 const float ioratio
1743 = ((float)roi_out->width * (float)roi_out->height) / ((float)roi_in->width * (float)roi_in->height);
1744
/* one input buffer plus an output buffer of relative size ioratio */
1745 tiling->factor = 1.0f + ioratio;
1746 tiling->factor_cl = tiling->factor;
1747 tiling->maxbuf = 1.0f;
1748 tiling->maxbuf_cl = tiling->maxbuf;
1749 tiling->overhead = 0;
1750 tiling->overlap = 0;
1751 tiling->xalign = 1;
1752 tiling->yalign = 1;
1753
1754 if((self->flags() & IOP_FLAGS_TILING_FULL_ROI) == IOP_FLAGS_TILING_FULL_ROI) tiling->overlap = 4;
1755
/* modules ordered after demosaic keep the default alignment of 1 */
1756 if(self->iop_order > dt_ioppr_get_iop_order(pipe->iop_order_list, "demosaic", 0)) return;
1757
1758 // all operations that work with mosaiced data should respect pattern size!
1759
/* no filter pattern on the input: keep the default alignment of 1 */
1760 if(!piece->dsc_in.filters) return;
1761
1762 if(piece->dsc_in.filters == 9u)
1763 {
1764 // X-Trans, sensor is 6x6 but algorithms have been corrected to work with 3x3
1765 tiling->xalign = 3;
1766 tiling->yalign = 3;
1767 }
1768 else
1769 {
1770 // Bayer, good old 2x2
1771 tiling->xalign = 2;
1772 tiling->yalign = 2;
1773 }
1774
1775 return;
1776}
1777
/* Check whether a buffer set of the given dimensions fits into the available host memory.
   The requirement is estimated as factor * width * height * bpp + overhead and compared with
   dt_get_available_mem(). While the estimate does not fit and no error is flagged, the loop
   re-reads the available memory.
   Returns TRUE when the estimate is less than or equal to the available memory, FALSE otherwise.
   NOTE(review): the statement inside the loop that is expected to free cache memory and to update
   `error` (listing line 1788) is not visible in this listing. */
1778int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp,
1779 const float factor, const size_t overhead)
1780{
1781 size_t available = dt_get_available_mem();
/* factor is a float, so the whole expression is evaluated in float before the conversion to
   size_t; the result is therefore an estimate with float precision, not an exact byte count */
1782 const size_t total = factor * width * height * bpp + overhead;
1783
1784 // Try to make room in cache first
1785 int error = 0;
1786 while(!error && available < total)
1787 {
1789 available = dt_get_available_mem();
1790 }
1791
1792 if(total <= available)
1793 return TRUE;
1794 else
1795 return FALSE;
1796}
1797
1798// clang-format off
1799// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
1800// vim: shiftwidth=2 expandtab tabstop=2 cindent
1801// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
1802// clang-format on
static void error(char *msg)
Definition ashift_lsd.c:202
#define TRUE
Definition ashift_lsd.c:162
#define FALSE
Definition ashift_lsd.c:158
#define m
Definition basecurve.c:278
int width
Definition bilateral.h:1
int height
Definition bilateral.h:1
const dt_aligned_pixel_t f
static const float const float const float min
static const int row
const float delta
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
void dt_control_log(const char *msg,...)
Definition control.c:761
#define EPSILON
Definition curve_tools.c:41
void dt_vprint(dt_debug_thread_t thread, const char *msg,...)
Definition darktable.c:1567
darktable_t darktable
Definition darktable.c:181
size_t dt_get_available_mem()
Definition darktable.c:1682
void dt_print(dt_debug_thread_t thread, const char *msg,...)
Definition darktable.c:1542
@ DT_DEBUG_OPENCL
Definition darktable.h:722
@ DT_DEBUG_VERBOSE
Definition darktable.h:743
@ DT_DEBUG_TILING
Definition darktable.h:739
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
Definition darktable.h:433
#define dt_free(ptr)
Definition darktable.h:456
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453
#define __OMP_PARALLEL_FOR__(...)
Definition darktable.h:258
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281
int bpp
@ IOP_FLAGS_TILING_FULL_ROI
Definition imageop.h:171
void *const ovoid
int dt_ioppr_get_iop_order(GList *iop_order_list, const char *op_name, const int multi_priority)
Return the iop_order for a given operation/instance pair.
Definition iop_order.c:868
static const float x
const int t
const float v
float *const restrict const size_t k
cl_ulong dt_opencl_get_device_available(const int devid)
Definition opencl.c:2664
void * dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp)
Definition opencl.c:2471
int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin, const size_t *region, const int rowpitch, const int blocking)
Definition opencl.c:2204
cl_ulong dt_opencl_get_device_memalloc(const int devid)
Definition opencl.c:2677
gboolean dt_opencl_finish(const int devid)
Definition opencl.c:1347
int dt_opencl_write_host_to_device_raw(const int devid, const void *host, void *device, const size_t *origin, const size_t *region, const int rowpitch, const int blocking)
Definition opencl.c:2249
void dt_opencl_release_mem_object(cl_mem mem)
Definition opencl.c:2383
#define ROUNDUPDHT(a, b)
Definition opencl.h:82
#define ROUNDUPDWD(a, b)
Definition opencl.h:81
const float factor
Definition pdf.h:90
int dt_dev_pixel_pipe_cache_remove_lru(dt_dev_pixelpipe_cache_t *cache)
struct dt_dev_pixelpipe_cache_t * pixelpipe_cache
Definition darktable.h:790
struct dt_opencl_t * opencl
Definition darktable.h:785
int32_t unmuted
Definition darktable.h:760
dt_iop_buffer_dsc_t dsc_out
dt_iop_buffer_dsc_t dsc_in
dt_dev_pixelpipe_type_t type
uint32_t filters
Definition format.h:60
GModule *dt_dev_operation_t op
Definition imageop.h:256
Region of interest passed through the pixelpipe.
Definition imageop.h:72
double scale
Definition imageop.h:74
size_t max_image_width
Definition opencl.h:128
size_t max_image_height
Definition opencl.h:129
dt_opencl_device_t * dev
Definition opencl.h:246
typedef double((*spd)(unsigned long int wavelength, double TempK))
#define MIN(a, b)
Definition thinplate.c:32
#define MAX(a, b)
Definition thinplate.c:29
int default_process_tiling(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const int in_bpp)
Definition tiling.c:1160
static int _fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
Definition tiling.c:565
#define BETA
Definition tiling.c:205
static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
Definition tiling.c:538
static double _nm_fitness(double x[], void *rest[])
Definition tiling.c:141
void default_tiling_callback(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, struct dt_develop_tiling_t *tiling)
Definition tiling.c:1736
static int _default_process_tiling_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
Definition tiling.c:840
static int _align_up(int n, int a)
Definition tiling.c:93
static int _default_process_tiling_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
Definition tiling.c:609
static void _print_roi(const dt_iop_roi_t *roi, const char *label)
Definition tiling.c:116
#define RESERVE
Definition tiling.c:60
int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp, const float factor, const size_t overhead)
Definition tiling.c:1778
static int _max(int a, int b)
Definition tiling.c:87
#define GAMMA
Definition tiling.c:206
static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
Definition tiling.c:1176
static int _simplex(double(*objfunc)(double[], void *[]), double start[], int n, double EPSILON, double scale, int maxiter, void(*constrain)(double[], int n), void *rest[])
Definition tiling.c:208
static int _align_close(int n, int a)
Definition tiling.c:101
static unsigned _lcm(unsigned a, unsigned b)
Definition tiling.c:76
static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
Definition tiling.c:1397
static int _min(int a, int b)
Definition tiling.c:82
#define CL_ALIGNMENT
Definition tiling.c:55
int default_process_tiling_cl(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const int in_bpp)
Definition tiling.c:1707
static int _align_down(int n, int a)
Definition tiling.c:97
static int _maximum_number_tiles()
Definition tiling.c:111
#define ALPHA
Definition tiling.c:204
static unsigned _gcd(unsigned a, unsigned b)
Definition tiling.c:63