Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
box_filters.c
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2020-2021 Hubert Kowalski.
4 Copyright (C) 2020-2021 Pascal Obry.
5 Copyright (C) 2020-2021 Ralf Brown.
6 Copyright (C) 2022 Martin Bařinka.
7 Copyright (C) 2023 Luca Zulberti.
8 Copyright (C) 2025-2026 Aurélien PIERRE.
9
10 darktable is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 darktable is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with darktable. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#ifdef HAVE_CONFIG_H
25#include "config.h"
26#endif
27#include <assert.h>
28#include <math.h>
29#include <stdlib.h>
30
31#include "common/box_filters.h"
32#include "common/darktable.h"
33#include "common/math.h"
34#include "develop/imageop.h"
36
37#if defined(__x86_64__) || defined(__i386__)
38#define DT_PREFETCH(addr) _mm_prefetch(addr, _MM_HINT_T2)
39#define PREFETCH_NTA(addr) _mm_prefetch(addr, _MM_HINT_NTA)
40#elif defined(__GNUC__) && __GNUC__ > 7
41#define DT_PREFETCH(addr) __builtin_prefetch(addr,1,1)
42#define PREFETCH_NTA(addr) __builtin_prefetch(addr,1,0)
43#else
44#define DT_PREFETCH(addr)
45#define PREFETCH_NTA(addr)
46#endif
47
49static void blur_horizontal_1ch(float *const restrict buf, const int height, const int width, const int radius,
50 float *const restrict scanlines, const size_t padded_size)
51{
53 for(int y = 0; y < height; y++)
54 {
55 float L = 0;
56 int hits = 0;
57 const size_t index = (size_t)y * width;
58 float *const restrict scanline = dt_get_perthread(scanlines,padded_size);
59 // add up the left half of the window
60 for (int x = 0; x < MIN(radius,width) ; x++)
61 {
62 L += buf[index+x];
63 hits++;
64 }
65 // process the blur up to the point where we start removing values
66 int x;
67 for (x = 0; (x <= radius) && ((x + radius) < width); x++)
68 {
69 const int np = x + radius;
70 if(np < width)
71 {
72 L += buf[index + np];
73 hits++;
74 }
75 scanline[x] = L / hits;
76 }
77 // if radius > width/2, we have pixels for which we can neither add new values (x+radius >= width) nor
78 // remove old values (x-radius < 0)
79 for(; x <= radius && x < width; x++)
80 {
81 scanline[x] = L / hits;
82 }
83 // process the blur for the bulk of the scan line
84 for(; x + radius < width; x++)
85 {
86 const int op = x - radius - 1;
87 const int np = x + radius;
88 L -= buf[index + op];
89 L += buf[index + np];
90 scanline[x] = L / hits;
91 }
92 // process the right end where we have no more values to add to the running sum
93 for(; x < width; x++)
94 {
95 const int op = x - radius - 1;
96 L -= buf[index + op];
97 hits--;
98 scanline[x] = L / hits;
99 }
100 // copy blurred values back to original location in buffer
101 for(x = 0; x < width; x++)
102 buf[index + x] = scanline[x];
103 }
104 return;
105}
106
108static void blur_horizontal_2ch(float *const restrict buf, const int height, const int width, const int radius,
109 float *const restrict scanlines, const size_t padded_size)
110{
112 for(int y = 0; y < height; y++)
113 {
114 float *const restrict scanline = dt_get_perthread(scanlines, padded_size);
115 float L1 = 0.0f, L2 = 0.0f;
116 int hits = 0;
117 const size_t index = (size_t)2 * y * width;
118 // add up the left half of the window
119 for (int x = 0; x < MIN(radius, width) ; x++)
120 {
121 hits++;
122 L1 += buf[index + 2*x];
123 L2 += buf[index + 2*x + 1];
124 }
125 // process the blur up to the point where we start removing values
126 int x;
127 for (x = 0; (x <= radius) && ((x + radius) < width); x++)
128 {
129 const int np = x + radius;
130 if(np < width)
131 {
132 hits++;
133 L1 += buf[index + 2*np];
134 L2 += buf[index + 2*np + 1];
135 }
136 scanline[2*x] = L1 / hits;
137 scanline[2*x+1] = L2 / hits;
138 }
139 // if radius > width/2, we have pixels for which we can neither add new values (x+radius >= width) nor
140 // remove old values (x-radius < 0)
141 for(; x <= radius && x < width; x++)
142 {
143 scanline[2*x] = L1 / hits;
144 scanline[2*x+1] = L2 / hits;
145 }
146 // process the blur for the bulk of the scan line
147 for(; x + radius < width; x++)
148 {
149 const int op = x - radius - 1;
150 const int np = x + radius;
151 L1 = L1 - buf[index + 2*op] + buf[index + 2*np];
152 L2 = L2 - buf[index + 2*op + 1] + buf[index + 2*np + 1];
153 scanline[2*x] = L1 / hits;
154 scanline[2*x+1] = L2 / hits;
155 }
156 // process the right end where we have no more values to add to the running sum
157 for(; x < width; x++)
158 {
159 const int op = x - radius - 1;
160 hits--;
161 L1 -= buf[index + 2*op];
162 L2 -= buf[index + 2*op + 1];
163 scanline[2*x] = L1 / hits;
164 scanline[2*x+1] = L2 / hits;
165 }
166 // copy blurred values back to original location in buffer
167 for(x = 0; x < 2*width; x++)
168 {
169 buf[index + x] = scanline[x];
170 }
171 }
172 return;
173}
174
175// Put the to-be-vectorized loop into a function by itself to nudge the compiler into actually vectorizing...
176// With optimization enabled, this gets inlined and interleaved with other instructions as though it had been
177// written in place, so we get a net win from better vectorization.
178static inline __attribute__((always_inline)) void load_add_4wide(float *const restrict out, dt_aligned_pixel_t accum, const float *const restrict values)
179{
180 for_four_channels(c,aligned(accum, out))
181 {
182 const float v = values[c];
183 accum[c] += v;
184 out[c] = v;
185 }
186}
187
188static inline __attribute__((always_inline)) void sub_4wide(float *const restrict accum, const dt_aligned_pixel_t values)
189{
190 for_four_channels(c,aligned(accum))
191 accum[c] -= values[c];
192}
193
194// Put the to-be-vectorized loop into a function by itself to nudge the compiler into actually vectorizing...
195// With optimization enabled, this gets inlined and interleaved with other instructions as though it had been
196// written in place, so we get a net win from better vectorization.
197static inline __attribute__((always_inline)) void load_add_4wide_Kahan(float *const restrict out, dt_aligned_pixel_t accum,
198 const float *const restrict values, float *const restrict comp)
199{
200 for_four_channels(c,aligned(accum, comp, out))
201 {
202 const float v = values[c];
203 out[c] = v;
204 // Kahan (compensated) summation
205 const float t1 = v - comp[c];
206 const float t2 = accum[c] + t1;
207 comp[c] = (t2 - accum[c]) - t1;
208 accum[c] = t2;
209 }
210}
211
212static inline __attribute__((always_inline)) void sub_4wide_Kahan(float *const restrict accum, const dt_aligned_pixel_t values,
213 float *const restrict comp)
214{
215 for_four_channels(c,aligned(accum,comp,values))
216 {
217 // Kahan (compensated) summation
218 const float t1 = -values[c] - comp[c];
219 const float t2 = accum[c] + t1;
220 comp[c] = (t2 - accum[c]) - t1;
221 accum[c] = t2;
222 }
223}
224
225static inline __attribute__((always_inline)) void store_scaled_4wide(float *const restrict out, const dt_aligned_pixel_t in, const float scale)
226{
227 for_four_channels(c,aligned(in))
228 out[c] = in[c] / scale;
229}
230
232static void sub_16wide(float *const restrict accum, const float *const restrict values)
233{
234 __OMP_SIMD__(aligned(accum : 64) aligned(values : 16))
235 for(size_t c = 0; c < 16; c++)
236 accum[c] -= values[c];
237}
238
239// copy 16 floats from a possibly-unaligned buffer into aligned temporary space, and also add to accumulator
241static void load_add_16wide(float *const restrict out, float *const restrict accum, const float *const restrict in)
242{
243 __OMP_SIMD__(aligned(accum, out : 64))
244 for (size_t c = 0; c < 16; c++)
245 {
246 const float v = in[c];
247 accum[c] += v;
248 out[c] = v;
249 }
250}
251
253static void sub_16wide_Kahan(float *const restrict accum, const float *const restrict values,
254 float *const restrict comp)
255{
256 __OMP_SIMD__(aligned(accum,comp : 64) aligned(values : 16))
257 for(size_t c = 0; c < 16; c++)
258 {
259 const float v = -values[c];
260 // Kahan (compensated) summation
261 const float t1 = v - comp[c];
262 const float t2 = accum[c] + t1;
263 comp[c] = (t2 - accum[c]) - t1;
264 accum[c] = t2;
265 }
266}
267
268// copy 16 floats from a possibly-unaligned buffer into aligned temporary space, and also add to accumulator
270static void load_add_16wide_Kahan(float *const restrict out, float *const restrict accum,
271 const float *const restrict in, float *const restrict comp)
272{
273 __OMP_SIMD__(aligned(accum, comp, out : 64))
274 for (size_t c = 0; c < 16; c++)
275 {
276 const float v = in[c];
277 out[c] = v;
278 // Kahan (compensated) summation
279 const float t1 = v - comp[c];
280 const float t2 = accum[c] + t1;
281 comp[c] = (t2 - accum[c]) - t1;
282 accum[c] = t2;
283 }
284}
285
286// copy 16 floats from aligned temporary space back to the possibly-unaligned user buffer
288static void store_16wide(float *const restrict out, const float *const restrict in)
289{
290 __OMP_SIMD__(aligned(in : 64))
291 for (size_t c = 0; c < 16; c++)
292 out[c] = in[c];
293}
294
296static void store_scaled_16wide(float *const restrict out, const float *const restrict in, const float scale)
297{
298 __OMP_SIMD__(aligned(in : 64))
299 for(size_t c = 0; c < 16; c++)
300 out[c] = in[c] / scale;
301}
302
304static void sub_Nwide_Kahan(const size_t N, float *const restrict accum, const float *const restrict values,
305 float *const restrict comp)
306{
307 __OMP_SIMD__(aligned(accum,comp : 64))
308 for(size_t c = 0; c < N; c++)
309 {
310 const float v = -values[c];
311 // Kahan (compensated) summation
312 const float t1 = v - comp[c];
313 const float t2 = accum[c] + t1;
314 comp[c] = (t2 - accum[c]) - t1;
315 accum[c] = t2;
316 }
317}
318
319// copy N (<=16) floats from a possibly-unaligned buffer into aligned temporary space, and also add to accumulator
321static void load_add_Nwide_Kahan(const size_t N, float *const restrict out, float *const restrict accum,
322 const float *const restrict in, float *const restrict comp)
323{
324 __OMP_SIMD__(aligned(accum, comp : 64))
325 for (size_t c = 0; c < N; c++)
326 {
327 const float v = in[c];
328 out[c] = v;
329 // Kahan (compensated) summation
330 const float t1 = v - comp[c];
331 const float t2 = accum[c] + t1;
332 comp[c] = (t2 - accum[c]) - t1;
333 accum[c] = t2;
334 }
335}
336
338static void store_scaled_Nwide(const size_t N, float *const restrict out, const float *const restrict in,
339 const float scale)
340{
341 __OMP_SIMD__(aligned(in : 64))
342 for(size_t c = 0; c < N; c++)
343 out[c] = in[c] / scale;
344}
345
346
348static void blur_horizontal_4ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius,
349 float *const restrict scanlines, const size_t padded_size)
350{
352 for(int y = 0; y < height; y++)
353 {
354 float *const restrict scratch = dt_get_perthread(scanlines,padded_size);
355 dt_aligned_pixel_t L = { 0, 0, 0, 0 };
356 size_t hits = 0;
357 const size_t index = (size_t)4 * y * width;
358 float *const restrict bufp = buf + index;
359 // add up the left half of the window
360 for (size_t x = 0; x < MIN(radius,width) ; x++)
361 {
362 hits++;
363 load_add_4wide(scratch + 4*x, L, bufp + 4*x);
364 }
365 // process the blur up to the point where we start removing values
366 size_t x;
367 for (x = 0; (x <= radius) && ((x + radius) < width); x++)
368 {
369 const int np = x + radius;
370 hits++;
371 load_add_4wide(scratch + 4*np, L, bufp + 4*np);
372 store_scaled_4wide(bufp + 4*x, L, hits);
373 }
374 // if radius > width/2, we have pixels for which we can neither add new values (x+radius >= width) nor
375 // remove old values (x-radius < 0)
376 for(; x <= radius && x < width; x++)
377 {
378 store_scaled_4wide(bufp + 4*x, L, hits);
379 }
380 // process the blur for the bulk of the scan line
381 for(; x + radius < width; x++)
382 {
383 //very strange: if any of the 'op' or 'np' variables in this function are changed to either
384 // 'unsigned' or 'size_t', the function runs a fair bit slower....
385 const int op = x - radius - 1;
386 const int np = x + radius;
387 sub_4wide(L, scratch + 4*op);
388 load_add_4wide(scratch + 4*np, L, bufp + 4*np);
389 store_scaled_4wide(bufp + 4*x, L, hits);
390 }
391 // process the right end where we have no more values to add to the running sum
392 for(; x < width; x++)
393 {
394 const int op = x - radius - 1;
395 hits--;
396 sub_4wide(L, scratch + 4*op);
397 store_scaled_4wide(bufp + 4*x, L, hits);
398 }
399 }
400 return;
401}
402
403// invoked inside an OpenMP parallel for, so no need to parallelize
405static void blur_horizontal_4ch_Kahan(float *const restrict buf, const size_t width,
406 const size_t radius, float *const restrict scratch)
407{
408 dt_aligned_pixel_t L = { 0, 0, 0, 0 };
409 dt_aligned_pixel_t comp = { 0, 0, 0, 0 };
410 size_t hits = 0;
411 // add up the left half of the window
412 for (size_t x = 0; x < MIN(radius,width) ; x++)
413 {
414 hits++;
415 load_add_4wide_Kahan(scratch + 4*x, L, buf + 4*x, comp);
416 }
417 // process the blur up to the point where we start removing values from the moving average
418 size_t x;
419 for (x = 0; (x <= radius) && ((x + radius) < width); x++)
420 {
421 const int np = x + radius;
422 hits++;
423 load_add_4wide_Kahan(scratch + 4*np, L, buf + 4*np, comp);
424 store_scaled_4wide(buf + 4*x, L, hits);
425 }
426 // if radius > width/2, we have pixels for which we can neither add new values (x+radius >= width) nor
427 // remove old values (x-radius < 0)
428 for(; x <= radius && x < width; x++)
429 {
430 store_scaled_4wide(buf + 4*x, L, hits);
431 }
432 // process the blur for the bulk of the scan line
433 for(; x + radius < width; x++)
434 {
435 const int op = x - radius - 1;
436 const int np = x + radius;
437 sub_4wide_Kahan(L, scratch + 4*op, comp);
438 load_add_4wide_Kahan(scratch + 4*np, L, buf + 4*np, comp);
439 store_scaled_4wide(buf + 4*x, L, hits);
440 }
441 // process the right end where we have no more values to add to the running sum
442 for(; x < width; x++)
443 {
444 const int op = x - radius - 1;
445 hits--;
446 sub_4wide_Kahan(L, scratch + 4*op, comp);
447 store_scaled_4wide(buf + 4*x, L, hits);
448 }
449 return;
450}
451
452// invoked inside an OpenMP parallel for, so no need to parallelize
454static void blur_horizontal_Nch_Kahan(const size_t N, float *const restrict buf, const size_t width,
455 const size_t radius, float *const restrict scratch)
456{
457 if (N > 16) return;
458 if (N != 9) return; // since we only use 9 channels at the moment, give the compiler a big hint
459
460 float DT_ALIGNED_ARRAY L[16] = { 0, 0, 0, 0 };
461 float DT_ALIGNED_ARRAY comp[16] = { 0, 0, 0, 0 };
462 size_t hits = 0;
463 // add up the left half of the window
464 for (size_t x = 0; x < MIN(radius,width) ; x++)
465 {
466 hits++;
467 load_add_Nwide_Kahan(N, scratch + N*x, L, buf + N*x, comp);
468 }
469 // process the blur up to the point where we start removing values from the moving average
470 size_t x;
471 for (x = 0; (x <= radius) && ((x + radius) < width); x++)
472 {
473 const int np = x + radius;
474 hits++;
475 load_add_Nwide_Kahan(N, scratch + N*np, L, buf + N*np, comp);
476 store_scaled_Nwide(N, buf + N*x, L, hits);
477 }
478 // if radius > width/2, we have pixels for which we can neither add new values (x+radius >= width) nor
479 // remove old values (x-radius < 0)
480 for(; x <= radius && x < width; x++)
481 {
482 store_scaled_Nwide(N, buf + N*x, L, hits);
483 }
484 // process the blur for the bulk of the scan line
485 for(; x + radius < width; x++)
486 {
487 const int op = x - radius - 1;
488 const int np = x + radius;
489 sub_Nwide_Kahan(N, L, scratch + N*op, comp);
490 load_add_Nwide_Kahan(N, scratch + N*np, L, buf + N*np, comp);
491 store_scaled_Nwide(N, buf + N*x, L, hits);
492 }
493 // process the right end where we have no more values to add to the running sum
494 for(; x < width; x++)
495 {
496 const int op = x - radius - 1;
497 hits--;
498 sub_Nwide_Kahan(N, L, scratch + N*op, comp);
499 store_scaled_Nwide(N, buf + N*x, L, hits);
500 }
501 return;
502}
503
504// invoked inside an OpenMP parallel for, so no need to parallelize
506static void blur_vertical_1wide(float *const restrict buf, const size_t height, const size_t width,
507 const size_t radius, float *const restrict scratch)
508{
509 // To improve cache hit rates, we copy the final result from the scratch space back to the original
510 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
511 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
512 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
513 // needs to be the power of two larger than the window size (2*radius+1).
514 size_t mask = 1;
515 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
516
517 float L = 0.0f;
518 size_t hits = 0;
519 // add up the left half of the window
520 for (size_t y = 0; y < MIN(radius, height); y++)
521 {
522 PREFETCH_NTA(buf + (y+16)*width);
523 hits++;
524 const float v = buf[y*width];
525 L += v;
526 scratch[y&mask] = v;
527 }
528 // process up to the point where we start removing values from the moving average
529 size_t y;
530 for (y = 0; y <= radius && y + radius < height; y++)
531 {
532 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
533 const int np = y + radius;
534 PREFETCH_NTA(buf + (np+16)*width);
535 hits++;
536 const float v = buf[np*width];
537 L += v;
538 scratch[np&mask] = v;
539 buf[y*width] = L / hits;
540 }
541 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
542 // remove old values (y-radius < 0)
543 for(; y <= radius && y < height; y++)
544 {
545 buf[y*width] = L / hits;
546 }
547 // process the bulk of the column
548 for( ; y + radius < height; y++)
549 {
550 const int np = y + radius;
551 const int op = y - radius - 1;
552 PREFETCH_NTA(buf + (np+16)*width);
553 L -= scratch[op&mask];
554 const float v = buf[np*width];
555 L += v;
556 scratch[np&mask] = v;
557 // update the means
558 buf[y*width] = L / hits;
559 }
560 // process the end of the column, where we don't have any more values to add to the mean
561 for( ; y < height; y++)
562 {
563 const int op = y - radius - 1;
564 hits--;
565 L -= scratch[op&mask];
566 // update the means
567 buf[y*width] = L / hits;
568 }
569 return;
570}
571
572// invoked inside an OpenMP parallel for, so no need to parallelize
574static void blur_vertical_1wide_Kahan(float *const restrict buf, const size_t height, const size_t width,
575 const size_t radius, float *const restrict scratch)
576{
577 // To improve cache hit rates, we copy the final result from the scratch space back to the original
578 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
579 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
580 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
581 // needs to be the power of two larger than the window size (2*radius+1).
582 size_t mask = 1;
583 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
584
585 float L = 0.0f;
586 float c = 0.0f;
587 size_t hits = 0;
588 // add up the left half of the window
589 for (size_t y = 0; y < MIN(radius, height); y++)
590 {
591 PREFETCH_NTA(buf + (y+16)*width);
592 hits++;
593 const float v = buf[y*width];
594 L = Kahan_sum(L, &c, v);
595 scratch[y&mask] = v;
596 }
597 // process up to the point where we start removing values from the moving average
598 size_t y;
599 for (y = 0; y <= radius && y + radius < height; y++)
600 {
601 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
602 const int np = y + radius;
603 hits++;
604 PREFETCH_NTA(buf + (np+16)*width);
605 const float v = buf[np*width];
606 L = Kahan_sum(L, &c, v);
607 scratch[np&mask] = v;
608 buf[y*width] = L / hits;
609 }
610 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
611 // remove old values (y-radius < 0)
612 for(; y <= radius && y < height; y++)
613 {
614 buf[y*width] = L / hits;
615 }
616 // process the bulk of the column
617 for( ; y + radius < height; y++)
618 {
619 const int np = y + radius;
620 const int op = y - radius - 1;
621 PREFETCH_NTA(buf + (np+16)*width);
622 L = Kahan_sum(L, &c, -scratch[op&mask]);
623 const float v = buf[np*width];
624 L = Kahan_sum(L, &c, v);
625 scratch[np&mask] = v;
626 // update the means
627 buf[y*width] = L / hits;
628 }
629 // process the end of the column, where we don't have any more values to add to the mean
630 for( ; y < height; y++)
631 {
632 const int op = y - radius - 1;
633 hits--;
634 L = Kahan_sum(L, &c, scratch[op&mask]);
635 // update the means
636 buf[y*width] = L / hits;
637 }
638 return;
639}
640
641// invoked inside an OpenMP parallel for, so no need to parallelize
643static void blur_vertical_4wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius,
644 float *const restrict scratch)
645{
646 // To improve cache hit rates, we copy the final result from the scratch space back to the original
647 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
648 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
649 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
650 // needs to be the power of two larger than the window size (2*radius+1).
651 size_t mask = 1;
652 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
653
654 dt_aligned_pixel_t L = { 0, 0, 0, 0 };
655 size_t hits = 0;
656 // add up the left half of the window
657 for (size_t y = 0; y < MIN(radius, height); y++)
658 {
659 DT_PREFETCH(buf + (y+16)*width);
660 hits++;
661 load_add_4wide(scratch + 4*(y&mask), L, buf + y * width);
662 }
663 // process the blur up to the point where we start removing values
664 size_t y;
665 for (y = 0; y <= radius && y + radius < height; y++)
666 {
667 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
668 const int np = y + radius;
669 hits++;
670 DT_PREFETCH(buf + (np+16)*width);
671 load_add_4wide(scratch + 4*(np&mask), L, buf + np*width);
672 store_scaled_4wide(buf + y*width, L, hits);
673 }
674 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
675 // remove old values (y-radius < 0)
676 for(; y <= radius && y < height; y++)
677 {
678 store_scaled_4wide(buf + y*width, L, hits);
679 }
680 // process the blur for the bulk of the column
681 for ( ; y + radius < height; y++)
682 {
683 const int np = y + radius;
684 const int op = y - radius - 1;
685 DT_PREFETCH(buf + (np+16)*width);
686 sub_4wide(L, scratch + 4*(op&mask));
687 load_add_4wide(scratch + 4*(np&mask), L, buf + np*width);
688 store_scaled_4wide(buf + y*width, L, hits);
689 }
690 // process the blur for the end of the scan line, where we don't have any more values to add to the mean
691 for ( ; y < height; y++)
692 {
693 const int op = y - radius - 1;
694 hits--;
695 sub_4wide(L, scratch + 4*(op&mask));
696 store_scaled_4wide(buf + y*width, L, hits);
697 }
698 return;
699}
700
701// invoked inside an OpenMP parallel for, so no need to parallelize
703static void blur_vertical_4wide_Kahan(float *const restrict buf, const size_t height, const size_t width,
704 const size_t radius, float *const restrict scratch)
705{
706 // To improve cache hit rates, we copy the final result from the scratch space back to the original
707 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
708 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
709 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
710 // needs to be the power of two larger than the window size (2*radius+1).
711 size_t mask = 1;
712 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
713
714 dt_aligned_pixel_t L = { 0, 0, 0, 0 };
715 dt_aligned_pixel_t comp = { 0, 0, 0, 0 };
716 size_t hits = 0;
717 // add up the left half of the window
718 for (size_t y = 0; y < MIN(radius, height); y++)
719 {
720 DT_PREFETCH(buf + (y+16)*width);
721 hits++;
722 load_add_4wide_Kahan(scratch + 4*(y&mask), L, buf + y * width, comp);
723 }
724 // process the blur up to the point where we start removing values
725 size_t y;
726 for (y = 0; y <= radius && y + radius < height; y++)
727 {
728 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
729 const int np = y + radius;
730 hits++;
731 DT_PREFETCH(buf + (np+16)*width);
732 load_add_4wide_Kahan(scratch + 4*(np&mask), L, buf + np*width, comp);
733 store_scaled_4wide(buf + y*width, L, hits);
734 }
735 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
736 // remove old values (y-radius < 0)
737 for(; y <= radius && y < height; y++)
738 {
739 store_scaled_4wide(buf + y*width, L, hits);
740 }
741 // process the blur for the bulk of the scan line
742 for ( ; y + radius < height; y++)
743 {
744 const int np = y + radius;
745 const int op = y - radius - 1;
746 DT_PREFETCH(buf + (np+16)*width);
747 sub_4wide_Kahan(L, scratch + 4*(op&mask), comp);
748 load_add_4wide_Kahan(scratch + 4*(np&mask), L, buf + np*width, comp);
749 store_scaled_4wide(buf + y*width, L, hits);
750 }
751 // process the blur for the end of the scan line, where we don't have any more values to add to the mean
752 for ( ; y < height; y++)
753 {
754 const int op = y - radius - 1;
755 hits--;
756 sub_4wide_Kahan(L, scratch + 4*(op&mask), comp);
757 store_scaled_4wide(buf + y*width, L, hits);
758 }
759 return;
760}
761
762// invoked inside an OpenMP parallel for, so no need to parallelize
764static void blur_vertical_16wide(float *const restrict buf, const size_t height, const size_t width,
765 const size_t radius, float *const restrict scratch)
766{
767 // To improve cache hit rates, we copy the final result from the scratch space back to the original
768 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
769 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
770 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
771 // needs to be the power of two larger than the window size (2*radius+1).
772 size_t mask = 1;
773 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
774
775 float DT_ALIGNED_ARRAY L[16] = { 0, 0, 0, 0 };
776 float hits = 0;
777 // add up the left half of the window
778 for (size_t y = 0; y < MIN(radius, height); y++)
779 {
780 PREFETCH_NTA(buf + (y+16)*width);
781 hits++;
782 load_add_16wide(scratch + 16 * (y&mask), L, buf + y*width);
783 }
784 // process the blur up to the point where we start removing values from the moving average
785 size_t y;
786 for (y = 0; y <= radius && y + radius < height; y++)
787 {
788 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
789 const int np = y + radius;
790 hits++;
791 PREFETCH_NTA(buf + (np+16)*width);
792 load_add_16wide(scratch + 16 * (np&mask), L, buf + np*width);
793 store_scaled_16wide(buf + y*width, L, hits);
794 }
795 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
796 // remove old values (y-radius < 0)
797 for(; y <= radius && y < height; y++)
798 {
799 store_scaled_16wide(buf + y*width, L, hits);
800 }
801 // process the blur for the bulk of the column
802 for ( ; y + radius < height; y++)
803 {
804 const int np = y + radius;
805 const int op = y - radius - 1;
806 PREFETCH_NTA(buf + (np+16)*width);
807 sub_16wide(L, scratch + 16*(op&mask));
808 load_add_16wide(scratch + 16*(np&mask), L, buf + np*width);
809 // update the means
810 store_scaled_16wide(buf + y*width, L, hits);
811 }
812 // process the blur for the end of the scan line, where we don't have any more values to add to the mean
813 for ( ; y < height; y++)
814 {
815 const int op = y - radius - 1;
816 hits--;
817 sub_16wide(L, scratch + 16*(op&mask));
818 // update the means
819 store_scaled_16wide(buf + y*width, L, hits);
820 }
821 return;
822}
823
824// invoked inside an OpenMP parallel for, so no need to parallelize
826static void blur_vertical_16wide_Kahan(float *const restrict buf, const size_t height, const size_t width,
827 const size_t radius, float *const restrict scratch)
828{
829 // To improve cache hit rates, we copy the final result from the scratch space back to the original
830 // location in the buffer as soon as we finish the final read of the buffer. To reduce the working
831 // set and further improve cache hits, we can treat the scratch space as a circular buffer and cycle
832 // through it repeatedly. To use a simple bitmask instead of a division, the size we cycle through
833 // needs to be the power of two larger than the window size (2*radius+1).
834 size_t mask = 1;
835 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) mask = (mask << 1) | 1;
836
837 float DT_ALIGNED_ARRAY L[16] = { 0, 0, 0, 0 };
838 float DT_ALIGNED_ARRAY comp[16] = { 0, 0, 0, 0 };
839 float hits = 0;
840 // add up the left half of the window
841 for (size_t y = 0; y < MIN(radius, height); y++)
842 {
843 DT_PREFETCH(buf + (y+16)*width);
844 hits++;
845 load_add_16wide_Kahan(scratch + 16 * (y&mask), L, buf + y*width, comp);
846 }
847 // process the blur up to the point where we start removing values from the moving average
848 size_t y;
849 for (y = 0; y <= radius && y + radius < height; y++)
850 {
851 // weirdly, changing any of the 'np' or 'op' variables in this function to 'size_t' yields a substantial slowdown!
852 const int np = y + radius;
853 hits++;
854 DT_PREFETCH(buf + (np+16)*width);
855 load_add_16wide_Kahan(scratch + 16 * (np&mask), L, buf + np*width, comp);
856 store_scaled_16wide(buf + y*width, L, hits);
857 }
858 // if radius > height/2, we have pixels for which we can neither add new values (y+radius >= height) nor
859 // remove old values (y-radius < 0)
860 for(; y <= radius && y < height; y++)
861 {
862 store_scaled_16wide(buf + y*width, L, hits);
863 }
864 // process the blur for the bulk of the column
865 for ( ; y + radius < height; y++)
866 {
867 const int np = y + radius;
868 const int op = y - radius - 1;
869 DT_PREFETCH(buf + (np+16)*width);
870 sub_16wide_Kahan(L, scratch + 16*(op&mask), comp);
871 load_add_16wide_Kahan(scratch + 16*(np&mask), L, buf + np*width, comp);
872 // update the means
873 store_scaled_16wide(buf + y*width, L, hits);
874 }
875 // process the blur for the end of the scan line, where we don't have any more values to add to the mean
876 for ( ; y < height; y++)
877 {
878 const int op = y - radius - 1;
879 hits--;
880 sub_16wide_Kahan(L, scratch + 16*(op&mask), comp);
881 // update the means
882 store_scaled_16wide(buf + y*width, L, hits);
883 }
884 return;
885}
886
888static void blur_vertical_1ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius,
889 float *const restrict scanlines, const size_t padded_size)
890{
892 for(int x = 0; x < width; x += 16)
893 {
894 float *const restrict scratch = dt_get_perthread(scanlines,padded_size);
895 if (x + 16 <= width)
896 {
897 blur_vertical_16wide(buf + x, height, width, radius, scratch);
898 }
899 else
900 {
901 // handle the leftover 1..15 columns, first in groups of four, then the final 0..3 singly
902 int col = x;
903 for( ; col < (width & ~3); col += 4)
904 blur_vertical_4wide(buf + col, height, width, radius, scratch);
905 for( ; col < width; col++)
906 blur_vertical_1wide(buf + col, height, width, radius, scratch);
907 }
908 }
909 return;
910}
911
912// determine the size of the scratch buffer needed for vertical passes of the box-mean filter
913// filter_window = 2**ceil(lg2(2*radius+1))
915static size_t _compute_effective_height(const size_t height, const size_t radius)
916{
917 size_t eff_height = 2;
918 for(size_t r = (2*radius+1); r > 1 ; r >>= 1) eff_height <<= 1;
919 eff_height = MIN(eff_height,height);
920 return eff_height;
921}
922
924static int dt_box_mean_1ch(float *const buf, const size_t height, const size_t width, const size_t radius,
925 const unsigned iterations)
926{
927 // scratch space needed per thread:
928 // width floats to store one row during horizontal pass
929 // 16*filter_window floats for vertical pass
930 const size_t eff_height = _compute_effective_height(height,radius);
931 const size_t size = MAX(width,16*eff_height);
932 size_t padded_size;
933 float *const restrict scanlines = dt_pixelpipe_cache_alloc_perthread_float(size, &padded_size);
934 if(IS_NULL_PTR(scanlines)) return 1;
935
936 for(unsigned iteration = 0; iteration < iterations; iteration++)
937 {
938 blur_horizontal_1ch(buf, height, width, radius, scanlines, padded_size);
939 blur_vertical_1ch(buf, height, width, radius, scanlines, padded_size);
940 }
941
943 return 0;
944}
945
947static int dt_box_mean_4ch(float *const buf, const int height, const int width, const int radius,
948 const unsigned iterations)
949{
950 // scratch space needed per thread:
951 // 4*width floats to store one row during horizontal pass
952 // 16*filter_window floats for vertical pass
953 const size_t eff_height = _compute_effective_height(height,radius);
954 const size_t size = MAX(4*width,16*eff_height);
955 size_t padded_size;
956 float *const restrict scanlines = dt_pixelpipe_cache_alloc_perthread_float(size, &padded_size);
957 if(IS_NULL_PTR(scanlines)) return 1;
958
959 for(unsigned iteration = 0; iteration < iterations; iteration++)
960 {
961 blur_horizontal_4ch(buf, height, width, radius, scanlines, padded_size);
962 // we need to multiply width by 4 to get the correct stride for the vertical blur
963 blur_vertical_1ch(buf, height, 4*width, radius, scanlines, padded_size);
964 }
965
967 return 0;
968}
969
971static int box_mean_vert_1ch_Kahan(float *const buf, const int height, const size_t width, const size_t radius)
972{
973 const size_t eff_height = _compute_effective_height(height,radius);
974 size_t padded_size;
975 float *const restrict scratch_buf = dt_pixelpipe_cache_alloc_perthread_float(16*eff_height,&padded_size);
976 if(IS_NULL_PTR(scratch_buf)) return 1;
978 for (size_t col = 0; col < width; col += 16)
979 {
980 float *const restrict scratch = dt_get_perthread(scratch_buf,padded_size);
981 if (col + 16 <= width)
982 {
983 blur_vertical_16wide_Kahan(buf + col, height, width, radius, scratch);
984 }
985 else
986 {
987 // handle the 1..15 remaining columns
988 size_t col_ = col;
989 for( ; col_ < (width & ~3); col_ += 4)
990 blur_vertical_4wide_Kahan(buf + col_, height, width, radius, scratch);
991 for( ; col_ < width; col_++)
992 blur_vertical_1wide_Kahan(buf + col_, height, width, radius, scratch);
993 }
994 }
995
997 return 0;
998}
999
1001static int dt_box_mean_4ch_Kahan(float *const buf, const size_t height, const size_t width, const int radius,
1002 const unsigned iterations)
1003{
1004
1005 for(unsigned iteration = 0; iteration < iterations; iteration++)
1006 {
1007 size_t padded_size;
1008 float *const restrict scanlines = dt_pixelpipe_cache_alloc_perthread_float(4*width,&padded_size);
1009 if(IS_NULL_PTR(scanlines)) return 1;
1011 for (size_t row = 0; row < height; row++)
1012 {
1013 float *const restrict scratch = dt_get_perthread(scanlines,padded_size);
1014 blur_horizontal_4ch_Kahan(buf + row * 4 * width, width, radius, scratch);
1015 }
1016
1018
1019 if(box_mean_vert_1ch_Kahan(buf, height, 4*width, radius) != 0) return 1;
1020 }
1021
1022 return 0;
1023}
1024
1025static inline int box_mean_2ch(float *const restrict in, const size_t height, const size_t width,
1026 const int radius, const unsigned iterations)
1027{
1028 // Compute in-place a box average (filter) on a multi-channel image over a window of size 2*radius + 1
1029 // We make use of the separable nature of the filter kernel to speed-up the computation
1030 // by convolving along columns and rows separately (complexity O(2 x radius) instead of O(radius²)).
1031
1032 const size_t eff_height = _compute_effective_height(height, radius);
1033 const size_t Ndim = MAX(4*width,16*eff_height);
1034 size_t padded_size;
1035 float *const restrict temp = dt_pixelpipe_cache_alloc_perthread_float(Ndim, &padded_size);
1036 if (IS_NULL_PTR(temp)) return 1;
1037
1038 for (unsigned iteration = 0; iteration < iterations; iteration++)
1039 {
1040 blur_horizontal_2ch(in, height, width, radius, temp, padded_size);
1041 blur_vertical_1ch(in, height, 2*width, radius, temp, padded_size);
1042 }
1044 return 0;
1045}
1046
1047int dt_box_mean(float *const buf, const size_t height, const size_t width, const int ch,
1048 const int radius, const unsigned iterations)
1049{
1050 if (ch == 1)
1051 {
1052 return dt_box_mean_1ch(buf,height,width,radius,iterations);
1053 }
1054 else if (ch == 4)
1055 {
1056 return dt_box_mean_4ch(buf,height,width,radius,iterations);
1057 }
1058 else if (ch == (4|BOXFILTER_KAHAN_SUM))
1059 {
1060 return dt_box_mean_4ch_Kahan(buf,height,width,radius,iterations);
1061 }
1062 else if (ch == 2) // used by fast_guided_filter.h
1063 {
1064 return box_mean_2ch(buf,height,width,radius,iterations);
1065 }
1066 else
1068 return 1;
1069}
1070
1071int dt_box_mean_horizontal(float *const restrict buf, const size_t width, const int ch, const int radius,
1072 float *const restrict user_scratch)
1073{
1074 if (ch == (4|BOXFILTER_KAHAN_SUM))
1075 {
1076 float *const restrict scratch = user_scratch ? user_scratch
1078 if(IS_NULL_PTR(scratch)) return 1;
1079
1080 blur_horizontal_4ch_Kahan(buf, width, radius, scratch);
1081 if (IS_NULL_PTR(user_scratch))
1083 return 0;
1084 }
1085 else if (ch == (9|BOXFILTER_KAHAN_SUM))
1086 {
1087 float *const restrict scratch = user_scratch ? user_scratch
1089 if(IS_NULL_PTR(scratch)) return 1;
1090
1091 blur_horizontal_Nch_Kahan(9, buf, width, radius, scratch);
1092 if (IS_NULL_PTR(user_scratch))
1094 return 0;
1095 }
1096 else
1098 return 1;
1099}
1100
1101int dt_box_mean_vertical(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
1102{
1103 if ((ch & BOXFILTER_KAHAN_SUM) && (ch & ~BOXFILTER_KAHAN_SUM) <= 16)
1104 {
1105 size_t channels = ch & ~BOXFILTER_KAHAN_SUM;
1106 return box_mean_vert_1ch_Kahan(buf, height, channels*width, radius);
1107 }
1108 else
1110 return 1;
1111}
1112
1113static inline float window_max(const float *x, int n)
1114{
1115 float m = -(FLT_MAX);
1116 __OMP_SIMD__(reduction(max : m))
1117 for(int j = 0; j < n; j++)
1118 m = MAX(m, x[j]);
1119 return m;
1120}
1121
1122// calculate the one-dimensional moving maximum over a window of size 2*w+1
1123// input array x has stride 1, output array y has stride stride_y
1124static inline void box_max_1d(int N, const float *const restrict x, float *const restrict y, size_t stride_y, int w)
1125{
1126 float m = window_max(x, MIN(w + 1, N));
1127 for(int i = 0; i < N; i++)
1128 {
1129 // store maximum of current window at center position
1130 y[i * stride_y] = m;
1131 // if the earliest member of the current window is the max, we need to
1132 // rescan the window to determine the new maximum
1133 if(i - w >= 0 && x[i - w] == m)
1134 {
1135 const int start = i - w + 1;
1136 m = window_max(x + start, MIN(i + w + 2, N) - start);
1137 }
1138 // if the window has not yet exceeded the end of the row/column, update the maximum value
1139 if(i + w + 1 < N)
1140 m = MAX(x[i + w + 1], m);
1141 }
1142}
1143
1145static void set_16wide(float *const restrict out, const float value)
1146{
1147 __OMP_SIMD__(aligned(out : 64))
1148 for (size_t c = 0; c < 16; c++)
1149 out[c] = value;
1150}
1151
1152static inline void update_max_16wide(float m[16], const float *const restrict base)
1153{
1154 __OMP_SIMD__(aligned(m, base : 64))
1155 for (size_t c = 0; c < 16; c++)
1156 {
1157 m[c] = fmaxf(m[c], base[c]);
1158 }
1159}
1160
1161static inline void load_update_max_16wide(float *const restrict out, float m[16], const float *const restrict base)
1162{
1163 __OMP_SIMD__(aligned(out, m : 64))
1164 for (size_t c = 0; c < 16; c++)
1165 {
1166 const float v = base[c];
1167 out[c] = v;
1168 m[c] = fmaxf(m[c], v);
1169 }
1170}
1171
1172// calculate the one-dimensional moving maximum on four adjacent columns over a window of size 2*w+1
1173// input/output array 'buf' has stride 'stride' and we will write 16 consecutive elements every stride elements
1174// (thus processing a cache line at a time)
1175static inline void box_max_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf,
1176 const int stride, const int w, const size_t mask)
1177{
1178 float DT_ALIGNED_ARRAY m[16] = { -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1179 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1180 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1181 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX) };
1182 for(size_t i = 0; i < MIN(w + 1, N); i++)
1183 {
1184 PREFETCH_NTA(buf + stride*(i+24));
1185 load_update_max_16wide(scratch + 16 * (i&mask),m, buf + stride*i);
1186 }
1187 for(size_t i = 0; i < N; i++)
1188 {
1189 PREFETCH_NTA(buf + stride*(i+24));
1190 // store maximum of current window at center position
1191 store_16wide(buf + stride * i, m);
1192 // If the earliest member of the current window is the max, we need to
1193 // rescan the window to determine the new maximum
1194 if (i >= w)
1195 {
1196 set_16wide(m, -(FLT_MAX)); // reset max values to lowest possible
1197 for(int j = i - w + 1; j < MIN(i + w + 1, N); j++)
1198 {
1199 update_max_16wide(m,scratch + 16*(j&mask));
1200 }
1201 }
1202 // if the window has not yet exceeded the end of the row/column, update the maximum value
1203 const size_t n = i + w + 1;
1204 if(n < N)
1205 {
1206 load_update_max_16wide(scratch + 16 * (n&mask), m, buf + stride * n);
1207 }
1208 }
1209}
1210
1211// calculate the two-dimensional moving maximum over a box of size (2*w+1) x (2*w+1)
1212// does the calculation in-place if input and output images are identical
1214static int box_max_1ch(float *const buf, const size_t height, const size_t width, const unsigned w)
1215{
1216 const size_t eff_height = _compute_effective_height(height, w);
1217 const size_t scratch_size = MAX(width,MAX(height,16*eff_height));
1218 size_t allocsize;
1219 float *const restrict scratch_buffers = dt_pixelpipe_cache_alloc_perthread_float(scratch_size,&allocsize);
1220 if(IS_NULL_PTR(scratch_buffers)) return 1;
1222 for(size_t row = 0; row < height; row++)
1223 {
1224 float *const restrict scratch = dt_get_perthread(scratch_buffers,allocsize);
1225 memcpy(scratch, buf + row * width, sizeof(float) * width);
1226 box_max_1d(width, scratch, buf + row * width, 1, w);
1227 }
1229 for(int col = 0; col < (width & ~15); col += 16)
1230 {
1231 float *const restrict scratch = dt_get_perthread(scratch_buffers,allocsize);
1232 box_max_vert_16wide(height, scratch, buf + col, width, w, eff_height-1);
1233 }
1234 // handle the leftover 0..15 columns
1235 for (size_t col = width & ~15 ; col < width; col++)
1236 {
1237 float *const restrict scratch = scratch_buffers;
1238 for(size_t row = 0; row < height; row++)
1239 scratch[row] = buf[row * width + col];
1240 box_max_1d(height, scratch, buf + col, width, w);
1241 }
1242 dt_pixelpipe_cache_free_align(scratch_buffers);
1243 return 0;
1244}
1245
1246
1247// in-place calculate the two-dimensional moving maximum over a box of size (2*radius+1) x (2*radius+1)
1248int dt_box_max(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
1249{
1250 if (ch == 1)
1251 return box_max_1ch(buf, height, width, radius);
1252 else
1253 //TODO: 4ch version if needed
1255 return 1;
1256}
1257
1258static inline float window_min(const float *x, int n)
1259{
1260 float m = FLT_MAX;
1261 __OMP_SIMD__(reduction(min : m))
1262 for(int j = 0; j < n; j++)
1263 m = MIN(m, x[j]);
1264 return m;
1265}
1266
1267// calculate the one-dimensional moving minimum over a window of size 2*w+1
1268// input array x has stride 1, output array y has stride stride_y
1269static inline void box_min_1d(int N, const float *x, float *y, size_t stride_y, int w)
1270{
1271 float m = window_min(x, MIN(w + 1, N));
1272 for(int i = 0; i < N; i++)
1273 {
1274 y[i * stride_y] = m;
1275 if(i - w >= 0 && x[i - w] == m)
1276 {
1277 const int start = (i - w + 1);
1278 m = window_min(x + start, MIN((i + w + 2), N) - start);
1279 }
1280 // if the window has not yet exceeded the end of the row/column, update the minimum value
1281 if(i + w + 1 < N)
1282 m = MIN(x[i + w + 1], m);
1283 }
1284}
1285
1286static inline void update_min_16wide(float m[16], const float *const restrict base)
1287{
1288 __OMP_SIMD__(aligned(m, base : 64))
1289 for (size_t c = 0; c < 16; c++)
1290 {
1291 m[c] = fminf(m[c], base[c]);
1292 }
1293}
1294
1295static inline void load_update_min_16wide(float *const restrict out, float m[16], const float *const restrict base)
1296{
1297 __OMP_SIMD__(aligned(out, m : 64))
1298 for (size_t c = 0; c < 16; c++)
1299 {
1300 const float v = base[c];
1301 out[c] = v;
1302 m[c] = fminf(m[c], v);
1303 }
1304}
1305
1306// calculate the one-dimensional moving minimum on four adjacent columns over a window of size 2*w+1
1307// input/output array 'buf' has stride 'stride' and we will write 16 consecutive elements every stride elements
1308// (thus processing a cache line at a time)
1309static inline void box_min_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf,
1310 const int stride, const int w, const size_t mask)
1311{
1312 float DT_ALIGNED_ARRAY m[16] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
1313 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
1314 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
1315 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
1316 for(size_t i = 0; i < MIN(w + 1, N); i++)
1317 {
1318 PREFETCH_NTA(buf + stride*(i+24));
1319 load_update_min_16wide(scratch + 16*(i&mask), m, buf + stride*i);
1320 }
1321 for(size_t i = 0; i < N; i++)
1322 {
1323 PREFETCH_NTA(buf + stride*(i+24));
1324 // store minimum of current window at center position
1325 store_16wide(buf + i * stride, m);
1326 // If the earliest member of the current window is the min, we need to
1327 // rescan the window to determine the new minimum
1328 if (i >= w)
1329 {
1330 set_16wide(m, FLT_MAX); // reset min values to the highest possible
1331 for(int j = i - w + 1; j < MIN(i + w + 1, N); j++)
1332 {
1333 update_min_16wide(m,scratch + 16*(j&mask));
1334 }
1335 }
1336 // if the window has not yet exceeded the end of the row/column, update the minimum value
1337 const size_t n = i + w + 1;
1338 if(n < N)
1339 {
1340 load_update_min_16wide(scratch + 16 * (n&mask), m, buf + stride * n);
1341 }
1342 }
1343}
1344
1345
1346// calculate the two-dimensional moving minimum over a box of size (2*w+1) x (2*w+1)
1347// does the calculation in-place if input and output images are identical
1349static int box_min_1ch(float *const buf, const size_t height, const size_t width, const int w)
1350{
1351 const size_t eff_height = _compute_effective_height(height, w);
1352 const size_t scratch_size = MAX(width,MAX(height,16*eff_height));
1353 size_t allocsize;
1354 float *const restrict scratch_buffers = dt_pixelpipe_cache_alloc_perthread_float(scratch_size,&allocsize);
1355 if(IS_NULL_PTR(scratch_buffers)) return 1;
1357 for(size_t row = 0; row < height; row++)
1358 {
1359 float *const restrict scratch = dt_get_perthread(scratch_buffers,allocsize);
1360 memcpy(scratch, buf + row * width, sizeof(float) * width);
1361 box_min_1d(width, scratch, buf + row * width, 1, w);
1362 }
1364 for(size_t col = 0; col < (width & ~15); col += 16)
1365 {
1366 float *const restrict scratch = dt_get_perthread(scratch_buffers,allocsize);
1367 box_min_vert_16wide(height, scratch, buf + col, width, w, eff_height-1);
1368 }
1369 // handle the leftover 0..15 columns
1370 for (size_t col = width & ~15 ; col < width; col++)
1371 {
1372 float *const restrict scratch = scratch_buffers;
1373 for(size_t row = 0; row < height; row++)
1374 scratch[row] = buf[row * width + col];
1375 box_min_1d(height, scratch, buf + col, width, w);
1376 }
1377
1378 dt_pixelpipe_cache_free_align(scratch_buffers);
1379 return 0;
1380}
1381
1382int dt_box_min(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
1383{
1384 if (ch == 1)
1385 return box_min_1ch(buf, height, width, radius);
1386 else
1387 //TODO: 4ch version if needed
1389 return 1;
1390}
1391// clang-format off
1392// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
1393// vim: shiftwidth=2 expandtab tabstop=2 cindent
1394// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
1395// clang-format on
#define m
Definition basecurve.c:278
int width
Definition bilateral.h:1
int height
Definition bilateral.h:1
static __DT_CLONE_TARGETS__ void blur_vertical_16wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void set_16wide(float *const restrict out, const float value)
static void update_max_16wide(float m[16], const float *const restrict base)
static __DT_CLONE_TARGETS__ void blur_vertical_1wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static void load_update_max_16wide(float *const restrict out, float m[16], const float *const restrict base)
#define PREFETCH_NTA(addr)
Definition box_filters.c:45
static void update_min_16wide(float m[16], const float *const restrict base)
static float window_max(const float *x, int n)
static int box_mean_2ch(float *const restrict in, const size_t height, const size_t width, const int radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ int box_max_1ch(float *const buf, const size_t height, const size_t width, const unsigned w)
static __DT_CLONE_TARGETS__ void blur_vertical_16wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void sub_16wide_Kahan(float *const restrict accum, const float *const restrict values, float *const restrict comp)
int dt_box_max(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void blur_vertical_1ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scanlines, const size_t padded_size)
static __DT_CLONE_TARGETS__ int dt_box_mean_4ch_Kahan(float *const buf, const size_t height, const size_t width, const int radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ void blur_horizontal_4ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scanlines, const size_t padded_size)
static __DT_CLONE_TARGETS__ void blur_vertical_4wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void blur_horizontal_1ch(float *const restrict buf, const int height, const int width, const int radius, float *const restrict scanlines, const size_t padded_size)
Definition box_filters.c:49
static __DT_CLONE_TARGETS__ void blur_horizontal_2ch(float *const restrict buf, const int height, const int width, const int radius, float *const restrict scanlines, const size_t padded_size)
static float window_min(const float *x, int n)
static __DT_CLONE_TARGETS__ int dt_box_mean_1ch(float *const buf, const size_t height, const size_t width, const size_t radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ int dt_box_mean_4ch(float *const buf, const int height, const int width, const int radius, const unsigned iterations)
#define DT_PREFETCH(addr)
Definition box_filters.c:44
static __DT_CLONE_TARGETS__ void blur_vertical_4wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void store_scaled_Nwide(const size_t N, float *const restrict out, const float *const restrict in, const float scale)
static __DT_CLONE_TARGETS__ void sub_Nwide_Kahan(const size_t N, float *const restrict accum, const float *const restrict values, float *const restrict comp)
static __DT_CLONE_TARGETS__ void sub_16wide(float *const restrict accum, const float *const restrict values)
static __DT_CLONE_TARGETS__ void store_16wide(float *const restrict out, const float *const restrict in)
static __DT_CLONE_TARGETS__ int box_mean_vert_1ch_Kahan(float *const buf, const int height, const size_t width, const size_t radius)
int dt_box_mean_horizontal(float *const restrict buf, const size_t width, const int ch, const int radius, float *const restrict user_scratch)
static __DT_CLONE_TARGETS__ void blur_vertical_1wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
int dt_box_mean(float *const buf, const size_t height, const size_t width, const int ch, const int radius, const unsigned iterations)
static void box_max_1d(int N, const float *const restrict x, float *const restrict y, size_t stride_y, int w)
static __DT_CLONE_TARGETS__ int box_min_1ch(float *const buf, const size_t height, const size_t width, const int w)
int dt_box_mean_vertical(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void store_scaled_16wide(float *const restrict out, const float *const restrict in, const float scale)
static void load_update_min_16wide(float *const restrict out, float m[16], const float *const restrict base)
static __DT_CLONE_TARGETS__ void blur_horizontal_4ch_Kahan(float *const restrict buf, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void blur_horizontal_Nch_Kahan(const size_t N, float *const restrict buf, const size_t width, const size_t radius, float *const restrict scratch)
static void box_min_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf, const int stride, const int w, const size_t mask)
static __DT_CLONE_TARGETS__ void load_add_Nwide_Kahan(const size_t N, float *const restrict out, float *const restrict accum, const float *const restrict in, float *const restrict comp)
static void box_max_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf, const int stride, const int w, const size_t mask)
static __DT_CLONE_TARGETS__ void load_add_16wide(float *const restrict out, float *const restrict accum, const float *const restrict in)
static __DT_CLONE_TARGETS__ size_t _compute_effective_height(const size_t height, const size_t radius)
int dt_box_min(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void load_add_16wide_Kahan(float *const restrict out, float *const restrict accum, const float *const restrict in, float *const restrict comp)
static void box_min_1d(int N, const float *x, float *y, size_t stride_y, int w)
#define BOXFILTER_KAHAN_SUM
Definition box_filters.h:36
static const float const float const float min
const float max
const dt_colormatrix_t dt_aligned_pixel_t out
static const int row
#define __OMP_SIMD__(...)
Definition darktable.h:262
#define DT_ALIGNED_ARRAY
Definition darktable.h:388
#define dt_pixelpipe_cache_alloc_align_float_cache(pixels, id)
Definition darktable.h:447
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Enable aggressive floating-point arithmetic optimizations, in denormals handling. Set through user pr...
Definition darktable.h:524
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453
#define __DT_CLONE_TARGETS__
Definition darktable.h:367
#define dt_get_perthread(buf, padsize)
Definition darktable.h:1035
#define for_four_channels(_var,...)
Definition darktable.h:664
#define __OMP_PARALLEL_FOR__(...)
Definition darktable.h:258
static const dt_aligned_pixel_simd_t value
Definition darktable.h:577
#define dt_pixelpipe_cache_alloc_perthread_float(n, padded_size)
Definition darktable.h:1030
#define dt_unreachable_codepath()
Definition darktable.h:979
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281
static const float x
const float v
float *const restrict const size_t const size_t ch
static float Kahan_sum(const float m, float *const __restrict__ c, const float add)
Definition math.h:103
size_t size
Definition mipmap_cache.c:3
#define N
float dt_aligned_pixel_t[4]
const float r
#define MIN(a, b)
Definition thinplate.c:32
#define MAX(a, b)
Definition thinplate.c:29