37#if defined(__x86_64__) || defined(__i386__)
38#define DT_PREFETCH(addr) _mm_prefetch(addr, _MM_HINT_T2)
39#define PREFETCH_NTA(addr) _mm_prefetch(addr, _MM_HINT_NTA)
40#elif defined(__GNUC__) && __GNUC__ > 7
41#define DT_PREFETCH(addr) __builtin_prefetch(addr,1,1)
42#define PREFETCH_NTA(addr) __builtin_prefetch(addr,1,0)
44#define DT_PREFETCH(addr)
45#define PREFETCH_NTA(addr)
50 float *
const restrict scanlines,
const size_t padded_size)
53 for(
int y = 0; y <
height; y++)
57 const size_t index = (size_t)y *
width;
67 for (
x = 0; (
x <= radius) && ((
x + radius) <
width);
x++)
69 const int np =
x + radius;
75 scanline[
x] =
L / hits;
81 scanline[
x] =
L / hits;
86 const int op =
x - radius - 1;
87 const int np =
x + radius;
90 scanline[
x] =
L / hits;
95 const int op =
x - radius - 1;
98 scanline[
x] =
L / hits;
102 buf[index +
x] = scanline[
x];
109 float *
const restrict scanlines,
const size_t padded_size)
112 for(
int y = 0; y <
height; y++)
115 float L1 = 0.0f, L2 = 0.0f;
117 const size_t index = (size_t)2 * y *
width;
122 L1 += buf[index + 2*
x];
123 L2 += buf[index + 2*
x + 1];
127 for (
x = 0; (
x <= radius) && ((
x + radius) <
width);
x++)
129 const int np =
x + radius;
133 L1 += buf[index + 2*np];
134 L2 += buf[index + 2*np + 1];
136 scanline[2*
x] = L1 / hits;
137 scanline[2*
x+1] = L2 / hits;
141 for(;
x <= radius &&
x <
width;
x++)
143 scanline[2*
x] = L1 / hits;
144 scanline[2*
x+1] = L2 / hits;
149 const int op =
x - radius - 1;
150 const int np =
x + radius;
151 L1 = L1 - buf[index + 2*op] + buf[index + 2*np];
152 L2 = L2 - buf[index + 2*op + 1] + buf[index + 2*np + 1];
153 scanline[2*
x] = L1 / hits;
154 scanline[2*
x+1] = L2 / hits;
159 const int op =
x - radius - 1;
161 L1 -= buf[index + 2*op];
162 L2 -= buf[index + 2*op + 1];
163 scanline[2*
x] = L1 / hits;
164 scanline[2*
x+1] = L2 / hits;
169 buf[index +
x] = scanline[
x];
182 const float v = values[c];
198 const float *
const restrict values,
float *
const restrict comp)
205 const float t1 =
v - comp[
c];
206 const float t2 = accum[
c] + t1;
207 comp[
c] = (t2 - accum[
c]) - t1;
213 float *
const restrict comp)
218 const float t1 = -
values[
c] - comp[
c];
219 const float t2 = accum[
c] + t1;
220 comp[
c] = (t2 - accum[
c]) - t1;
228 out[
c] = in[
c] / scale;
232static void sub_16wide(
float *
const restrict accum,
const float *
const restrict values)
235 for(
size_t c = 0; c < 16; c++)
236 accum[c] -= values[c];
241static void load_add_16wide(
float *
const restrict
out,
float *
const restrict accum,
const float *
const restrict in)
244 for (
size_t c = 0; c < 16; c++)
246 const float v = in[c];
253static void sub_16wide_Kahan(
float *
const restrict accum,
const float *
const restrict values,
254 float *
const restrict comp)
256 __OMP_SIMD__(aligned(accum,comp : 64) aligned(values : 16))
257 for(
size_t c = 0; c < 16; c++)
259 const float v = -values[c];
261 const float t1 =
v - comp[c];
262 const float t2 = accum[c] + t1;
263 comp[c] = (t2 - accum[c]) - t1;
271 const float *
const restrict in,
float *
const restrict comp)
274 for (
size_t c = 0; c < 16; c++)
276 const float v = in[c];
279 const float t1 =
v - comp[c];
280 const float t2 = accum[c] + t1;
281 comp[c] = (t2 - accum[c]) - t1;
291 for (
size_t c = 0; c < 16; c++)
299 for(
size_t c = 0; c < 16; c++)
300 out[c] = in[c] / scale;
304static void sub_Nwide_Kahan(
const size_t N,
float *
const restrict accum,
const float *
const restrict values,
305 float *
const restrict comp)
308 for(
size_t c = 0; c <
N; c++)
310 const float v = -values[c];
312 const float t1 =
v - comp[c];
313 const float t2 = accum[c] + t1;
314 comp[c] = (t2 - accum[c]) - t1;
322 const float *
const restrict in,
float *
const restrict comp)
325 for (
size_t c = 0; c <
N; c++)
327 const float v = in[c];
330 const float t1 =
v - comp[c];
331 const float t2 = accum[c] + t1;
332 comp[c] = (t2 - accum[c]) - t1;
342 for(
size_t c = 0; c <
N; c++)
343 out[c] = in[c] / scale;
349 float *
const restrict scanlines,
const size_t padded_size)
352 for(
int y = 0; y <
height; y++)
357 const size_t index = (size_t)4 * y *
width;
358 float *
const restrict bufp = buf + index;
363 load_add_4wide(scratch + 4*
x,
L, bufp + 4*
x);
367 for (
x = 0; (
x <= radius) && ((
x + radius) <
width);
x++)
369 const int np =
x + radius;
371 load_add_4wide(scratch + 4*np,
L, bufp + 4*np);
372 store_scaled_4wide(bufp + 4*
x,
L, hits);
376 for(;
x <= radius &&
x <
width;
x++)
378 store_scaled_4wide(bufp + 4*
x,
L, hits);
385 const int op =
x - radius - 1;
386 const int np =
x + radius;
387 sub_4wide(
L, scratch + 4*op);
388 load_add_4wide(scratch + 4*np,
L, bufp + 4*np);
389 store_scaled_4wide(bufp + 4*
x,
L, hits);
394 const int op =
x - radius - 1;
396 sub_4wide(
L, scratch + 4*op);
397 store_scaled_4wide(bufp + 4*
x,
L, hits);
406 const size_t radius,
float *
const restrict scratch)
415 load_add_4wide_Kahan(scratch + 4*
x,
L, buf + 4*
x, comp);
419 for (
x = 0; (
x <= radius) && ((
x + radius) <
width);
x++)
421 const int np =
x + radius;
423 load_add_4wide_Kahan(scratch + 4*np,
L, buf + 4*np, comp);
424 store_scaled_4wide(buf + 4*
x,
L, hits);
428 for(;
x <= radius &&
x <
width;
x++)
430 store_scaled_4wide(buf + 4*
x,
L, hits);
435 const int op =
x - radius - 1;
436 const int np =
x + radius;
437 sub_4wide_Kahan(
L, scratch + 4*op, comp);
438 load_add_4wide_Kahan(scratch + 4*np,
L, buf + 4*np, comp);
439 store_scaled_4wide(buf + 4*
x,
L, hits);
444 const int op =
x - radius - 1;
446 sub_4wide_Kahan(
L, scratch + 4*op, comp);
447 store_scaled_4wide(buf + 4*
x,
L, hits);
455 const size_t radius,
float *
const restrict scratch)
471 for (
x = 0; (
x <= radius) && ((
x + radius) <
width);
x++)
473 const int np =
x + radius;
480 for(;
x <= radius &&
x <
width;
x++)
487 const int op =
x - radius - 1;
488 const int np =
x + radius;
496 const int op =
x - radius - 1;
507 const size_t radius,
float *
const restrict scratch)
515 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
520 for (
size_t y = 0; y <
MIN(radius,
height); y++)
524 const float v = buf[y*
width];
530 for (y = 0; y <= radius && y + radius <
height; y++)
533 const int np = y + radius;
536 const float v = buf[np*
width];
538 scratch[np&mask] =
v;
543 for(; y <= radius && y <
height; y++)
548 for( ; y + radius <
height; y++)
550 const int np = y + radius;
551 const int op = y - radius - 1;
553 L -= scratch[op&mask];
554 const float v = buf[np*
width];
556 scratch[np&mask] =
v;
563 const int op = y - radius - 1;
565 L -= scratch[op&mask];
575 const size_t radius,
float *
const restrict scratch)
583 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
589 for (
size_t y = 0; y <
MIN(radius,
height); y++)
593 const float v = buf[y*
width];
599 for (y = 0; y <= radius && y + radius <
height; y++)
602 const int np = y + radius;
605 const float v = buf[np*
width];
607 scratch[np&mask] =
v;
612 for(; y <= radius && y <
height; y++)
617 for( ; y + radius <
height; y++)
619 const int np = y + radius;
620 const int op = y - radius - 1;
623 const float v = buf[np*
width];
625 scratch[np&mask] =
v;
632 const int op = y - radius - 1;
644 float *
const restrict scratch)
652 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
657 for (
size_t y = 0; y <
MIN(radius,
height); y++)
661 load_add_4wide(scratch + 4*(y&mask),
L, buf + y *
width);
665 for (y = 0; y <= radius && y + radius <
height; y++)
668 const int np = y + radius;
671 load_add_4wide(scratch + 4*(np&mask),
L, buf + np*
width);
672 store_scaled_4wide(buf + y*
width,
L, hits);
676 for(; y <= radius && y <
height; y++)
678 store_scaled_4wide(buf + y*
width,
L, hits);
681 for ( ; y + radius <
height; y++)
683 const int np = y + radius;
684 const int op = y - radius - 1;
686 sub_4wide(
L, scratch + 4*(op&mask));
687 load_add_4wide(scratch + 4*(np&mask),
L, buf + np*
width);
688 store_scaled_4wide(buf + y*
width,
L, hits);
693 const int op = y - radius - 1;
695 sub_4wide(
L, scratch + 4*(op&mask));
696 store_scaled_4wide(buf + y*
width,
L, hits);
704 const size_t radius,
float *
const restrict scratch)
712 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
718 for (
size_t y = 0; y <
MIN(radius,
height); y++)
722 load_add_4wide_Kahan(scratch + 4*(y&mask),
L, buf + y *
width, comp);
726 for (y = 0; y <= radius && y + radius <
height; y++)
729 const int np = y + radius;
732 load_add_4wide_Kahan(scratch + 4*(np&mask),
L, buf + np*
width, comp);
733 store_scaled_4wide(buf + y*
width,
L, hits);
737 for(; y <= radius && y <
height; y++)
739 store_scaled_4wide(buf + y*
width,
L, hits);
742 for ( ; y + radius <
height; y++)
744 const int np = y + radius;
745 const int op = y - radius - 1;
747 sub_4wide_Kahan(
L, scratch + 4*(op&mask), comp);
748 load_add_4wide_Kahan(scratch + 4*(np&mask),
L, buf + np*
width, comp);
749 store_scaled_4wide(buf + y*
width,
L, hits);
754 const int op = y - radius - 1;
756 sub_4wide_Kahan(
L, scratch + 4*(op&mask), comp);
757 store_scaled_4wide(buf + y*
width,
L, hits);
765 const size_t radius,
float *
const restrict scratch)
773 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
778 for (
size_t y = 0; y <
MIN(radius,
height); y++)
786 for (y = 0; y <= radius && y + radius <
height; y++)
789 const int np = y + radius;
797 for(; y <= radius && y <
height; y++)
802 for ( ; y + radius <
height; y++)
804 const int np = y + radius;
805 const int op = y - radius - 1;
815 const int op = y - radius - 1;
827 const size_t radius,
float *
const restrict scratch)
835 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) mask = (mask << 1) | 1;
841 for (
size_t y = 0; y <
MIN(radius,
height); y++)
849 for (y = 0; y <= radius && y + radius <
height; y++)
852 const int np = y + radius;
860 for(; y <= radius && y <
height; y++)
865 for ( ; y + radius <
height; y++)
867 const int np = y + radius;
868 const int op = y - radius - 1;
878 const int op = y - radius - 1;
889 float *
const restrict scanlines,
const size_t padded_size)
903 for( ; col < (
width & ~3); col += 4)
905 for( ; col <
width; col++)
917 size_t eff_height = 2;
918 for(
size_t r = (2*radius+1);
r > 1 ;
r >>= 1) eff_height <<= 1;
925 const unsigned iterations)
936 for(
unsigned iteration = 0; iteration < iterations; iteration++)
948 const unsigned iterations)
959 for(
unsigned iteration = 0; iteration < iterations; iteration++)
978 for (
size_t col = 0; col <
width; col += 16)
981 if (col + 16 <=
width)
989 for( ; col_ < (
width & ~3); col_ += 4)
991 for( ; col_ <
width; col_++)
1002 const unsigned iterations)
1005 for(
unsigned iteration = 0; iteration < iterations; iteration++)
1026 const int radius,
const unsigned iterations)
1033 const size_t Ndim =
MAX(4*
width,16*eff_height);
1038 for (
unsigned iteration = 0; iteration < iterations; iteration++)
1048 const int radius,
const unsigned iterations)
1072 float *
const restrict user_scratch)
1076 float *
const restrict scratch = user_scratch ? user_scratch
1087 float *
const restrict scratch = user_scratch ? user_scratch
1105 size_t channels =
ch & ~BOXFILTER_KAHAN_SUM;
1115 float m = -(FLT_MAX);
1117 for(
int j = 0; j <
n; j++)
1124static inline void box_max_1d(
int N,
const float *
const restrict
x,
float *
const restrict y,
size_t stride_y,
int w)
1127 for(
int i = 0;
i <
N;
i++)
1130 y[
i * stride_y] =
m;
1133 if(
i - w >= 0 &&
x[
i - w] ==
m)
1135 const int start =
i - w + 1;
1148 for (
size_t c = 0; c < 16; c++)
1155 for (
size_t c = 0; c < 16; c++)
1157 m[c] = fmaxf(
m[c], base[c]);
1164 for (
size_t c = 0; c < 16; c++)
1166 const float v = base[c];
1168 m[c] = fmaxf(
m[c],
v);
1176 const int stride,
const int w,
const size_t mask)
1178 float DT_ALIGNED_ARRAY m[16] = { -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1179 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1180 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX),
1181 -(FLT_MAX), -(FLT_MAX), -(FLT_MAX), -(FLT_MAX) };
1182 for(
size_t i = 0;
i <
MIN(w + 1,
N);
i++)
1187 for(
size_t i = 0;
i <
N;
i++)
1197 for(
int j =
i - w + 1; j <
MIN(
i + w + 1,
N); j++)
1203 const size_t n =
i + w + 1;
1224 float *
const restrict scratch =
dt_get_perthread(scratch_buffers,allocsize);
1229 for(
int col = 0; col < (
width & ~15); col += 16)
1231 float *
const restrict scratch =
dt_get_perthread(scratch_buffers,allocsize);
1235 for (
size_t col =
width & ~15 ; col <
width; col++)
1237 float *
const restrict scratch = scratch_buffers;
1262 for(
int j = 0; j <
n; j++)
1269static inline void box_min_1d(
int N,
const float *
x,
float *y,
size_t stride_y,
int w)
1272 for(
int i = 0;
i <
N;
i++)
1274 y[
i * stride_y] =
m;
1275 if(
i - w >= 0 &&
x[
i - w] ==
m)
1277 const int start = (
i - w + 1);
1289 for (
size_t c = 0; c < 16; c++)
1291 m[c] = fminf(
m[c], base[c]);
1298 for (
size_t c = 0; c < 16; c++)
1300 const float v = base[c];
1302 m[c] = fminf(
m[c],
v);
1310 const int stride,
const int w,
const size_t mask)
1313 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
1314 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
1315 FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
1316 for(
size_t i = 0;
i <
MIN(w + 1,
N);
i++)
1321 for(
size_t i = 0;
i <
N;
i++)
1331 for(
int j =
i - w + 1; j <
MIN(
i + w + 1,
N); j++)
1337 const size_t n =
i + w + 1;
1359 float *
const restrict scratch =
dt_get_perthread(scratch_buffers,allocsize);
1364 for(
size_t col = 0; col < (
width & ~15); col += 16)
1366 float *
const restrict scratch =
dt_get_perthread(scratch_buffers,allocsize);
1370 for (
size_t col =
width & ~15 ; col <
width; col++)
1372 float *
const restrict scratch = scratch_buffers;
static __DT_CLONE_TARGETS__ void blur_vertical_16wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void set_16wide(float *const restrict out, const float value)
static void update_max_16wide(float m[16], const float *const restrict base)
static __DT_CLONE_TARGETS__ void blur_vertical_1wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static void load_update_max_16wide(float *const restrict out, float m[16], const float *const restrict base)
#define PREFETCH_NTA(addr)
static void update_min_16wide(float m[16], const float *const restrict base)
static float window_max(const float *x, int n)
static int box_mean_2ch(float *const restrict in, const size_t height, const size_t width, const int radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ int box_max_1ch(float *const buf, const size_t height, const size_t width, const unsigned w)
static __DT_CLONE_TARGETS__ void blur_vertical_16wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void sub_16wide_Kahan(float *const restrict accum, const float *const restrict values, float *const restrict comp)
int dt_box_max(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void blur_vertical_1ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scanlines, const size_t padded_size)
static __DT_CLONE_TARGETS__ int dt_box_mean_4ch_Kahan(float *const buf, const size_t height, const size_t width, const int radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ void blur_horizontal_4ch(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scanlines, const size_t padded_size)
static __DT_CLONE_TARGETS__ void blur_vertical_4wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void blur_horizontal_1ch(float *const restrict buf, const int height, const int width, const int radius, float *const restrict scanlines, const size_t padded_size)
static __DT_CLONE_TARGETS__ void blur_horizontal_2ch(float *const restrict buf, const int height, const int width, const int radius, float *const restrict scanlines, const size_t padded_size)
static float window_min(const float *x, int n)
static __DT_CLONE_TARGETS__ int dt_box_mean_1ch(float *const buf, const size_t height, const size_t width, const size_t radius, const unsigned iterations)
static __DT_CLONE_TARGETS__ int dt_box_mean_4ch(float *const buf, const int height, const int width, const int radius, const unsigned iterations)
#define DT_PREFETCH(addr)
static __DT_CLONE_TARGETS__ void blur_vertical_4wide(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void store_scaled_Nwide(const size_t N, float *const restrict out, const float *const restrict in, const float scale)
static __DT_CLONE_TARGETS__ void sub_Nwide_Kahan(const size_t N, float *const restrict accum, const float *const restrict values, float *const restrict comp)
static __DT_CLONE_TARGETS__ void sub_16wide(float *const restrict accum, const float *const restrict values)
static __DT_CLONE_TARGETS__ void store_16wide(float *const restrict out, const float *const restrict in)
static __DT_CLONE_TARGETS__ int box_mean_vert_1ch_Kahan(float *const buf, const int height, const size_t width, const size_t radius)
int dt_box_mean_horizontal(float *const restrict buf, const size_t width, const int ch, const int radius, float *const restrict user_scratch)
static __DT_CLONE_TARGETS__ void blur_vertical_1wide_Kahan(float *const restrict buf, const size_t height, const size_t width, const size_t radius, float *const restrict scratch)
int dt_box_mean(float *const buf, const size_t height, const size_t width, const int ch, const int radius, const unsigned iterations)
static void box_max_1d(int N, const float *const restrict x, float *const restrict y, size_t stride_y, int w)
static __DT_CLONE_TARGETS__ int box_min_1ch(float *const buf, const size_t height, const size_t width, const int w)
int dt_box_mean_vertical(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void store_scaled_16wide(float *const restrict out, const float *const restrict in, const float scale)
static void load_update_min_16wide(float *const restrict out, float m[16], const float *const restrict base)
static __DT_CLONE_TARGETS__ void blur_horizontal_4ch_Kahan(float *const restrict buf, const size_t width, const size_t radius, float *const restrict scratch)
static __DT_CLONE_TARGETS__ void blur_horizontal_Nch_Kahan(const size_t N, float *const restrict buf, const size_t width, const size_t radius, float *const restrict scratch)
static void box_min_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf, const int stride, const int w, const size_t mask)
static __DT_CLONE_TARGETS__ void load_add_Nwide_Kahan(const size_t N, float *const restrict out, float *const restrict accum, const float *const restrict in, float *const restrict comp)
static void box_max_vert_16wide(const int N, float *const restrict scratch, float *const restrict buf, const int stride, const int w, const size_t mask)
static __DT_CLONE_TARGETS__ void load_add_16wide(float *const restrict out, float *const restrict accum, const float *const restrict in)
static __DT_CLONE_TARGETS__ size_t _compute_effective_height(const size_t height, const size_t radius)
int dt_box_min(float *const buf, const size_t height, const size_t width, const int ch, const int radius)
static __DT_CLONE_TARGETS__ void load_add_16wide_Kahan(float *const restrict out, float *const restrict accum, const float *const restrict in, float *const restrict comp)
static void box_min_1d(int N, const float *x, float *y, size_t stride_y, int w)
#define BOXFILTER_KAHAN_SUM
static const float const float const float min
const dt_colormatrix_t dt_aligned_pixel_t out
#define __OMP_SIMD__(...)
#define dt_pixelpipe_cache_alloc_align_float_cache(pixels, id)
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Enable aggressive floating-point arithmetic optimizations, in denormals handling. Set through user pr...
#define dt_pixelpipe_cache_free_align(mem)
#define __DT_CLONE_TARGETS__
#define dt_get_perthread(buf, padsize)
#define for_four_channels(_var,...)
#define __OMP_PARALLEL_FOR__(...)
static const dt_aligned_pixel_simd_t value
#define dt_pixelpipe_cache_alloc_perthread_float(n, padded_size)
#define dt_unreachable_codepath()
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
float *const restrict const size_t const size_t ch
static float Kahan_sum(const float m, float *const __restrict__ c, const float add)
float dt_aligned_pixel_t[4]