54#define SLICE_HEIGHT 60
78#if !(defined(__x86_64__) || defined(__i386__))
79# define _mm_prefetch(where,hint)
82static inline float gh(
const float f)
87static inline int sign(
int a)
89 return (a > 0) - (a < 0);
93static int scatter(
const float scale,
const float scattering,
const int index1,
const int index2)
99 const int abs_i1 = abs(index1);
100 const int abs_i2 = abs(index2);
101 return scale * ((abs_i1 * abs_i1 * abs_i1 + 7.0 * abs_i1 * sqrt(abs_i2)) *
sign(index1) * scattering / 6.0 + index1);
108 const int search_radius = params->search_radius;
109 const float scale = params->scale;
110 const float scattering = params->scattering;
111 int decimate = params->decimate;
113 int n_patches = (2 * search_radius + 1) * (2 * search_radius + 1);
115 n_patches = (n_patches + 1) / 2;
116 *num_patches = n_patches ;
124 for (
int row_index = -search_radius; row_index <= search_radius; row_index++)
126 for (
int col_index = -search_radius; col_index <= search_radius; col_index++)
128 if (decimate && (++decimate & 1))
continue;
129 int r =
scatter(scale,scattering,row_index,col_index);
130 int c =
scatter(scale,scattering,col_index,row_index);
131 patches[patch_num].
rows =
r;
132 patches[patch_num].
cols = c;
133 if (
r > shift) shift =
r;
134 else if (-
r > shift) shift = -
r;
135 if (c > shift) shift = c;
136 else if (-c > shift) shift = -c;
137 patches[patch_num].
offset = (
r * stride + c * 4);
149 const int width = 2 * radius + 1;
159 const float diff = pix1[
i] - pix2[
i];
160 sum[
i] = diff * diff * norm[
i];
162 return sum[0] + sum[1] + sum[2];
167 const float*
const pix3,
const float* pix4,
173 const float diff1 = pix1[
i] - pix2[
i];
174 const float diff2 = pix3[
i] - pix4[
i];
175 sum[
i] = (diff1 * diff1 - diff2 * diff2) * norm[
i];
177 return sum[0] + sum[1] + sum[2];
180#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
181static inline float get_pixdiff(
const float *
const col_sums,
const int radius,
const int row,
const int col)
183 const int stride = 2*(radius+1);
184 const int modrow = 1 + (
row + stride) % stride;
185 const float *
const pixrow = col_sums + (
SLICE_WIDTH + 2*radius)*modrow;
190#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
191static inline void set_pixdiff(
float *
const col_sums,
const int radius,
const int row,
const int col,
194 const int stride = 2*(radius+1);
195 const int modrow = 1 + (
row + stride) % stride;
196 float *
const pixrow = col_sums + (
SLICE_WIDTH + 2*radius)*modrow;
201#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
202static inline float pixdiff_column_sum(
const float *
const col_sums,
const int radius,
const int col)
205 float sum = col_sums[stride+col];
206 for (
int i = 2;
i <= (2*radius+1) ;
i++)
207 sum += col_sums[
i*stride+col];
213 const int row,
const int chunk_left,
const int chunk_right,
214 const int height,
const int width,
const int stride,
215 const int radius,
const float *
const norm)
223 const int scol = patch->
cols;
224 const int col_min = chunk_left -
MIN(radius,
MIN(chunk_left,chunk_left+scol));
225 const int col_max = chunk_right +
MIN(radius,
MIN(
width-chunk_right,
width-(chunk_right+scol)));
227 const int srow = patch->
rows;
230 for (
int col = chunk_left-radius-1; col <
MIN(col_min,chunk_right+radius); col++)
234 for(
int i =
row-radius;
i <=
row+radius;
i++)
235 set_pixdiff(col_sums,radius,
i,col,0.0f);
238 for (
int col = col_min; col < col_max; col++)
241 for (
int r = rmin;
r <= rmax;
r++)
243 const float *pixel = in +
r*stride + 4*col;
246 set_pixdiff(col_sums,radius,
r,col,diff);
253 for (
int col =
MAX(col_min,col_max); col < chunk_right + radius; col++)
257 for(
int i =
row-radius;
i <=
row+radius;
i++)
258 set_pixdiff(col_sums,radius,
i,col,0.0f);
272 for (
int incr = 1; incr < 10; incr++)
277 else if (plus_rem > best)
285 else if (minus_rem > best)
300 int rem =
width % sl_width;
305 rem =
width % sl_width;
320 const dt_aligned_pixel_t invert = { 1.0f - params->luma, 1.0f - params->chroma, 1.0f - params->chroma, 0.0f };
321 const bool skip_blend = (params->luma == 1.0 && params->chroma == 1.0);
328 const size_t stride = 4 * roi_in->
width;
333 const int radius = params->patch_radius;
334#if defined(CACHE_PIXDIFFS)
335 const size_t scratch_size = (2*radius+3)*(
SLICE_WIDTH + 2*radius + 1);
337 const size_t scratch_size =
SLICE_WIDTH + 2*radius + 1 + 48;
339 size_t padded_scratch_size;
346 for (
int chunk_top = 0 ; chunk_top < roi_out->
height; chunk_top += chk_height)
348 for (
int chunk_left = 0; chunk_left < roi_out->
width; chunk_left += chk_width)
352 float *
const restrict tmpbuf =
dt_get_perthread(scratch_buf, padded_scratch_size);
353 float *
const col_sums = tmpbuf + (radius+1) - chunk_left;
355 const int chunk_bot =
MIN(chunk_top + chk_height, roi_out->
height);
357 const int chunk_right =
MIN(chunk_left + chk_width, roi_out->
width);
359 for (
int i = chunk_top;
i < chunk_bot;
i++)
361 memset(outbuf + 4*(
i*roi_out->
width+chunk_left),
'\0',
sizeof(
float) * 4 * (chunk_right-chunk_left));
364 for (
int p = 0;
p < num_patches;
p++)
370 const int row_min =
MAX(chunk_top,
MAX(0,-patch->
rows));
374 const int row_top =
MAX(row_min,
MAX(radius,radius-patch->
rows));
378 const int scol = patch->
cols;
379 const int col_min =
MAX(chunk_left,-scol);
380 const int col_max =
MIN(chunk_right,roi_out->
width - scol);
383 stride,radius,params->norm);
384 for (
int row = row_min;
row < row_max;
row++)
387 float distortion = 0.0;
388 for (
int i = col_min - radius;
i <
MIN(col_min+radius, col_max);
i++)
390 distortion += col_sums[
i];
393 const float *in = inbuf + stride *
row;
394 float *
const out = outbuf + (size_t)4 *
width *
row;
396 const float sharpness = params->sharpness;
397 if (params->center_weight < 0)
400 for (
int col = col_min; col < col_max; col++)
402 distortion += (col_sums[col+radius] - col_sums[col-radius-1]);
403 const float wt =
gh(distortion * sharpness);
404 const float *
const inpx = in+4*col;
408 out[4*col+c] += pixel[c] * wt;
416 for (
int col = col_min; col < col_max; col++)
418 distortion += (col_sums[col+radius] - col_sums[col-radius-1]);
420 / (1.0f + params->center_weight);
421 const float wt =
gh(fmaxf(0.0f, dissimilarity * sharpness - 2.0f));
422 const float *
const inpx = in + 4*col;
426 out[4*col+c] += pixel[c] * wt;
431 const int pcol_min = chunk_left -
MIN(radius,
MIN(chunk_left,chunk_left+scol));
432 const int pcol_max = chunk_right +
MIN(radius,
MIN(
width-chunk_right,
width-(chunk_right+scol)));
433 if (
row <
MIN(row_top, row_bot))
436 const float *bot_row = inbuf + (
row+1+radius)*stride;
437 for (
int col = pcol_min; col < pcol_max; col++)
439 const float *
const bot_px = bot_row + 4*col;
443 set_pixdiff(col_sums,radius,
row+radius+1,col,diff);
445 col_sums[col] += diff;
449 else if (
row < row_bot)
451#ifndef CACHE_PIXDIFFS
452 const float *
const top_row = inbuf + (
row-radius)*stride ;
454 const float *
const bot_row = inbuf + (
row+1+radius)*stride ;
456 for (
int col = pcol_min; col < pcol_max; col++)
459 const float *
const bot_px = bot_row + 4*col;
461 col_sums[col] += diff - get_pixdiff(col_sums,radius,
row-radius,col);
463 set_pixdiff(col_sums,radius,
row+1+radius,col,diff);
465 const float *
const top_px = top_row + 4*col;
466 const float *
const bot_px = bot_row + 4*col;
469 col_sums[col] += diff;
474 else if (
row >= row_top &&
row + 1 < row_max)
477#ifndef CACHE_PIXDIFFS
478 const float *top_row = inbuf + (
row-radius)*stride;
480 for (
int col = pcol_min; col < pcol_max; col++)
483 col_sums[col] -= get_pixdiff(col_sums,radius,
row-radius,col);
485 const float *
const top_px = top_row + 4*col;
495 for (
int row = chunk_top;
row < chunk_bot;
row++)
497 float *
const out = outbuf + 4 *
row * roi_out->
width;
498 for (
int col = chunk_left; col < chunk_right; col++)
502 out[4*col+c] /=
out[4*col+3];
510 for (
int row = chunk_top;
row < chunk_bot;
row++)
512 const float *in = inbuf +
row * stride;
514 for (
int col = chunk_left; col < chunk_right; col++)
518 out[4*col+c] = (in[4*col+c] * invert[c]) + (
out[4*col+c] /
out[4*col+3] *
weight[c]);
541 unsigned int current = *
state;
542 unsigned int next = (current >=
max - 1 ? 0 : current + 1);
552 const int horiz_kernel,
const int vert_kernel)
556 .cellsize =
sizeof(float), .overhead = 0,
557 .sizex = 1 << 16, .sizey = 1 };
563 .cellsize =
sizeof(float), .overhead = 0,
564 .sizex = 1, .sizey = 1 << 16 };
587 const int P,
const int q[2],
const int height,
const int width,
588 const int bwidth,
const int hblocksize)
591 const size_t local[3] = { hblocksize, 1, 1 };
606 cl_mem dev_out,
const int q[2],
const int height,
const int width,
607 const size_t sizes[3])
621 cl_mem dev_in, cl_mem dev_out,
const dt_iop_roi_t *
const roi_in)
625 const int P = params->patch_radius;
626 const float nL2 = params->norm[0] * params->norm[0];
627 const float nC2 = params->norm[1] * params->norm[1];
630 const size_t stride = 4 * roi_in->
width;
637 unsigned int state = 0;
641 if(buckets[
k] == NULL)
goto error;
646 get_blocksizes(&hblocksize, &vblocksize,
P, devid, params->kernel_horiz, params->kernel_vert);
650 if(err != CL_SUCCESS)
goto error;
656 for(
int p = 0;
p < num_patches;
p++)
659 int q[2] = { patch->
rows, patch->
cols };
671 if(err != CL_SUCCESS)
break;
676 if(err != CL_SUCCESS)
break;
680 const size_t local[3] = { 1, vblocksize, 1 };
681 const float sharpness = params->sharpness;
692 if(err != CL_SUCCESS)
break;
696 if(err != CL_SUCCESS)
break;
715 cl_mem dev_in, cl_mem dev_out,
const dt_iop_roi_t *
const roi_in)
719 const int P = params->patch_radius;
720 const float norm = params->sharpness;
723 const size_t stride = 4 * roi_in->
width;
730 unsigned int state = 0;
734 if(buckets[
k] == NULL)
goto error;
739 get_blocksizes(&hblocksize, &vblocksize,
P, devid, params->kernel_horiz, params->kernel_vert);
743 if(err != CL_SUCCESS)
goto error;
749 for(
int p = 0;
p < num_patches;
p++)
752 int q[2] = { patch->
rows, patch->
cols };
762 if(err != CL_SUCCESS)
break;
767 if(err != CL_SUCCESS)
break;
771 const size_t local[3] = { 1, vblocksize, 1 };
772 const float central_pixel_weight = params->center_weight;
785 if(err != CL_SUCCESS)
break;
789 if(err != CL_SUCCESS)
break;
static void error(char *msg)
static const dt_aligned_pixel_simd_t const dt_adaptation_t const float p
const dt_aligned_pixel_t f
const dt_colormatrix_t dt_aligned_pixel_t out
#define for_each_channel(_var,...)
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
static const dt_aligned_pixel_simd_t sign
#define dt_pixelpipe_cache_free_align(mem)
#define __DT_CLONE_TARGETS__
#define dt_get_perthread(buf, padsize)
#define for_four_channels(_var,...)
#define __OMP_PARALLEL_FOR__(...)
#define dt_pixelpipe_cache_alloc_perthread_float(n, padded_size)
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
static void weight(const float *c1, const float *c2, const float sharpen, dt_aligned_pixel_t weight)
void dt_iop_nap(int32_t usec)
static float kernel(const float *x, const float *y)
float *const restrict const size_t k
static float dt_fast_mexp2f(const float x)
static int bucket_next(unsigned int *state, unsigned int max)
__DT_CLONE_TARGETS__ void nlmeans_denoise(const float *const inbuf, float *const outbuf, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const dt_nlmeans_param_t *const params)
static float gh(const float f)
static void init_column_sums(float *const col_sums, const patch_t *const patch, const float *const in, const int row, const int chunk_left, const int chunk_right, const int height, const int width, const int stride, const int radius, const float *const norm)
static float pixel_difference(const float *const pix1, const float *pix2, const dt_aligned_pixel_t norm)
static int compute_slice_height(const int height)
static void get_blocksizes(int *h, int *v, const int radius, const int devid, const int horiz_kernel, const int vert_kernel)
static cl_int nlmeans_cl_accu(const int devid, const int kernel, cl_mem dev_in, cl_mem dev_U4_tt, cl_mem dev_out, const int q[2], const int height, const int width, const size_t sizes[3])
static int scatter(const float scale, const float scattering, const int index1, const int index2)
int nlmeans_denoise_cl(const dt_nlmeans_param_t *const params, const int devid, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
int nlmeans_denoiseprofile_cl(const dt_nlmeans_param_t *const params, const int devid, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
#define _mm_prefetch(where, hint)
static struct patch_t * define_patches(const dt_nlmeans_param_t *const params, const int stride, int *num_patches, int *max_shift)
static float compute_center_pixel_norm(const float center_weight, const int radius)
static cl_int nlmeans_cl_horiz(const int devid, const int kernel, cl_mem dev_U4, cl_mem dev_U4_t, const int P, const int q[2], const int height, const int width, const int bwidth, const int hblocksize)
static int compute_slice_width(const int width)
static float diff_of_pixels_diff(const float *const pix1, const float *pix2, const float *const pix3, const float *pix4, const dt_aligned_pixel_t norm)
static cl_int nlmeans_cl_init(const int devid, const int kernel, cl_mem dev_out, const int height, const int width)
float dt_aligned_pixel_t[4]
int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
void * dt_opencl_alloc_device_buffer(const int devid, const size_t size)
int dt_opencl_micro_nap(const int devid)
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size, const void *arg)
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes, const size_t *local)
void dt_opencl_release_mem_object(cl_mem mem)
const float uint32_t state[4]
int32_t num_openmp_threads
Region of interest passed through the pixelpipe.