Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
nlmeans_core.c
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2020 Hubert Kowalski.
4 Copyright (C) 2020-2021 Pascal Obry.
5 Copyright (C) 2020-2021 Ralf Brown.
6 Copyright (C) 2021 Roman Khatko.
7 Copyright (C) 2022 Hanno Schwalm.
8 Copyright (C) 2022 Martin Bařinka.
9 Copyright (C) 2024 Alynx Zhou.
10 Copyright (C) 2025-2026 Aurélien PIERRE.
11
12 darktable is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 darktable is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with darktable. If not, see <http://www.gnu.org/licenses/>.
24*/
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28#include "common/math.h"
29#include "common/opencl.h"
30#include "control/control.h"
31#include "develop/imageop.h"
33#include "develop/tiling.h"
34#include "iop/iop_api.h"
35#include "common/nlmeans_core.h"
36#include <stdbool.h>
37#include <stdlib.h>
38
39// to avoid accumulation of rounding errors, we should do a full recomputation of the patch differences
40// every so many rows of the image. We'll also use that interval as the target maximum chunk size for
41// parallelization
42// in addition, to keep the working set within L1 cache, we need to limit the width of the chunks that
43// are processed. The working set uses (2*radius+3)*(ceil(width/4)+1) + (2*radius+3)*(ceil(width/16)+1)
44// 64-byte cache lines. The typical x86 CPU has an L1 cache containing 256 lines, and we'll need to
45// reserve a few for variables in the stack frame and the like. That results in a maximal width of
46// 96 pixels for radius=2, 72 pixels for radius=3, and 56 for radius=4 (default patch radius is 2)
47
48// lower values for SLICE_HEIGHT reduce the accumulation of rounding errors at the cost of more computation;
49// to avoid excessive overhead, width*height should be at least 2000. Keeping width*height below 10000 or so
50// will greatly improve L2/L3 cache hit rates and help with scaling beyond 16 threads. Note that the values
51// specified here are targets and may be adjusted slightly to avoid having extremely small chunks at the
52// right/bottom edge of the images (width will only be reduced, height could be either reduced or increased)
53#define SLICE_WIDTH 72
54#define SLICE_HEIGHT 60
55
56// try to speed up processing by caching pixel differences? If cached, they won't need to be computed a
57// second time when sliding the patch window away from the pixel. Testing shows it to be slower than
58// recomputing for both scalar and SSE on a Threadripper due to increased memory writes; this may differ on
59// architectures with slower multiplication.
60//#define CACHE_PIXDIFFS
61//#define CACHE_PIXDIFFS_SSE
62
63// number of intermediate buffers used by OpenCL code path. If you change this, you must also change
64// the definition in src/iop/nlmeans.c and src/iop/denoiseprofile.c
65#define NUM_BUCKETS 4
66
67// a structure to collect together the items which define the location of a patch relative to the pixel
68// being denoised
69struct patch_t
70{
71 short rows; // number of rows difference
72 short cols; // number of columns difference
73 int offset; // array distance between corresponding pixels
74};
75typedef struct patch_t patch_t;
76
77// avoid cluttering the scalar codepath with #ifdefs by hiding the dependency on SSE2
78#if !(defined(__x86_64__) || defined(__i386__))
79# define _mm_prefetch(where,hint)
80#endif
81
82static inline float gh(const float f)
83{
84 return dt_fast_mexp2f(f) ;
85}
86
87static inline int sign(int a)
88{
89 return (a > 0) - (a < 0);
90}
91
92// map the basic row/column offset into a possible much larger offset based on a user parameter
93static int scatter(const float scale, const float scattering, const int index1, const int index2)
94{
95 // this formula is designed to
96 // - produce an identity mapping when scattering = 0
97 // - avoiding duplicate patches provided that 0 <= scattering <= 1
98 // - avoiding grid artifacts by trying to take patches on various rows and columns
99 const int abs_i1 = abs(index1);
100 const int abs_i2 = abs(index2);
101 return scale * ((abs_i1 * abs_i1 * abs_i1 + 7.0 * abs_i1 * sqrt(abs_i2)) * sign(index1) * scattering / 6.0 + index1);
102}
103
104// allocate and fill an array of patch definitions
105static struct patch_t*
106define_patches(const dt_nlmeans_param_t *const params, const int stride, int *num_patches, int *max_shift)
107{
108 const int search_radius = params->search_radius;
109 const float scale = params->scale;
110 const float scattering = params->scattering;
111 int decimate = params->decimate;
112 // determine how many patches we have
113 int n_patches = (2 * search_radius + 1) * (2 * search_radius + 1);
114 if (decimate)
115 n_patches = (n_patches + 1) / 2;
116 *num_patches = n_patches ;
117 // allocate a cacheline-aligned buffer
118 struct patch_t *patches = dt_pixelpipe_cache_alloc_align_cache(sizeof(struct patch_t) * n_patches, 0);
119 if(IS_NULL_PTR(patches)) return NULL;
120
121 // set up the patch offsets
122 int patch_num = 0;
123 int shift = 0;
124 for (int row_index = -search_radius; row_index <= search_radius; row_index++)
125 {
126 for (int col_index = -search_radius; col_index <= search_radius; col_index++)
127 {
128 if (decimate && (++decimate & 1)) continue; // skip every other patch
129 int r = scatter(scale,scattering,row_index,col_index);
130 int c = scatter(scale,scattering,col_index,row_index);
131 patches[patch_num].rows = r;
132 patches[patch_num].cols = c;
133 if (r > shift) shift = r;
134 else if (-r > shift) shift = -r;
135 if (c > shift) shift = c;
136 else if (-c > shift) shift = -c;
137 patches[patch_num].offset = (r * stride + c * 4);
138 patch_num++;
139 }
140 }
141 *max_shift = shift;
142 return patches;
143}
144
145static float compute_center_pixel_norm(const float center_weight, const int radius)
146{
147 // scale the central pixel's contribution by the size of the patch so that the center-weight
148 // setting can be independent of patch size
149 const int width = 2 * radius + 1;
150 return center_weight * width * width;
151}
152
153// compute the channel-normed squared difference between two pixels
154static inline float pixel_difference(const float* const pix1, const float* pix2, const dt_aligned_pixel_t norm)
155{
156 dt_aligned_pixel_t sum = { 0.f, 0.f, 0.f, 0.f };
157 for_each_channel(i, aligned(sum:16))
158 {
159 const float diff = pix1[i] - pix2[i];
160 sum[i] = diff * diff * norm[i];
161 }
162 return sum[0] + sum[1] + sum[2];
163}
164
165// optimized: pixel_difference(pix1, pix2, norm) - pixel_difference(pix3, pix4, norm)
166static inline float diff_of_pixels_diff(const float* const pix1, const float* pix2,
167 const float* const pix3, const float* pix4,
168 const dt_aligned_pixel_t norm)
169{
170 dt_aligned_pixel_t sum = { 0.f, 0.f, 0.f, 0.f };
171 for_each_channel(i, aligned(sum:16))
172 {
173 const float diff1 = pix1[i] - pix2[i];
174 const float diff2 = pix3[i] - pix4[i];
175 sum[i] = (diff1 * diff1 - diff2 * diff2) * norm[i];
176 }
177 return sum[0] + sum[1] + sum[2];
178}
179
180#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
181static inline float get_pixdiff(const float *const col_sums, const int radius, const int row, const int col)
182{
183 const int stride = 2*(radius+1);
184 const int modrow = 1 + (row + stride) % stride;
185 const float *const pixrow = col_sums + (SLICE_WIDTH + 2*radius)*modrow;
186 return pixrow[col];
187}
188#endif
189
190#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
191static inline void set_pixdiff(float *const col_sums, const int radius, const int row, const int col,
192 const float diff)
193{
194 const int stride = 2*(radius+1);
195 const int modrow = 1 + (row + stride) % stride;
196 float *const pixrow = col_sums + (SLICE_WIDTH + 2*radius)*modrow;
197 pixrow[col] = diff;
198}
199#endif
200
201#if defined(CACHE_PIXDIFFS) || defined(CACHE_PIXDIFFS_SSE)
202static inline float pixdiff_column_sum(const float *const col_sums, const int radius, const int col)
203{
204 const int stride = SLICE_WIDTH + 2*radius;
205 float sum = col_sums[stride+col];
206 for (int i = 2; i <= (2*radius+1) ; i++)
207 sum += col_sums[i*stride+col];
208 return sum;
209}
210#endif
211
212static void init_column_sums(float *const col_sums, const patch_t *const patch, const float *const in,
213 const int row, const int chunk_left, const int chunk_right,
214 const int height, const int width, const int stride,
215 const int radius, const float *const norm)
216{
217 // Compute column sums from scratch. Needed for the very first row, and at intervals thereafter
218 // to limit accumulation of rounding errors
219
220 // figure out which columns can possibly contribute to patches whose centers lie within the RoI
221 // we can go up to 'radius' columns beyond the current chunk provided that the patch does not
222 // lie in the same direction from the pixel being denoised and that we're still in the RoI
223 const int scol = patch->cols;
224 const int col_min = chunk_left - MIN(radius,MIN(chunk_left,chunk_left+scol));
225 const int col_max = chunk_right + MIN(radius,MIN(width-chunk_right,width-(chunk_right+scol)));
226 // adjust bounds if the patch extends past top/bottom of RoI
227 const int srow = patch->rows;
228 const int rmin = row - MIN(radius,MIN(row,row+srow));
229 const int rmax = row + MIN(radius,MIN(height-1-row,height-1-(row+srow)));
230 for (int col = chunk_left-radius-1; col < MIN(col_min,chunk_right+radius); col++)
231 {
232 col_sums[col] = 0;
233#ifdef CACHE_PIXDIFFS
234 for(int i = row-radius; i <= row+radius; i++)
235 set_pixdiff(col_sums,radius,i,col,0.0f);
236#endif
237 }
238 for (int col = col_min; col < col_max; col++)
239 {
240 float sum = 0;
241 for (int r = rmin; r <= rmax; r++)
242 {
243 const float *pixel = in + r*stride + 4*col;
244 const float diff = pixel_difference(pixel,pixel+patch->offset,norm);
245#ifdef CACHE_PIXDIFFS
246 set_pixdiff(col_sums,radius,r,col,diff);
247#endif
248 sum += diff;
249 }
250 col_sums[col] = sum;
251 }
252 // clear out any columns where the patch column would be outside the RoI, as well as our overrun area
253 for (int col = MAX(col_min,col_max); col < chunk_right + radius; col++)
254 {
255 col_sums[col] = 0;
256#ifdef CACHE_PIXDIFFS
257 for(int i = row-radius; i <= row+radius; i++)
258 set_pixdiff(col_sums,radius,i,col,0.0f);
259#endif
260 }
261 return;
262}
263
264// determine the height of the horizontal slice each thread will process
265static int compute_slice_height(const int height)
266{
267 if (height % SLICE_HEIGHT == 0)
268 return SLICE_HEIGHT;
269 // try to make the heights of the chunks as even as possible
270 int best = height % SLICE_HEIGHT;
271 int best_incr = 0;
272 for (int incr = 1; incr < 10; incr++)
273 {
274 int plus_rem = height % (SLICE_HEIGHT + incr);
275 if (plus_rem == 0)
276 return SLICE_HEIGHT + incr;
277 else if (plus_rem > best)
278 {
279 best_incr = +incr;
280 best = plus_rem;
281 }
282 int minus_rem = height % (SLICE_HEIGHT - incr);
283 if (minus_rem == 0)
284 return SLICE_HEIGHT - incr;
285 else if (minus_rem > best)
286 {
287 best_incr = -incr;
288 best = minus_rem;
289 }
290 }
291 return SLICE_HEIGHT + best_incr;
292}
293
294// determine the width of the horizontal slice each thread will process
295static int compute_slice_width(const int width)
296{
297 int sl_width = SLICE_WIDTH;
298 // if there's just a sliver left over for the last column, see whether slicing a few pixels off each gives
299 // us a more nearly full final chunk
300 int rem = width % sl_width;
301 if (rem < SLICE_WIDTH/2 && (width % (sl_width-4)) > rem)
302 {
303 sl_width -= 4;
304 // check whether removing an additional sliver improves things even more
305 rem = width % sl_width;
306 if (rem < SLICE_WIDTH/2 && (width % (sl_width-4)) > rem)
307 sl_width -= 4;
308 }
309 return sl_width;
310}
311
313void nlmeans_denoise(const float *const inbuf, float *const outbuf,
314 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
315 const dt_nlmeans_param_t *const params)
316{
317 // define the factors for applying blending between the original image and the denoised version
318 // if running in RGB space, 'luma' should equal 'chroma'
319 const dt_aligned_pixel_t weight = { params->luma, params->chroma, params->chroma, 1.0f };
320 const dt_aligned_pixel_t invert = { 1.0f - params->luma, 1.0f - params->chroma, 1.0f - params->chroma, 0.0f };
321 const bool skip_blend = (params->luma == 1.0 && params->chroma == 1.0);
322
323 // define the normalization to convert central pixel differences into central pixel weights
324 const float cp_norm = compute_center_pixel_norm(params->center_weight,params->patch_radius);
325 const dt_aligned_pixel_t center_norm = { cp_norm, cp_norm, cp_norm, 1.0f };
326
327 // define the patches to be compared when denoising a pixel
328 const size_t stride = 4 * roi_in->width;
329 int num_patches;
330 int max_shift;
331 struct patch_t* patches = define_patches(params,stride,&num_patches,&max_shift);
332 // allocate scratch space, including an overrun area on each end so we don't need a boundary check on every access
333 const int radius = params->patch_radius;
334#if defined(CACHE_PIXDIFFS)
335 const size_t scratch_size = (2*radius+3)*(SLICE_WIDTH + 2*radius + 1);
336#else
337 const size_t scratch_size = SLICE_WIDTH + 2*radius + 1 + 48; // getting false sharing without the +48....
338#endif /* CACHE_PIXDIFFS */
339 size_t padded_scratch_size;
340 float *const restrict scratch_buf = dt_pixelpipe_cache_alloc_perthread_float(scratch_size, &padded_scratch_size);
341 if(IS_NULL_PTR(scratch_buf)) return;
342
343 const int chk_height = compute_slice_height(roi_out->height);
344 const int chk_width = compute_slice_width(roi_out->width);
345 __OMP_PARALLEL_FOR__(num_threads(darktable.num_openmp_threads) collapse(2))
346 for (int chunk_top = 0 ; chunk_top < roi_out->height; chunk_top += chk_height)
347 {
348 for (int chunk_left = 0; chunk_left < roi_out->width; chunk_left += chk_width)
349 {
350 // locate our scratch space within the big buffer allocated above
351 // we'll offset by chunk_left so that we don't have to subtract on every access
352 float *const restrict tmpbuf = dt_get_perthread(scratch_buf, padded_scratch_size);
353 float *const col_sums = tmpbuf + (radius+1) - chunk_left;
354 // determine which horizontal slice of the image to process
355 const int chunk_bot = MIN(chunk_top + chk_height, roi_out->height);
356 // determine which vertical slice of the image to process
357 const int chunk_right = MIN(chunk_left + chk_width, roi_out->width);
358 // we want to incrementally sum results (especially weights in col[3]), so clear the output buffer to zeros
359 for (int i = chunk_top; i < chunk_bot; i++)
360 {
361 memset(outbuf + 4*(i*roi_out->width+chunk_left), '\0', sizeof(float) * 4 * (chunk_right-chunk_left));
362 }
363 // cycle through all of the patches over our slice of the image
364 for (int p = 0; p < num_patches; p++)
365 {
366 // retrieve info about the current patch
367 const patch_t *patch = &patches[p];
368 // skip any rows where the patch center would be above top of RoI or below bottom of RoI
369 const int height = roi_out->height;
370 const int row_min = MAX(chunk_top,MAX(0,-patch->rows));
371 const int row_max = MIN(chunk_bot,height - MAX(0,patch->rows));
372 // figure out which rows at top and bottom result in patches extending outside the RoI, even though the
373 // center pixel is inside
374 const int row_top = MAX(row_min,MAX(radius,radius-patch->rows));
375 const int row_bot = MIN(row_max,height-1-MAX(radius,radius+patch->rows));
376 // skip any columns where the patch center would be to the left or the right of the RoI
377 const int width = roi_out->width;
378 const int scol = patch->cols;
379 const int col_min = MAX(chunk_left,-scol);
380 const int col_max = MIN(chunk_right,roi_out->width - scol);
381
382 init_column_sums(col_sums,patch,inbuf,row_min,chunk_left,chunk_right,height,width,
383 stride,radius,params->norm);
384 for (int row = row_min; row < row_max; row++)
385 {
386 // add up the initial columns of the sliding window of total patch distortion
387 float distortion = 0.0;
388 for (int i = col_min - radius; i < MIN(col_min+radius, col_max); i++)
389 {
390 distortion += col_sums[i];
391 }
392 // now proceed down the current row of the image
393 const float *in = inbuf + stride * row;
394 float *const out = outbuf + (size_t)4 * width * row;
395 const int offset = patch->offset;
396 const float sharpness = params->sharpness;
397 if (params->center_weight < 0)
398 {
399 // computation as used by denoise(non-local) iop
400 for (int col = col_min; col < col_max; col++)
401 {
402 distortion += (col_sums[col+radius] - col_sums[col-radius-1]);
403 const float wt = gh(distortion * sharpness);
404 const float *const inpx = in+4*col;
405 const dt_aligned_pixel_t pixel = { inpx[offset], inpx[offset+1], inpx[offset+2], 1.0f };
406 for_four_channels(c,aligned(pixel,out:16))
407 {
408 out[4*col+c] += pixel[c] * wt;
409 }
410 _mm_prefetch(in+4*col+offset+stride,_MM_HINT_T0); // try to ensure next row is ready in time
411 }
412 }
413 else
414 {
415 // computation as used by denoiseprofiled iop with non-local means
416 for (int col = col_min; col < col_max; col++)
417 {
418 distortion += (col_sums[col+radius] - col_sums[col-radius-1]);
419 const float dissimilarity = (distortion + pixel_difference(in+4*col,in+4*col+offset,center_norm))
420 / (1.0f + params->center_weight);
421 const float wt = gh(fmaxf(0.0f, dissimilarity * sharpness - 2.0f));
422 const float *const inpx = in + 4*col;
423 const dt_aligned_pixel_t pixel = { inpx[offset], inpx[offset+1], inpx[offset+2], 1.0f };
424 for_four_channels(c,aligned(pixel,out:16))
425 {
426 out[4*col+c] += pixel[c] * wt;
427 }
428 _mm_prefetch(in+4*col+offset+stride,_MM_HINT_T0); // try to ensure next row is ready in time
429 }
430 }
431 const int pcol_min = chunk_left - MIN(radius,MIN(chunk_left,chunk_left+scol));
432 const int pcol_max = chunk_right + MIN(radius,MIN(width-chunk_right,width-(chunk_right+scol)));
433 if (row < MIN(row_top, row_bot))
434 {
435 // top edge of patch was above top of RoI, so it had a value of zero; just add in the new row
436 const float *bot_row = inbuf + (row+1+radius)*stride;
437 for (int col = pcol_min; col < pcol_max; col++)
438 {
439 const float *const bot_px = bot_row + 4*col;
440 const float diff = pixel_difference(bot_px,bot_px+offset,params->norm);
441 _mm_prefetch(bot_px+stride, _MM_HINT_T0);
442#ifdef CACHE_PIXDIFFS
443 set_pixdiff(col_sums,radius,row+radius+1,col,diff);
444#endif
445 col_sums[col] += diff;
446 _mm_prefetch(bot_px+offset+stride, _MM_HINT_T0);
447 }
448 }
449 else if (row < row_bot)
450 {
451#ifndef CACHE_PIXDIFFS
452 const float *const top_row = inbuf + (row-radius)*stride /* +(2*radius+1)*stride*/ ;
453#endif /* !CACHE_PIXDIFFS */
454 const float *const bot_row = inbuf + (row+1+radius)*stride ;
455 // both prior and new positions are entirely within the RoI, so subtract the old row and add the new one
456 for (int col = pcol_min; col < pcol_max; col++)
457 {
458#ifdef CACHE_PIXDIFFS
459 const float *const bot_px = bot_row + 4*col;
460 const float diff = pixel_difference(bot_px,bot_px+offset,params->norm);
461 col_sums[col] += diff - get_pixdiff(col_sums,radius,row-radius,col);
462 _mm_prefetch(bot_px+stride, _MM_HINT_T0);
463 set_pixdiff(col_sums,radius,row+1+radius,col,diff);
464#else
465 const float *const top_px = top_row + 4*col;
466 const float *const bot_px = bot_row + 4*col;
467 const float diff = diff_of_pixels_diff(bot_px,bot_px+offset,top_px,top_px+offset,params->norm);
468 _mm_prefetch(bot_px+stride, _MM_HINT_T0);
469 col_sums[col] += diff;
470#endif /* CACHE_PIXDIFFS */
471 _mm_prefetch(bot_px+offset+stride, _MM_HINT_T0);
472 }
473 }
474 else if (row >= row_top && row + 1 < row_max) // don't bother updating if last iteration
475 {
476 // new row of the patch is below the bottom of RoI, so its value is zero; just subtract the old row
477#ifndef CACHE_PIXDIFFS
478 const float *top_row = inbuf + (row-radius)*stride;
479#endif /* !CACHE_PIXDIFFS */
480 for (int col = pcol_min; col < pcol_max; col++)
481 {
482#ifdef CACHE_PIXDIFFS
483 col_sums[col] -= get_pixdiff(col_sums,radius,row-radius,col);
484#else
485 const float *const top_px = top_row + 4*col;
486 col_sums[col] -= pixel_difference(top_px,top_px+offset,params->norm);
487#endif /* CACHE_PIXDIFFS */
488 }
489 }
490 }
491 }
492 if (skip_blend)
493 {
494 // normalize the pixels
495 for (int row = chunk_top; row < chunk_bot; row++)
496 {
497 float *const out = outbuf + 4 * row * roi_out->width;
498 for (int col = chunk_left; col < chunk_right; col++)
499 {
500 for_each_channel(c,aligned(out:16))
501 {
502 out[4*col+c] /= out[4*col+3];
503 }
504 }
505 }
506 }
507 else
508 {
509 // normalize and apply chroma/luma blending
510 for (int row = chunk_top; row < chunk_bot; row++)
511 {
512 const float *in = inbuf + row * stride;
513 float *out = outbuf + row * 4 * roi_out->width;
514 for (int col = chunk_left; col < chunk_right; col++)
515 {
516 for_each_channel(c,aligned(in,out,weight,invert:16))
517 {
518 out[4*col+c] = (in[4*col+c] * invert[c]) + (out[4*col+c] / out[4*col+3] * weight[c]);
519 }
520 }
521 }
522 }
523 }
524 }
525
526 // clean up: free the work space
529 return;
530}
531
532/**************************************************************/
533/**************************************************************/
534/* Everything from here to end of file is WIP!! */
535/**************************************************************/
536/**************************************************************/
537
538#ifdef HAVE_OPENCL
539static int bucket_next(unsigned int *state, unsigned int max)
540{
541 unsigned int current = *state;
542 unsigned int next = (current >= max - 1 ? 0 : current + 1);
543
544 *state = next;
545
546 return next;
547}
548#endif /* HAVE_OPENCL */
549
550#ifdef HAVE_OPENCL
551static void get_blocksizes(int *h, int *v, const int radius, const int devid,
552 const int horiz_kernel, const int vert_kernel)
553{
555 = (dt_opencl_local_buffer_t){ .xoffset = 2 * radius, .xfactor = 1, .yoffset = 0, .yfactor = 1,
556 .cellsize = sizeof(float), .overhead = 0,
557 .sizex = 1 << 16, .sizey = 1 };
558
559 *h = dt_opencl_local_buffer_opt(devid, horiz_kernel, &hlocopt) ? hlocopt.sizex : 1;
560
562 = (dt_opencl_local_buffer_t){ .xoffset = 1, .xfactor = 1, .yoffset = 2 * radius, .yfactor = 1,
563 .cellsize = sizeof(float), .overhead = 0,
564 .sizex = 1, .sizey = 1 << 16 };
565
566 *v = dt_opencl_local_buffer_opt(devid, vert_kernel, &vlocopt) ? vlocopt.sizey : 1;
567 return;
568}
569#endif /* HAVE_OPENCL */
570
571#ifdef HAVE_OPENCL
572// zero output pixels, as we will be accumulating them one patch at a time
573static inline cl_int nlmeans_cl_init(const int devid, const int kernel, cl_mem dev_out, const int height,
574 const int width)
575{
576 const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
577 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_out);
578 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(int), (void *)&width);
579 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&height);
580 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
581}
582#endif /* HAVE_OPENCL */
583
584#ifdef HAVE_OPENCL
585// horizontal pass, add together columns of each patch
586static inline cl_int nlmeans_cl_horiz(const int devid, const int kernel, cl_mem dev_U4, cl_mem dev_U4_t,
587 const int P, const int q[2], const int height, const int width,
588 const int bwidth, const int hblocksize)
589{
590 const size_t sizesl[3] = { bwidth, ROUNDUPDHT(height, devid), 1 };
591 const size_t local[3] = { hblocksize, 1, 1 };
592 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_U4);
593 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_U4_t);
594 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
595 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
596 dt_opencl_set_kernel_arg(devid, kernel, 4, 2 * sizeof(int), (void *)&q);
597 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(int), (void *)&P);
598 dt_opencl_set_kernel_arg(devid, kernel, 6, (hblocksize + 2 * P) * sizeof(float), NULL);
599 return dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizesl, local);
600}
601#endif /* HAVE_OPENCL */
602
603#ifdef HAVE_OPENCL
604// add difference-weighted proportion of patch-center pixel to output pixel
605static inline cl_int nlmeans_cl_accu(const int devid, const int kernel, cl_mem dev_in, cl_mem dev_U4_tt,
606 cl_mem dev_out, const int q[2], const int height, const int width,
607 const size_t sizes[3])
608{
609 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
610 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
611 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(cl_mem), (void *)&dev_U4_tt);
612 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&width);
613 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(int), (void *)&height);
614 dt_opencl_set_kernel_arg(devid, kernel, 5, 2 * sizeof(int), (void *)&q);
615 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
616}
617#endif /* HAVE_OPENCL */
618
619#ifdef HAVE_OPENCL
620int nlmeans_denoise_cl(const dt_nlmeans_param_t *const params, const int devid,
621 cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
622{
623 const int width = roi_in->width;
624 const int height = roi_in->height;
625 const int P = params->patch_radius;
626 const float nL2 = params->norm[0] * params->norm[0];
627 const float nC2 = params->norm[1] * params->norm[1];
628
629 // define the patches to be compared when denoising a pixel
630 const size_t stride = 4 * roi_in->width;
631 int num_patches;
632 int max_shift;
633 struct patch_t* patches = define_patches(params,stride,&num_patches,&max_shift);
634
635 cl_int err = -999;
636 cl_mem buckets[NUM_BUCKETS] = { NULL };
637 unsigned int state = 0;
638 for(int k = 0; k < NUM_BUCKETS; k++)
639 {
640 buckets[k] = dt_opencl_alloc_device_buffer(devid, sizeof(float) * width * height);
641 if(buckets[k] == NULL) goto error;
642 }
643
644 int hblocksize;
645 int vblocksize;
646 get_blocksizes(&hblocksize, &vblocksize, P, devid, params->kernel_horiz, params->kernel_vert);
647
648 // zero the output buffer into which we will be accumulating results
649 err = nlmeans_cl_init(devid,params->kernel_init,dev_out,height,width);
650 if(err != CL_SUCCESS) goto error;
651
652 const size_t bwidth = ROUNDUP(width, hblocksize);
653 const size_t bheight = ROUNDUP(height, vblocksize);
654 const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
655
656 for(int p = 0; p < num_patches; p++)
657 {
658 const patch_t *patch = &patches[p];
659 int q[2] = { patch->rows, patch->cols };
660
661 // compute channel-normed squared differences between input pixels and shifted (by q) pixels
662 cl_mem dev_U4 = buckets[bucket_next(&state, NUM_BUCKETS)];
663 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 0, sizeof(cl_mem), (void *)&dev_in);
664 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 1, sizeof(cl_mem), (void *)&dev_U4);
665 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 2, sizeof(int), (void *)&width);
666 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 3, sizeof(int), (void *)&height);
667 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 4, 2 * sizeof(int), (void *)&q);
668 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 5, sizeof(float), (void *)&nL2);
669 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 6, sizeof(float), (void *)&nC2);
670 err = dt_opencl_enqueue_kernel_2d(devid, params->kernel_dist, sizes);
671 if(err != CL_SUCCESS) break;
672
673 // add up individual columns
674 cl_mem dev_U4_t = buckets[bucket_next(&state, NUM_BUCKETS)];
675 err = nlmeans_cl_horiz(devid,params->kernel_horiz,dev_U4,dev_U4_t,P,q,height,width,bwidth,hblocksize);
676 if(err != CL_SUCCESS) break;
677
678 // add together the column sums and compute the weighting of the current patch for each pixel
679 const size_t sizesl[3] = { ROUNDUPDWD(width, devid), bheight, 1 };
680 const size_t local[3] = { 1, vblocksize, 1 };
681 const float sharpness = params->sharpness;
682 cl_mem dev_U4_tt = buckets[bucket_next(&state, NUM_BUCKETS)];
683 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 0, sizeof(cl_mem), (void *)&dev_U4_t);
684 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 1, sizeof(cl_mem), (void *)&dev_U4_tt);
685 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 2, sizeof(int), (void *)&width);
686 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 3, sizeof(int), (void *)&height);
687 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 4, 2 * sizeof(int), (void *)&q);
688 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 5, sizeof(int), (void *)&P);
689 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 6, sizeof(float), (void *)&sharpness);
690 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 7, (vblocksize + 2 * P) * sizeof(float), NULL);
691 err = dt_opencl_enqueue_kernel_2d_with_local(devid, params->kernel_vert, sizesl, local);
692 if(err != CL_SUCCESS) break;
693
694 // add weighted proportion of patch's center pixel to output pixel
695 err = nlmeans_cl_accu(devid,params->kernel_accu,dev_in,dev_U4_tt,dev_out,q,height,width,sizes);
696 if(err != CL_SUCCESS) break;
697
698 // indirectly give gpu some air to breathe (and to do display related stuff)
700 }
701
702error:
703 // clean up and return status
705 for(int k = 0; k < NUM_BUCKETS; k++)
706 {
708 }
709 return err;
710}
711#endif /* HAVE_OPENCL */
712
713#ifdef HAVE_OPENCL
714int nlmeans_denoiseprofile_cl(const dt_nlmeans_param_t *const params, const int devid,
715 cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
716{
717 const int width = roi_in->width;
718 const int height = roi_in->height;
719 const int P = params->patch_radius;
720 const float norm = params->sharpness;
721
722 // define the patches to be compared when denoising a pixel
723 const size_t stride = 4 * roi_in->width;
724 int num_patches;
725 int max_shift;
726 struct patch_t* patches = define_patches(params,stride,&num_patches,&max_shift);
727
728 cl_int err = -999;
729 cl_mem buckets[NUM_BUCKETS] = { NULL };
730 unsigned int state = 0;
731 for(int k = 0; k < NUM_BUCKETS; k++)
732 {
733 buckets[k] = dt_opencl_alloc_device_buffer(devid, sizeof(float) * width * height);
734 if(buckets[k] == NULL) goto error;
735 }
736
737 int hblocksize;
738 int vblocksize;
739 get_blocksizes(&hblocksize, &vblocksize, P, devid, params->kernel_horiz, params->kernel_vert);
740
741 // zero the output buffer into which we will be accumulating results
742 err = nlmeans_cl_init(devid,params->kernel_init,dev_out,height,width);
743 if(err != CL_SUCCESS) goto error;
744
745 const size_t bwidth = ROUNDUP(width, hblocksize);
746 const size_t bheight = ROUNDUP(height, vblocksize);
747 const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
748
749 for(int p = 0; p < num_patches; p++)
750 {
751 const patch_t *patch = &patches[p];
752 int q[2] = { patch->rows, patch->cols };
753
754 // compute squared differences between input pixels and shifted (by q) pixels
755 cl_mem dev_U4 = buckets[bucket_next(&state, NUM_BUCKETS)];
756 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 0, sizeof(cl_mem), (void *)&dev_in);
757 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 1, sizeof(cl_mem), (void *)&dev_U4);
758 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 2, sizeof(int), (void *)&width);
759 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 3, sizeof(int), (void *)&height);
760 dt_opencl_set_kernel_arg(devid, params->kernel_dist, 4, 2 * sizeof(int), (void *)&q);
761 err = dt_opencl_enqueue_kernel_2d(devid, params->kernel_dist, sizes);
762 if(err != CL_SUCCESS) break;
763
764 // add up individual columns
765 cl_mem dev_U4_t = buckets[bucket_next(&state, NUM_BUCKETS)];
766 err = nlmeans_cl_horiz(devid,params->kernel_horiz,dev_U4,dev_U4_t,P,q,height,width,bwidth,hblocksize);
767 if(err != CL_SUCCESS) break;
768
769 // add together the column sums and compute the weighting of the current patch for each pixel
770 const size_t sizesl[3] = { ROUNDUPDWD(width, devid), bheight, 1 };
771 const size_t local[3] = { 1, vblocksize, 1 };
772 const float central_pixel_weight = params->center_weight;
773 cl_mem dev_U4_tt = buckets[bucket_next(&state, NUM_BUCKETS)];
774 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 0, sizeof(cl_mem), (void *)&dev_U4_t);
775 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 1, sizeof(cl_mem), (void *)&dev_U4_tt);
776 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 2, sizeof(int), (void *)&width);
777 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 3, sizeof(int), (void *)&height);
778 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 4, 2 * sizeof(int), (void *)&q);
779 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 5, sizeof(int), (void *)&P);
780 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 6, sizeof(float), (void *)&norm);
781 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 7, (vblocksize + 2 * P) * sizeof(float), NULL);
782 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 8, sizeof(float), (void *)&central_pixel_weight);
783 dt_opencl_set_kernel_arg(devid, params->kernel_vert, 9, sizeof(cl_mem), ((void *)&dev_U4));
784 err = dt_opencl_enqueue_kernel_2d_with_local(devid, params->kernel_vert, sizesl, local);
785 if(err != CL_SUCCESS) break;
786
787 // add weighted proportion of patch's center pixel to output pixel
788 err = nlmeans_cl_accu(devid,params->kernel_accu,dev_in,dev_U4_tt,dev_out,q,height,width,sizes);
789 if(err != CL_SUCCESS) break;
790 }
791
792error:
793 // clean up and return status
795 for(int k = 0; k < NUM_BUCKETS; k++)
796 {
798 }
799 return err;
800}
801#endif /* HAVE_OPENCL */
802// clang-format off
803// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
804// vim: shiftwidth=2 expandtab tabstop=2 cindent
805// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
806// clang-format on
static void error(char *msg)
Definition ashift_lsd.c:202
int width
Definition bilateral.h:1
int height
Definition bilateral.h:1
static const dt_aligned_pixel_simd_t const dt_adaptation_t const float p
const dt_aligned_pixel_t f
const float max
const dt_colormatrix_t dt_aligned_pixel_t out
static const int row
#define P(V, params)
darktable_t darktable
Definition darktable.c:181
#define for_each_channel(_var,...)
Definition darktable.h:662
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
Definition darktable.h:433
static const dt_aligned_pixel_simd_t sign
Definition darktable.h:551
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453
#define __DT_CLONE_TARGETS__
Definition darktable.h:367
#define dt_get_perthread(buf, padsize)
Definition darktable.h:1035
#define for_four_channels(_var,...)
Definition darktable.h:664
#define __OMP_PARALLEL_FOR__(...)
Definition darktable.h:258
#define dt_pixelpipe_cache_alloc_perthread_float(n, padded_size)
Definition darktable.h:1030
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281
static void weight(const float *c1, const float *c2, const float sharpen, dt_aligned_pixel_t weight)
Definition eaw.c:30
void dt_iop_nap(int32_t usec)
Definition imageop.c:2899
static float kernel(const float *x, const float *y)
const float v
float *const restrict const size_t k
static float dt_fast_mexp2f(const float x)
Definition math.h:288
static int bucket_next(unsigned int *state, unsigned int max)
__DT_CLONE_TARGETS__ void nlmeans_denoise(const float *const inbuf, float *const outbuf, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const dt_nlmeans_param_t *const params)
static float gh(const float f)
static void init_column_sums(float *const col_sums, const patch_t *const patch, const float *const in, const int row, const int chunk_left, const int chunk_right, const int height, const int width, const int stride, const int radius, const float *const norm)
#define NUM_BUCKETS
static float pixel_difference(const float *const pix1, const float *pix2, const dt_aligned_pixel_t norm)
static int compute_slice_height(const int height)
static void get_blocksizes(int *h, int *v, const int radius, const int devid, const int horiz_kernel, const int vert_kernel)
static cl_int nlmeans_cl_accu(const int devid, const int kernel, cl_mem dev_in, cl_mem dev_U4_tt, cl_mem dev_out, const int q[2], const int height, const int width, const size_t sizes[3])
static int scatter(const float scale, const float scattering, const int index1, const int index2)
#define SLICE_HEIGHT
int nlmeans_denoise_cl(const dt_nlmeans_param_t *const params, const int devid, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
int nlmeans_denoiseprofile_cl(const dt_nlmeans_param_t *const params, const int devid, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in)
#define SLICE_WIDTH
#define _mm_prefetch(where, hint)
static struct patch_t * define_patches(const dt_nlmeans_param_t *const params, const int stride, int *num_patches, int *max_shift)
static float compute_center_pixel_norm(const float center_weight, const int radius)
static cl_int nlmeans_cl_horiz(const int devid, const int kernel, cl_mem dev_U4, cl_mem dev_U4_t, const int P, const int q[2], const int height, const int width, const int bwidth, const int hblocksize)
static int compute_slice_width(const int width)
static float diff_of_pixels_diff(const float *const pix1, const float *pix2, const float *const pix3, const float *pix4, const dt_aligned_pixel_t norm)
static cl_int nlmeans_cl_init(const int devid, const int kernel, cl_mem dev_out, const int height, const int width)
float dt_aligned_pixel_t[4]
int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
Definition opencl.c:3156
int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
Definition opencl.c:2136
void * dt_opencl_alloc_device_buffer(const int devid, const size_t size)
Definition opencl.c:2544
int dt_opencl_micro_nap(const int devid)
Definition opencl.c:177
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size, const void *arg)
Definition opencl.c:2127
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes, const size_t *local)
Definition opencl.c:2142
void dt_opencl_release_mem_object(cl_mem mem)
Definition opencl.c:2383
#define ROUNDUP(a, n)
Definition opencl.h:78
#define ROUNDUPDHT(a, b)
Definition opencl.h:82
#define ROUNDUPDWD(a, b)
Definition opencl.h:81
const float uint32_t state[4]
const float r
int32_t num_openmp_threads
Definition darktable.h:758
Region of interest passed through the pixelpipe.
Definition imageop.h:72
short cols
short rows
#define MIN(a, b)
Definition thinplate.c:32
#define MAX(a, b)
Definition thinplate.c:29