Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
interpolation.c
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2012 Christian Tellefsen.
4 Copyright (C) 2012 Edouard Gomez.
5 Copyright (C) 2012 Jérémy Rosen.
6 Copyright (C) 2012 Richard Wonka.
7 Copyright (C) 2012-2016, 2019 Tobias Ellinghaus.
8 Copyright (C) 2012, 2014-2017 Ulrich Pegelow.
9 Copyright (C) 2013 Simon Spannagel.
10 Copyright (C) 2014-2016 Roman Lebedev.
11 Copyright (C) 2017-2018 luzpaz.
12 Copyright (C) 2019 Andreas Schneider.
13 Copyright (C) 2019, 2021, 2024-2026 Aurélien PIERRE.
14 Copyright (C) 2020-2021 Pascal Obry.
15 Copyright (C) 2020-2021 Ralf Brown.
16 Copyright (C) 2020-2021 Roman Khatko.
17 Copyright (C) 2021-2022 Hanno Schwalm.
18 Copyright (C) 2022 Martin Bařinka.
19 Copyright (C) 2024 Alynx Zhou.
20
21 darktable is free software: you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation, either version 3 of the License, or
24 (at your option) any later version.
25
26 darktable is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 GNU General Public License for more details.
30
31 You should have received a copy of the GNU General Public License
32 along with darktable. If not, see <http://www.gnu.org/licenses/>.
33*/
34
36#include "common/darktable.h"
37#include "common/math.h"
38#include "control/conf.h"
39
40#include <assert.h>
41#include <glib.h>
42#include <inttypes.h>
43#include <stddef.h>
44#include <stdint.h>
45
48{
49 BORDER_REPLICATE, // aaaa|abcdefg|gggg
50 BORDER_WRAP, // defg|abcdefg|abcd
51 BORDER_MIRROR, // edcb|abcdefg|fedc
52 BORDER_CLAMP // ....|abcdefg|....
53};
54
55/* Supporting them all might be overkill, let the compiler trim all
56 * unnecessary modes in clip for resampling codepath*/
57#define RESAMPLING_BORDER_MODE BORDER_REPLICATE
58
59/* Supporting them all might be overkill, let the compiler trim all
60 * unnecessary modes in interpolation codepath */
61#define INTERPOLATION_BORDER_MODE BORDER_MIRROR
62
63// Defines the maximum kernel half length
64// !! Make sure to sync this with the filter array !!
65#define MAX_HALF_FILTER_WIDTH 3
66
67// Add *verbose* (like one msg per pixel out) debug message to stderr
68#define DEBUG_PRINT_VERBOSE 0
69
70/* --------------------------------------------------------------------------
71 * Debug helpers
72 * ------------------------------------------------------------------------*/
73
74
75/* --------------------------------------------------------------------------
76 * Generic helpers
77 * ------------------------------------------------------------------------*/
78
83static inline __attribute__((always_inline)) ssize_t _clip(ssize_t i,
84 const ssize_t min,
85 const ssize_t max,
86 enum border_mode mode)
87{
88 switch(mode)
89 {
91 if(i < min)
92 {
93 i = min;
94 }
95 else if(i > max)
96 {
97 i = max;
98 }
99 break;
100 case BORDER_MIRROR:
101 if(i < min)
102 {
103 // i == min - 1 --> min + 1
104 // i == min - 2 --> min + 2, etc.
105 // but as min == 0 in all current cases, this really optimizes to i = -i
106 i = min + (min - i);
107 }
108 else if(i > max)
109 {
110 // i == max + 1 --> max - 1
111 // i == max + 2 --> max - 2, etc.
112 i = max - (i - max);
113 }
114 break;
115 case BORDER_WRAP:
116 if(i < min)
117 {
118 i = 1 + max - (min - i);
119 }
120 else if(i > max)
121 {
122 i = min + (i - max) - 1;
123 }
124 break;
125 case BORDER_CLAMP:
126 if(i < min || i > max)
127 {
128 /* Should not be used as is, we prevent -1 usage, filtering the taps
129 * we clip the sample indexes for. So understand this function is
130 * specific to its caller. */
131 i = -1;
132 }
133 break;
134 }
135
136 return i;
137}
138
139static inline __attribute__((always_inline)) void _prepare_tap_boundaries(int *tap_first,
140 int *tap_last,
141 const enum border_mode mode,
142 const int filterwidth,
143 const int t,
144 const int max)
145{
146 /* Check lower bound pixel index and skip as many pixels as necessary to
147 * fall into range */
148 *tap_first = 0;
149 if(mode == BORDER_CLAMP && t < 0)
150 {
151 *tap_first = -t;
152 }
153
154 // Same for upper bound pixel
155 *tap_last = filterwidth;
156 if(mode == BORDER_CLAMP && t + filterwidth >= max)
157 {
158 *tap_last = max - t;
159 }
160}
161
162/* --------------------------------------------------------------------------
163 * Interpolation kernels
164 * ------------------------------------------------------------------------*/
165
166/* --------------------------------------------------------------------------
167 * Bilinear interpolation
168 * ------------------------------------------------------------------------*/
169
170static float _maketaps_bilinear(float *taps,
171 const size_t num_taps,
172 const float width,
173 const float first_tap,
174 const float interval)
175{
176 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
177 const dt_aligned_pixel_simd_t interval_v = dt_simd_set1(interval);
178 const dt_aligned_pixel_simd_t iter = dt_simd_set1(4.0f * interval);
179 dt_aligned_pixel_simd_t vt = dt_simd_set1(first_tap) + bootstrap * interval_v;
180
181 const int runs = (num_taps + 3) / 4;
182
183 for(size_t i = 0; i < runs; i++)
184 {
185 dt_store_simd_aligned(taps + 4 * i, dt_simd_set1(1.0f) - dt_simd_abs(vt));
186 vt += iter;
187 }
188 return 1.0f; //kernel norm is 1.0f by construction
189}
190
191/* --------------------------------------------------------------------------
192 * Bicubic interpolation
193 * ------------------------------------------------------------------------*/
194
195static float _maketaps_bicubic(float *taps,
196 const size_t num_taps,
197 const float width,
198 const float first_tap,
199 const float interval)
200{
201 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
202 const dt_aligned_pixel_simd_t half = dt_simd_set1(0.5f);
203 const dt_aligned_pixel_simd_t two = dt_simd_set1(2.0f);
204 const dt_aligned_pixel_simd_t three = dt_simd_set1(3.0f);
205 const dt_aligned_pixel_simd_t four = dt_simd_set1(4.0f);
206 const dt_aligned_pixel_simd_t five = dt_simd_set1(5.0f);
207 const dt_aligned_pixel_simd_t eight = dt_simd_set1(8.0f);
208 const dt_aligned_pixel_simd_t interval_v = dt_simd_set1(interval);
209 const dt_aligned_pixel_simd_t iter = dt_simd_set1(4.0f * interval);
210 dt_aligned_pixel_simd_t vt = dt_simd_set1(first_tap) + bootstrap * interval_v;
211
212 const int runs = (num_taps + 3) / 4;
213
214 for(size_t i = 0; i < runs; i++)
215 {
216 const dt_aligned_pixel_simd_t vt_abs = dt_simd_abs(vt);
217 const dt_aligned_pixel_simd_t t2 = vt * vt;
218 const dt_aligned_pixel_simd_t t5 = five * vt_abs;
219 const dt_aligned_pixel_simd_t r12 = (vt_abs * (t5 - eight - t2) + four) * half;
220 const dt_aligned_pixel_simd_t r01 = ((three * t2 - t5) * vt_abs + two) * half;
221 dt_aligned_pixel_simd_t taps4 = r12;
223 taps4[c] = (vt_abs[c] <= 1.0f) ? r01[c] : r12[c];
224 dt_store_simd_aligned(taps + 4 * i, taps4);
225 vt += iter;
226 }
227 return 1.0f; //kernel norm is 1.0f by construction
228}
229
230/* --------------------------------------------------------------------------
231 * Mitchell-Netravali interpolation (B = C = 1/3)
232 *
233 * A separable cubic from the Mitchell-Netravali (B,C) family (SIGGRAPH 1988).
234 * B = C = 1/3 is the classic general-purpose reconstruction filter: it trades a
235 * hair of sharpness for drastically reduced ringing versus interpolating cubics
236 * (Catmull-Rom) and windowed-sinc (Lanczos). Its negative excursion is tiny
237 * (~3% vs Lanczos3's much larger overshoot), so it is effectively halo-free on
238 * photographic content and never blows alpha/colour far out of range at edges.
239 *
240 * Piecewise weights (already divided by 6), support [-2, 2]:
241 * |t| < 1 : (7/6)|t|^3 - 2|t|^2 + 8/9
242 * 1<=|t|<2: -(7/18)|t|^3 + 2|t|^2 - (10/3)|t| + 16/9
243 * It is a partition of unity (taps sum to 1 on the integer grid), so the
244 * upsampling norm is 1 by construction like bilinear/bicubic; the downsampling
245 * path renormalizes from the summed taps separately.
246 * ------------------------------------------------------------------------*/
247
248static float _maketaps_mitchell(float *taps,
249 const size_t num_taps,
250 const float width,
251 const float first_tap,
252 const float interval)
253{
254 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
255 const dt_aligned_pixel_simd_t c7_6 = dt_simd_set1(7.0f / 6.0f);
256 const dt_aligned_pixel_simd_t c2 = dt_simd_set1(2.0f);
257 const dt_aligned_pixel_simd_t c8_9 = dt_simd_set1(8.0f / 9.0f);
258 const dt_aligned_pixel_simd_t c7_18 = dt_simd_set1(7.0f / 18.0f);
259 const dt_aligned_pixel_simd_t c10_3 = dt_simd_set1(10.0f / 3.0f);
260 const dt_aligned_pixel_simd_t c16_9 = dt_simd_set1(16.0f / 9.0f);
261 const dt_aligned_pixel_simd_t interval_v = dt_simd_set1(interval);
262 const dt_aligned_pixel_simd_t iter = dt_simd_set1(4.0f * interval);
263 dt_aligned_pixel_simd_t vt = dt_simd_set1(first_tap) + bootstrap * interval_v;
264
265 const int runs = (num_taps + 3) / 4;
266
267 for(size_t i = 0; i < runs; i++)
268 {
269 const dt_aligned_pixel_simd_t a = dt_simd_abs(vt);
270 const dt_aligned_pixel_simd_t a2 = a * a;
271 const dt_aligned_pixel_simd_t a3 = a2 * a;
272 // inner lobe (|t| < 1) and outer lobe (1 <= |t| < 2)
273 const dt_aligned_pixel_simd_t r01 = c7_6 * a3 - c2 * a2 + c8_9;
274 const dt_aligned_pixel_simd_t r12 = c2 * a2 - c7_18 * a3 - c10_3 * a + c16_9;
275 dt_aligned_pixel_simd_t taps4 = r12;
277 taps4[c] = (a[c] <= 1.0f) ? r01[c] : r12[c];
278 dt_store_simd_aligned(taps + 4 * i, taps4);
279 vt += iter;
280 }
281 return 1.0f; // kernel norm is 1.0f by construction (partition of unity)
282}
283
284/* --------------------------------------------------------------------------
285 * All our known interpolators
286 * ------------------------------------------------------------------------*/
287
288/* !!! !!! !!!
289 * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
290 * of this filter list. Otherwise bad things will happen
291 * !!! !!! !!!
292 */
293static const struct dt_interpolation dt_interpolator[] = {
295 .name = "bilinear",
296 .width = 1,
297 .maketaps = &_maketaps_bilinear,
298 },
300 .name = "bicubic",
301 .width = 2,
302 .maketaps = &_maketaps_bicubic,
303 },
305 .name = "mitchell",
306 .width = 2,
307 .maketaps = &_maketaps_mitchell,
308 },
309};
310
311/* --------------------------------------------------------------------------
312 * Kernel utility methods
313 * ------------------------------------------------------------------------*/
314
315static inline __attribute__((always_inline)) float _compute_upsampling_kernel(const struct dt_interpolation *itor,
316 float *kernel,
317 int *first,
318 float t)
319{
320 // find first pixel contributing to the filter's kernel. We need
321 // floorf() because a simple cast to int truncates toward zero,
322 // yielding an incorrect result for the slightly-negative positions
323 // that can occur at the top and left edges when doing perspective
324 // correction
325 int f = (int)floorf(t) - itor->width + 1;
326 if(first)
327 {
328 *first = f;
329 }
330
331 /* Find closest integer position and then offset that to match first
332 * filtered sample position */
333 t = t - (float)f;
334
335 // compute the taps and return the kernel norm
336 return itor->maketaps(kernel, 2*itor->width, itor->width, t, -1.0f);
337}
338
349static inline void _compute_downsampling_kernel(const struct dt_interpolation *itor,
350 int *taps,
351 int *first,
352 float *kernel,
353 float *norm,
354 const float outoinratio,
355 const int xout)
356{
357 // Keep this at hand
358 const float w = (float)itor->width;
359
360 /* Compute the phase difference between output pixel and its
361 * input corresponding input pixel */
362 const float xin = ceil_fast(((float)xout - w) / outoinratio);
363 if(first)
364 {
365 *first = (int)xin;
366 }
367
368 // Compute first interpolator parameter
369 float t = xin * outoinratio - (float)xout;
370
371 // Compute all filter taps
372 int num_taps = *taps = (int)((w - t) / outoinratio);
373 itor->maketaps(kernel, num_taps, itor->width, t, outoinratio);
374 // compute the kernel norm if requested
375 if (norm)
376 {
377 float n = 0.0f;
378 for(size_t i = 0; i < num_taps; i++)
379 n += kernel[i];
380 *norm = n;
381 }
382}
383
384/* --------------------------------------------------------------------------
385 * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
386 * ------------------------------------------------------------------------*/
387
388#define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
389
392 const float *in,
393 const float x,
394 const float y,
395 const int width,
396 const int height,
397 const int samplestride,
398 const int linestride)
399{
400 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
401
402 float DT_ALIGNED_ARRAY kernelh[MAX_KERNEL_REQ];
403 float DT_ALIGNED_ARRAY kernelv[MAX_KERNEL_REQ];
404
405 // Compute both horizontal and vertical kernels
406 float normh = _compute_upsampling_kernel(itor, kernelh, NULL, x);
407 float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);
408
409 int ix = (int)x;
410 int iy = (int)y;
411
412 /* Now 2 cases, the pixel + filter width goes outside the image
413 * in that case we have to use index clipping to keep all reads
414 * in the input image (slow path) or we are sure it won't fall
415 * outside and can do more simple code */
416 float r;
417 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
418 && iy < (height - itor->width))
419 {
420 // Inside image boundary case
421
422 // Go to top left pixel
423 in = (float *)in + linestride * iy + ix * samplestride;
424 in = in - (itor->width - 1) * (samplestride + linestride);
425
426 // Apply the kernel
427 float s = 0.f;
428 for(int i = 0; i < 2 * itor->width; i++)
429 {
430 float h = 0.0f;
431 for(int j = 0; j < 2 * itor->width; j++)
432 {
433 h += kernelh[j] * in[j * samplestride];
434 }
435 s += kernelv[i] * h;
436 in += linestride;
437 }
438 r = fmaxf(0.0f, s / (normh * normv));
439 }
440 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
441 {
442 // At least a valid coordinate
443
444 // Point to the upper left pixel index wise
445 iy -= itor->width - 1;
446 ix -= itor->width - 1;
447
448 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
449 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
450
451 int xtap_first;
452 int xtap_last;
453 _prepare_tap_boundaries(&xtap_first, &xtap_last,
454 bordermode, 2 * itor->width, ix, width);
455
456 int ytap_first;
457 int ytap_last;
458 _prepare_tap_boundaries(&ytap_first, &ytap_last,
459 bordermode, 2 * itor->width, iy, height);
460
461 // Apply the kernel
462 float s = 0.f;
463 for(ssize_t i = ytap_first; i < ytap_last; i++)
464 {
465 const ssize_t clip_y = _clip(iy + i, 0, height - 1, bordermode);
466 float h = 0.0f;
467 for(ssize_t j = xtap_first; j < xtap_last; j++)
468 {
469 const ssize_t clip_x = _clip(ix + j, 0, width - 1, bordermode);
470 const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
471 h += kernelh[j] * ipixel[0];
472 }
473 s += kernelv[i] * h;
474 }
475
476 r = fmaxf(0.0f, s / (normh * normv));
477 }
478 else
479 {
480 // invalid coordinate
481 r = 0.0f;
482 }
483 return r;
484}
485
486/* --------------------------------------------------------------------------
487 * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
488 * ------------------------------------------------------------------------*/
489
492 const float *in,
493 float *out,
494 const float x,
495 const float y,
496 const int width,
497 const int height,
498 const int linestride)
499{
500 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
501
502 // Quite a bit of space for kernels
503 float DT_ALIGNED_ARRAY kernelh[MAX_KERNEL_REQ];
504 float DT_ALIGNED_ARRAY kernelv[MAX_KERNEL_REQ];
505
506 // Compute both horizontal and vertical kernels
507 float normh = _compute_upsampling_kernel(itor, kernelh, NULL, x);
508 float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);
509
510 // Precompute the inverse of the filter norm for later use
511 const float oonorm = (1.f / (normh * normv));
512
513 /* Now 2 cases, the pixel + filter width goes outside the image
514 * in that case we have to use index clipping to keep all reads
515 * in the input image (slow path) or we are sure it won't fall
516 * outside and can do more simple code */
517 int ix = (int)x;
518 int iy = (int)y;
519
520 if(ix >= (itor->width - 1)
521 && iy >= (itor->width - 1)
522 && ix < (width - itor->width)
523 && iy < (height - itor->width))
524 {
525 // Inside image boundary case
526
527 // Go to top left pixel
528 in = (float *)in + linestride * iy + ix * 4;
529 in = in - (itor->width - 1) * (4 + linestride);
530
531 const size_t itor_width = 2 * itor->width;
532
533 // Apply the kernel
534 dt_aligned_pixel_simd_t pixel = dt_simd_set1(0.0f);
535 for(size_t i = 0; i < itor_width; i++)
536 {
537 dt_aligned_pixel_simd_t h = dt_simd_set1(0.0f);
538 for(size_t j = 0; j < itor_width; j++)
539 h += dt_load_simd_aligned(in + 4 * j) * dt_simd_set1(kernelh[j]);
540 pixel += h * dt_simd_set1(kernelv[i]);
541 in += linestride;
542 }
543
544 dt_store_simd(out, dt_simd_max_zero(pixel * dt_simd_set1(oonorm)));
545 }
546 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
547 {
548 // At least a valid coordinate
549
550 // Point to the upper left pixel index wise
551 iy -= itor->width - 1;
552 ix -= itor->width - 1;
553
554 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
555 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
556
557 int xtap_first;
558 int xtap_last;
559 _prepare_tap_boundaries(&xtap_first, &xtap_last,
560 bordermode, 2 * itor->width, ix, width);
561
562 int ytap_first;
563 int ytap_last;
564 _prepare_tap_boundaries(&ytap_first, &ytap_last,
565 bordermode, 2 * itor->width, iy, height);
566
567 // Apply the kernel
568 dt_aligned_pixel_simd_t pixel = dt_simd_set1(0.0f);
569 for(ssize_t i = ytap_first; i < ytap_last; i++)
570 {
571 const ssize_t clip_y = _clip(iy + i, 0, height - 1, bordermode);
572 dt_aligned_pixel_simd_t h = dt_simd_set1(0.0f);
573 const float *ipixel = in + clip_y * linestride;
574 for(ssize_t j = xtap_first; j < xtap_last; j++)
575 {
576 const ssize_t clip_x = _clip(ix + j, 0, width - 1, bordermode);
577 h += dt_load_simd_aligned(ipixel + 4 * clip_x) * dt_simd_set1(kernelh[j]);
578 }
579 pixel += h * dt_simd_set1(kernelv[i]);
580 }
581
582 dt_store_simd(out, dt_simd_max_zero(pixel * dt_simd_set1(oonorm)));
583 }
584 else
585 {
586 // data for *out has no valid *in location so just set to zero.
588 }
589}
590
591/* --------------------------------------------------------------------------
592 * Interpolation factory
593 * ------------------------------------------------------------------------*/
594
596{
597 const struct dt_interpolation *itor = NULL;
598
600 {
601 // Find user preferred interpolation method
602 const char *uipref =
603 dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator");
604
605 for(int i = DT_INTERPOLATION_FIRST;
606 uipref && i < DT_INTERPOLATION_LAST;
607 i++)
608 {
609 if(!strcmp(uipref, dt_interpolator[i].name))
610 {
611 // Found the one
612 itor = &dt_interpolator[i];
613 break;
614 }
615 }
616
617 /* In the case the search failed (!uipref or name not found),
618 * prepare later search pass with default fallback */
620 }
622 {
623 // Find user preferred interpolation method
624 const char *uipref =
625 dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator_warp");
626 for(int i = DT_INTERPOLATION_FIRST;
627 uipref && i < DT_INTERPOLATION_LAST;
628 i++)
629 {
630 if(!strcmp(uipref, dt_interpolator[i].name))
631 {
632 // Found the one
633 itor = &dt_interpolator[i];
634 break;
635 }
636 }
637
638 /* In the case the search failed (!uipref or name not found),
639 * prepare later search pass with default fallback */
641 }
642 if(IS_NULL_PTR(itor))
643 {
644 // Did not find the userpref one or we've been asked for a specific one
646 {
647 if(dt_interpolator[i].id == type)
648 {
649 itor = &dt_interpolator[i];
650 break;
651 }
653 {
654 itor = &dt_interpolator[i];
655 }
656 }
657 }
658
659 return itor;
660}
661
662/* --------------------------------------------------------------------------
663 * Image resampling
664 * ------------------------------------------------------------------------*/
665
706static gboolean _prepare_resampling_plan(const struct dt_interpolation *itor,
707 const int in,
708 const int in_x0,
709 const int out,
710 const int out_x0,
711 const float scale,
712 int **plength,
713 float **pkernel,
714 int **pindex,
715 int **pmeta)
716{
717 // Safe return values
718 *plength = NULL;
719 *pkernel = NULL;
720 *pindex = NULL;
721 if(pmeta)
722 {
723 *pmeta = NULL;
724 }
725
726 if(scale == 1.f)
727 {
728 // No resampling required
729 return FALSE;
730 }
731
732 // Compute common upsampling/downsampling memory requirements
733 int maxtapsapixel;
734 if(scale > 1.f)
735 {
736 // Upscale... the easy one. The values are exact
737 maxtapsapixel = 2 * itor->width;
738 }
739 else
740 {
741 // Downscale... going for worst case values memory wise
742 maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
743 }
744
745 int nlengths = out;
746 const int nindex = maxtapsapixel * out;
747 const int nkernel = maxtapsapixel * out;
748 const size_t lengthreq = dt_round_size(nlengths * sizeof(int), DT_CACHELINE_BYTES);
749 const size_t indexreq = dt_round_size(nindex * sizeof(int), DT_CACHELINE_BYTES);
750 const size_t kernelreq = dt_round_size(nkernel * sizeof(float), DT_CACHELINE_BYTES);
751 const size_t scratchreq = dt_round_size(maxtapsapixel * sizeof(float) + 4 * sizeof(float), DT_CACHELINE_BYTES);
752 // NB: because sse versions compute four taps a time
753 const size_t metareq = dt_round_size(pmeta ? 4 * sizeof(int) * out : 0, DT_CACHELINE_BYTES);
754
755 const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
756 void *blob = dt_pixelpipe_cache_alloc_align_cache(totalreq, 0);
757 if(IS_NULL_PTR(blob)) return TRUE;
758
759 int *lengths = (int *)blob;
760 blob = (char *)blob + lengthreq;
761 int *index = (int *)blob;
762 blob = (char *)blob + indexreq;
763 float *kernel = (float *)blob;
764 blob = (char *)blob + kernelreq;
765 float *scratchpad = scratchreq ? (float *)blob : NULL;
766 blob = (char *)blob + scratchreq;
767 int *meta = metareq ? (int *)blob : NULL;
768// blob = (char *)blob + metareq;
769
770 /* setting this as a const should help the compilers trim all unnecessary
771 * codepaths */
772 const enum border_mode bordermode = RESAMPLING_BORDER_MODE;
773
774 /* Upscale and downscale differ in subtle points, getting rid of code
775 * duplication might have been tricky and i prefer keeping the code
776 * as straight as possible */
777 if(scale > 1.f)
778 {
779 int kidx = 0;
780 int iidx = 0;
781 int lidx = 0;
782 int midx = 0;
783 for(int x = 0; x < out; x++)
784 {
785 if(meta)
786 {
787 meta[midx++] = lidx;
788 meta[midx++] = kidx;
789 meta[midx++] = iidx;
790 }
791
792 // Projected position in input samples
793 float fx = (float)(out_x0 + x) / scale - in_x0;
794
795 // Compute the filter kernel at that position
796 int first;
797 (void)_compute_upsampling_kernel(itor, scratchpad, &first, fx);
798
799 /* Check lower and higher bound pixel index and skip as many pixels as
800 * necessary to fall into range */
801 int tap_first;
802 int tap_last;
803 _prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->width, first, in);
804
805 // Track number of taps that will be used
806 lengths[lidx++] = tap_last - tap_first;
807
808 // Precompute the inverse of the norm
809 float norm = 0.f;
810 for(int tap = tap_first; tap < tap_last; tap++)
811 {
812 norm += scratchpad[tap];
813 }
814 norm = 1.f / norm;
815
816 /* Unlike single pixel or single sample code, here it's interesting to
817 * precompute the normalized filter kernel as this will avoid dividing
818 * by the norm for all processed samples/pixels
819 * NB: use the same loop to put in place the index list */
820 first += tap_first;
821 for(int tap = tap_first; tap < tap_last; tap++)
822 {
823 kernel[kidx++] = scratchpad[tap] * norm;
824 index[iidx++] = _clip(first++, 0, in - 1, bordermode);
825 }
826 }
827 }
828 else
829 {
830 int kidx = 0;
831 int iidx = 0;
832 int lidx = 0;
833 int midx = 0;
834 for(int x = 0; x < out; x++)
835 {
836 if(meta)
837 {
838 meta[midx++] = lidx;
839 meta[midx++] = kidx;
840 meta[midx++] = iidx;
841 }
842
843 // Compute downsampling kernel centered on output position
844 int taps;
845 int first;
846 _compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
847
848 /* Check lower and higher bound pixel index and skip as many pixels as
849 * necessary to fall into range */
850 int tap_first;
851 int tap_last;
852 _prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
853
854 // Track number of taps that will be used
855 lengths[lidx++] = tap_last - tap_first;
856
857 // Precompute the inverse of the norm
858 float norm = 0.f;
859 for(int tap = tap_first; tap < tap_last; tap++)
860 {
861 norm += scratchpad[tap];
862 }
863 norm = 1.f / norm;
864
865 /* Unlike single pixel or single sample code, here it's interesting to
866 * precompute the normalized filter kernel as this will avoid dividing
867 * by the norm for all processed samples/pixels
868 * NB: use the same loop to put in place the index list */
869 first += tap_first;
870 for(int tap = tap_first; tap < tap_last; tap++)
871 {
872 kernel[kidx++] = scratchpad[tap] * norm;
873 index[iidx++] = _clip(first++, 0, in - 1, bordermode);
874 }
875 }
876 }
877
878 // Validate plan wrt caller
879 *plength = lengths;
880 *pindex = index;
881 *pkernel = kernel;
882 if(pmeta)
883 {
884 *pmeta = meta;
885 }
886
887 return FALSE;
888}
889
890#define TILE_ROWS 128
891
893static void _interpolation_resample_plain(const struct dt_interpolation *itor,
894 float *const restrict out,
895 const dt_iop_roi_t *const roi_out,
896 const float *const restrict in,
897 const dt_iop_roi_t *const roi_in)
898{
899 int *hindex = NULL;
900 int *hlength = NULL;
901 float *hkernel = NULL;
902 int *vindex = NULL;
903 int *vlength = NULL;
904 float *vkernel = NULL;
905 int *vmeta = NULL;
906
907 const int32_t in_stride_floats = roi_in->width * 4;
908 const int32_t out_stride_floats = roi_out->width * 4;
909
910 // Fast code path for 1:1 copy, only cropping area can change
911 if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
912 {
913 const size_t x0 = (roi_out->x - roi_in->x) * 4 * sizeof(float);
914 const size_t y0 = (roi_out->y - roi_in->y);
916 for(int yt = 0; yt < roi_out->height; yt += TILE_ROWS)
917 {
918 const int y_end = MIN(yt + TILE_ROWS, roi_out->height);
919 for(int y = yt; y < y_end; y++)
920 memcpy((char *)__builtin_assume_aligned(out, 64) + (size_t)out_stride_floats * sizeof(float) * y,
921 (char *)__builtin_assume_aligned(in, 64) + (size_t)in_stride_floats * sizeof(float) * (y + y0) + x0,
922 out_stride_floats * sizeof(float));
923 }
924
925
926 // All done, so easy case
927 return;
928 }
929
930 // Generic non 1:1 case... much more complicated :D
931
932 // The actual resampling ratio between the two buffers,
933 // not the absolute pipeline scale
934 const float resample_scale = roi_out->scale / roi_in->scale;
935
936 if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
937 roi_out->width, roi_out->x, resample_scale,
938 &hlength, &hkernel, &hindex, NULL))
939 goto exit;
940
941 if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
942 roi_out->height, roi_out->y, resample_scale,
943 &vlength, &vkernel, &vindex, &vmeta))
944 goto exit;
945
946 const size_t height = roi_out->height;
947 const size_t width = roi_out->width;
948
949 // Process each output line
951 for(size_t oy = 0; oy < height; oy++)
952 {
953 // Initialize column resampling indexes
954 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
955 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
956 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
957
958 // Initialize row resampling indexes
959 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
960 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
961
962 // Number of lines contributing to the output line
963 int vl = vlength[vlidx++]; // V(ertical) L(ength)
964
965 // Process each output column
966 for(size_t ox = 0; ox < width; ox++)
967 {
968 // This will hold the resulting pixel
969 dt_aligned_pixel_simd_t vs = dt_simd_set1(0.0f);
970
971 // Number of horizontal samples contributing to the output
972 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
973 const int *const column_hindex = hindex + hkidx;
974 const float *const column_hkernel = hkernel + hkidx;
975 const int *const column_vindex = vindex + viidx;
976 const float *const column_vkernel = vkernel + vkidx;
977
978 for(size_t iy = 0; iy < vl; iy++)
979 {
980 // This is our input line
981 const size_t baseidx_vindex = (size_t)column_vindex[iy] * in_stride_floats;
982
983 dt_aligned_pixel_simd_t vhs = dt_simd_set1(0.0f);
984
985 for(size_t ix = 0; ix < hl; ix++)
986 {
987 // Apply the precomputed filter kernel
988 const size_t baseidx = baseidx_vindex + (size_t)column_hindex[ix] * 4;
989 const float htap = column_hkernel[ix];
990 vhs += dt_load_simd_aligned(in + baseidx) * dt_simd_set1(htap);
991 }
992
993 // Accumulate contribution from this line
994 const float vtap = column_vkernel[iy];
995 vs += vhs * dt_simd_set1(vtap);
996 }
997
998 // Output pixel is ready
999 const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
1000
1001 // Clip negative RGB that may be produced by Lanczos undershooting
1002 // Negative RGB are invalid values no matter the RGB space (light is positive)
1003 dt_aligned_pixel_t pixel;
1004 dt_store_simd_aligned(pixel, dt_simd_max_zero(vs));
1005 copy_pixel_nontemporal(out + baseidx, pixel);
1006
1007 // The vertical support is fixed for the whole output row. Only the
1008 // horizontal plan advances from one output column to the next.
1009 hkidx += hl;
1010 }
1011 }
1012
1013
1015
1016exit:
1017 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1018 * it simplifies the code :-D. The length array is in fact the only memory
1019 * allocated. */
1022}
1023
1028 float *out,
1029 const dt_iop_roi_t *const roi_out,
1030 const float *const in,
1031 const dt_iop_roi_t *const roi_in)
1032{
1033 return _interpolation_resample_plain(itor, out, roi_out, in, roi_in);
1034}
1035
1042 float *out,
1043 const dt_iop_roi_t *const roi_out,
1044 const float *const in,
1045 const dt_iop_roi_t *const roi_in)
1046{
1047 dt_iop_roi_t oroi = *roi_out;
1048 //oroi.x = oroi.y = 0;
1049
1050 dt_iop_roi_t iroi = *roi_in;
1051 //iroi.x = iroi.y = 0;
1052
1053 dt_interpolation_resample(itor, out, &oroi, in, &iroi);
1054}
1055
1056#ifdef HAVE_OPENCL
1058{
1061
1062 const int program = 2; // basic.cl, from programs.conf
1064 dt_opencl_create_kernel(program, "interpolation_resample");
1065 return g;
1066}
1067
1069{
1070 if(IS_NULL_PTR(g)) return;
1071 // destroy kernels
1072 dt_opencl_free_kernel(g->kernel_interpolation_resample);
1073 dt_free(g);
1074}
1075
1076static uint32_t roundToNextPowerOfTwo(uint32_t x)
1077{
1078 x--;
1079 x |= x >> 1;
1080 x |= x >> 2;
1081 x |= x >> 4;
1082 x |= x >> 8;
1083 x |= x >> 16;
1084 x++;
1085 return x;
1086}
1087
1092 const int devid,
1093 cl_mem dev_out,
1094 const dt_iop_roi_t *const roi_out,
1095 cl_mem dev_in,
1096 const dt_iop_roi_t *const roi_in)
1097{
1098 int *hindex = NULL;
1099 int *hlength = NULL;
1100 float *hkernel = NULL;
1101 int *hmeta = NULL;
1102 int *vindex = NULL;
1103 int *vlength = NULL;
1104 float *vkernel = NULL;
1105 int *vmeta = NULL;
1106
1107 cl_int err = DT_OPENCL_DEFAULT_ERROR;
1108
1109 cl_mem dev_hindex = NULL;
1110 cl_mem dev_hlength = NULL;
1111 cl_mem dev_hkernel = NULL;
1112 cl_mem dev_hmeta = NULL;
1113 cl_mem dev_vindex = NULL;
1114 cl_mem dev_vlength = NULL;
1115 cl_mem dev_vkernel = NULL;
1116 cl_mem dev_vmeta = NULL;
1117
1118 // Fast code path for 1:1 copy, only cropping area can change
1119 if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
1120 {
1121 size_t iorigin[] = { roi_out->x - roi_in->x, roi_out->y - roi_in->y, 0 };
1122 size_t oorigin[] = { 0, 0, 0 };
1123 size_t region[] = { roi_out->width, roi_out->height, 1 };
1124
1125 // copy original input from dev_in -> dev_out as starting point
1126 err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
1127 if(err != CL_SUCCESS) goto error;
1128
1129 // All done, so easy case
1130 return CL_SUCCESS;
1131 }
1132
1133 // Generic non 1:1 case... much more complicated :D
1134
1135 // The actual resampling ratio between the two buffers,
1136 // not the absolute pipeline scale
1137 const float resample_scale = roi_out->scale / roi_in->scale;
1138
1139 if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
1140 roi_out->width, roi_out->x, resample_scale,
1141 &hlength, &hkernel, &hindex, &hmeta))
1142 goto error;
1143
1144 if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
1145 roi_out->height, roi_out->y, resample_scale,
1146 &vlength, &vkernel, &vindex, &vmeta))
1147 goto error;
1148
1149 int hmaxtaps = -1, vmaxtaps = -1;
1150 for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
1151 for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);
1152
1153 // strategy: process image column-wise (local[0] = 1). For each row generate
1154 // a number of parallel work items each taking care of one horizontal convolution,
1155 // then sum over work items to do the vertical convolution
1156
1158 const int width = roi_out->width;
1159 const int height = roi_out->height;
1160
1161 // make sure blocksize is not too large
1162 const int taps = roundToNextPowerOfTwo(vmaxtaps);
1163 // the number of work items per row rounded up to a power of 2
1164 // (for quick recursive reduction)
1165
1166 int vblocksize;
1167
1170 { .xoffset = 0,
1171 .xfactor = 1,
1172 .yoffset = 0,
1173 .yfactor = 1,
1174 .cellsize = 4 * sizeof(float),
1175 .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
1176 .sizex = 1,
1177 .sizey = (1 << 16) * taps };
1178
1179 if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
1180 vblocksize = locopt.sizey;
1181 else
1182 vblocksize = 1;
1183
1184 if(vblocksize < taps)
1185 {
1186 // our strategy does not work: the vertical number of taps exceeds
1187 // the vertical workgroupsize; there is no point in continuing on
1188 // the GPU - that would be way too slow; let's delegate the stuff
1189 // to the CPU then.
1190 err = CL_INVALID_WORK_GROUP_SIZE;
1191 goto error;
1192 }
1193
1194 size_t sizes[3] = { ROUNDUPDWD(width, devid), ROUNDUP(height * taps, vblocksize), 1 };
1195 size_t local[3] = { 1, vblocksize, 1 };
1196
1197 // store resampling plan to device memory hindex, vindex, hkernel,
1198 // vkernel: (v|h)maxtaps might be too small, so store a bit more
1199 // than needed
1200 err = -999;
1201
1202 dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
1203 if(IS_NULL_PTR(dev_hindex)) goto error;
1204
1205 dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
1206 if(IS_NULL_PTR(dev_hlength)) goto error;
1207
1208 dev_hkernel = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
1209 if(IS_NULL_PTR(dev_hkernel)) goto error;
1210
1211 dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
1212 if(IS_NULL_PTR(dev_hmeta)) goto error;
1213
1214 dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
1215 if(IS_NULL_PTR(dev_vindex)) goto error;
1216
1217 dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
1218 if(IS_NULL_PTR(dev_vlength)) goto error;
1219
1220 dev_vkernel = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
1221 if(IS_NULL_PTR(dev_vkernel)) goto error;
1222
1223 dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
1224 if(IS_NULL_PTR(dev_vmeta)) goto error;
1225
1226 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
1227 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
1228 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
1229 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
1230 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
1231 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
1232 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
1233 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
1234 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
1235 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
1236 dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
1237 dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
1238 dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
1239 dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
1240 dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
1241 dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
1242 dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
1243 err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);
1244
1245error:
1246
1247 dt_opencl_release_mem_object(dev_hindex);
1248 dt_opencl_release_mem_object(dev_hlength);
1249 dt_opencl_release_mem_object(dev_hkernel);
1251 dt_opencl_release_mem_object(dev_vindex);
1252 dt_opencl_release_mem_object(dev_vlength);
1253 dt_opencl_release_mem_object(dev_vkernel);
1257 return err;
1258}
1259
1266 const int devid,
1267 cl_mem dev_out,
1268 const dt_iop_roi_t *const roi_out,
1269 cl_mem dev_in,
1270 const dt_iop_roi_t *const roi_in)
1271{
1272 dt_iop_roi_t oroi = *roi_out;
1273 //oroi.x = oroi.y = 0;
1274
1275 dt_iop_roi_t iroi = *roi_in;
1276 //iroi.x = iroi.y = 0;
1277
1278 return dt_interpolation_resample_cl(itor, devid, dev_out, &oroi, dev_in, &iroi);
1279}
1280#endif
1281
1283 float *out,
1284 const dt_iop_roi_t *const roi_out,
1285 const float *const in,
1286 const dt_iop_roi_t *const roi_in)
1287{
1288 int *hindex = NULL;
1289 int *hlength = NULL;
1290 float *hkernel = NULL;
1291 int *vindex = NULL;
1292 int *vlength = NULL;
1293 float *vkernel = NULL;
1294 int *vmeta = NULL;
1295
1296
1297 const size_t out_stride = roi_out->width * sizeof(float);
1298 const size_t in_stride = roi_in->width * sizeof(float);
1299
1300 // Fast code path for 1:1 copy, only cropping area can change
1301 if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
1302 {
1303 const size_t x0 = (roi_out->x - roi_in->x) * sizeof(float);
1304 const size_t y0 = (roi_out->y - roi_in->y);
1306 for(int y = 0; y < roi_out->height; y++)
1307 {
1308 float *i = (float *)((char *)in + in_stride * (y + y0) + x0);
1309 float *o = (float *)((char *)out + out_stride * y);
1310 memcpy(o, i, out_stride);
1311 }
1312 // All done, so easy case
1313 return;
1314 }
1315
1316 // Generic non 1:1 case... much more complicated :D
1317
1318 // Prepare resampling plans once and for all
1319 if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
1320 roi_out->width, roi_out->x, roi_out->scale,
1321 &hlength, &hkernel, &hindex, NULL))
1322 goto exit;
1323
1324 if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
1325 roi_out->height, roi_out->y, roi_out->scale,
1326 &vlength, &vkernel, &vindex, &vmeta))
1327 goto exit;
1328
1329 // Process each output line
1331 for(int oy = 0; oy < roi_out->height; oy++)
1332 {
1333 // Initialize column resampling indexes
1334 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1335 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1336 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1337
1338 // Initialize row resampling indexes
1339 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1340 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1341 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1342
1343 // Number of lines contributing to the output line
1344 int vl = vlength[vlidx++]; // V(ertical) L(ength)
1345
1346 // Process each output column
1347 for(int ox = 0; ox < roi_out->width; ox++)
1348 {
1349 // This will hold the resulting pixel
1350 float vs = 0.0f;
1351
1352 // Number of horizontal samples contributing to the output
1353 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1354 for(int iy = 0; iy < vl; iy++)
1355 {
1356 // This is our input line
1357 const float *i = (float *)((char *)in + in_stride * vindex[viidx++]);
1358
1359 float vhs = 0.0f;
1360
1361 for(int ix = 0; ix < hl; ix++)
1362 {
1363 // Apply the precomputed filter kernel
1364 const size_t baseidx = (size_t)hindex[hiidx++];
1365 const float htap = hkernel[hkidx++];
1366 vhs += i[baseidx] * htap;
1367 }
1368
1369 // Accumulate contribution from this line
1370 const float vtap = vkernel[vkidx++];
1371 vs += vhs * vtap;
1372
1373 // Reset horizontal resampling context
1374 hkidx -= hl;
1375 hiidx -= hl;
1376 }
1377
1378 // Output pixel is ready
1379 float *o = (float *)((char *)out + (size_t)oy * out_stride
1380 + (size_t)ox * sizeof(float));
1381 *o = vs;
1382
1383 // Reset vertical resampling context
1384 viidx -= vl;
1385 vkidx -= vl;
1386
1387 // Progress in horizontal context
1388 hiidx += hl;
1389 hkidx += hl;
1390 }
1391 }
1392
1393 exit:
1394 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1395 * it simplifies the code :-D. The length array is in fact the only memory
1396 * allocated. */
1399}
1400
1405 float *out,
1406 const dt_iop_roi_t *const roi_out,
1407 const float *const in,
1408 const dt_iop_roi_t *const roi_in)
1409{
1410 return _interpolation_resample_1c_plain(itor, out, roi_out, in, roi_in);
1411}
1412
1418 float *out,
1419 const dt_iop_roi_t *const roi_out,
1420 const float *const in,
1421 const dt_iop_roi_t *const roi_in)
1422{
1423 dt_iop_roi_t oroi = *roi_out;
1424 //oroi.x = oroi.y = 0;
1425
1426 dt_iop_roi_t iroi = *roi_in;
1427 //iroi.x = iroi.y = 0;
1428
1429 dt_interpolation_resample_1c(itor, out, &oroi, in, &iroi);
1430}
1431
1432// clang-format off
1433// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
1434// vim: shiftwidth=2 expandtab tabstop=2 cindent
1435// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
1436// clang-format on
static void error(char *msg)
Definition ashift_lsd.c:202
#define TRUE
Definition ashift_lsd.c:162
#define FALSE
Definition ashift_lsd.c:158
int width
Definition bilateral.h:1
int height
Definition bilateral.h:1
return vector dt_simd_set1(valid ?(scaling+NORM_MIN) :NORM_MIN)
const dt_aligned_pixel_t f
static const float const float const float min
const float max
const dt_colormatrix_t dt_aligned_pixel_t out
dt_store_simd_aligned(out, dt_mat3x4_mul_vec4(vin, dt_colormatrix_row_to_simd(matrix, 0), dt_colormatrix_row_to_simd(matrix, 1), dt_colormatrix_row_to_simd(matrix, 2)))
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
int type
char * name
const char * dt_conf_get_string_const(const char *name)
darktable_t darktable
Definition darktable.c:181
dt_store_simd(out, value)
#define DT_ALIGNED_ARRAY
Definition darktable.h:388
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
Definition darktable.h:433
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Enable aggressive floating-point arithmetic optimizations, in denormals handling. Set through user pr...
Definition darktable.h:524
static void copy_pixel_nontemporal(float *const __restrict__ out, const float *const __restrict__ in)
Definition darktable.h:677
#define dt_free(ptr)
Definition darktable.h:456
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453
static size_t dt_round_size(const size_t size, const size_t alignment)
Definition darktable.h:397
#define __DT_CLONE_TARGETS__
Definition darktable.h:367
#define for_four_channels(_var,...)
Definition darktable.h:664
#define __OMP_PARALLEL_FOR__(...)
Definition darktable.h:258
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281
#define DT_CACHELINE_BYTES
Definition darktable.h:380
static CameraMetaData * meta
#define dt_omploop_sfence()
Definition imageop.h:702
const struct dt_interpolation * dt_interpolation_new(enum dt_interpolation_type type)
void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
#define MAX_HALF_FILTER_WIDTH
__DT_CLONE_TARGETS__ void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out, const float x, const float y, const int width, const int height, const int linestride)
static void _interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
static float _maketaps_mitchell(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
static uint32_t roundToNextPowerOfTwo(uint32_t x)
#define RESAMPLING_BORDER_MODE
void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
border_mode
@ BORDER_WRAP
@ BORDER_MIRROR
@ BORDER_CLAMP
@ BORDER_REPLICATE
static __DT_CLONE_TARGETS__ gboolean _prepare_resampling_plan(const struct dt_interpolation *itor, const int in, const int in_x0, const int out, const int out_x0, const float scale, int **plength, float **pkernel, int **pindex, int **pmeta)
void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
void dt_interpolation_resample(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
int dt_interpolation_resample_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
dt_interpolation_cl_global_t * dt_interpolation_init_cl_global()
static float _maketaps_bilinear(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
#define MAX_KERNEL_REQ
int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
static const struct dt_interpolation dt_interpolator[]
#define TILE_ROWS
__DT_CLONE_TARGETS__ float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x, const float y, const int width, const int height, const int samplestride, const int linestride)
#define INTERPOLATION_BORDER_MODE
static __DT_CLONE_TARGETS__ void _interpolation_resample_plain(const struct dt_interpolation *itor, float *const restrict out, const dt_iop_roi_t *const roi_out, const float *const restrict in, const dt_iop_roi_t *const roi_in)
static void _compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first, float *kernel, float *norm, const float outoinratio, const int xout)
static float _maketaps_bicubic(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
dt_interpolation_type
@ DT_INTERPOLATION_BICUBIC
@ DT_INTERPOLATION_BILINEAR
@ DT_INTERPOLATION_DEFAULT
@ DT_INTERPOLATION_LAST
@ DT_INTERPOLATION_MITCHELL
@ DT_INTERPOLATION_USERPREF
@ DT_INTERPOLATION_DEFAULT_WARP
@ DT_INTERPOLATION_FIRST
@ DT_INTERPOLATION_USERPREF_WARP
static float kernel(const float *x, const float *y)
static const float x
const int t
float *const restrict const size_t k
static float ceil_fast(float x)
Definition math.h:322
float dt_aligned_pixel_t[4]
int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
Definition opencl.c:3156
int dt_opencl_create_kernel(const int prog, const char *name)
Definition opencl.c:2030
void * dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host)
Definition opencl.c:2332
int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst, size_t *region)
Definition opencl.c:2261
void dt_opencl_free_kernel(const int kernel)
Definition opencl.c:2073
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size, const void *arg)
Definition opencl.c:2127
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes, const size_t *local)
Definition opencl.c:2142
void dt_opencl_release_mem_object(cl_mem mem)
Definition opencl.c:2383
#define DT_OPENCL_DEFAULT_ERROR
Definition opencl.h:57
#define ROUNDUP(a, n)
Definition opencl.h:78
#define ROUNDUPDWD(a, b)
Definition opencl.h:81
const float r
struct dt_opencl_t * opencl
Definition darktable.h:785
dt_interpolation_func maketaps
enum dt_interpolation_type id
Region of interest passed through the pixelpipe.
Definition imageop.h:72
double scale
Definition imageop.h:74
struct dt_interpolation_cl_global_t * interpolation
Definition opencl.h:260
#define MIN(a, b)
Definition thinplate.c:32
#define MAX(a, b)
Definition thinplate.c:29