57#define RESAMPLING_BORDER_MODE BORDER_REPLICATE
61#define INTERPOLATION_BORDER_MODE BORDER_MIRROR
65#define MAX_HALF_FILTER_WIDTH 3
68#define DEBUG_PRINT_VERBOSE 0
126 if(i < min || i >
max)
139static inline __attribute__((always_inline))
void _prepare_tap_boundaries(
int *tap_first,
142 const int filterwidth,
155 *tap_last = filterwidth;
171 const size_t num_taps,
173 const float first_tap,
174 const float interval)
176 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
177 const dt_aligned_pixel_simd_t interval_v =
dt_simd_set1(interval);
178 const dt_aligned_pixel_simd_t iter =
dt_simd_set1(4.0f * interval);
179 dt_aligned_pixel_simd_t vt =
dt_simd_set1(first_tap) + bootstrap * interval_v;
181 const int runs = (num_taps + 3) / 4;
183 for(
size_t i = 0;
i < runs;
i++)
196 const size_t num_taps,
198 const float first_tap,
199 const float interval)
201 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
202 const dt_aligned_pixel_simd_t half =
dt_simd_set1(0.5f);
204 const dt_aligned_pixel_simd_t three =
dt_simd_set1(3.0f);
205 const dt_aligned_pixel_simd_t four =
dt_simd_set1(4.0f);
206 const dt_aligned_pixel_simd_t five =
dt_simd_set1(5.0f);
207 const dt_aligned_pixel_simd_t eight =
dt_simd_set1(8.0f);
208 const dt_aligned_pixel_simd_t interval_v =
dt_simd_set1(interval);
209 const dt_aligned_pixel_simd_t iter =
dt_simd_set1(4.0f * interval);
210 dt_aligned_pixel_simd_t vt =
dt_simd_set1(first_tap) + bootstrap * interval_v;
212 const int runs = (num_taps + 3) / 4;
214 for(
size_t i = 0;
i < runs;
i++)
216 const dt_aligned_pixel_simd_t vt_abs = dt_simd_abs(vt);
217 const dt_aligned_pixel_simd_t t2 = vt * vt;
218 const dt_aligned_pixel_simd_t t5 = five * vt_abs;
219 const dt_aligned_pixel_simd_t r12 = (vt_abs * (t5 - eight - t2) + four) * half;
220 const dt_aligned_pixel_simd_t r01 = ((three * t2 - t5) * vt_abs + two) * half;
221 dt_aligned_pixel_simd_t taps4 = r12;
223 taps4[c] = (vt_abs[c] <= 1.0f) ? r01[c] : r12[c];
249 const size_t num_taps,
251 const float first_tap,
252 const float interval)
254 static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };
255 const dt_aligned_pixel_simd_t c7_6 =
dt_simd_set1(7.0f / 6.0f);
257 const dt_aligned_pixel_simd_t c8_9 =
dt_simd_set1(8.0f / 9.0f);
258 const dt_aligned_pixel_simd_t c7_18 =
dt_simd_set1(7.0f / 18.0f);
259 const dt_aligned_pixel_simd_t c10_3 =
dt_simd_set1(10.0f / 3.0f);
260 const dt_aligned_pixel_simd_t c16_9 =
dt_simd_set1(16.0f / 9.0f);
261 const dt_aligned_pixel_simd_t interval_v =
dt_simd_set1(interval);
262 const dt_aligned_pixel_simd_t iter =
dt_simd_set1(4.0f * interval);
263 dt_aligned_pixel_simd_t vt =
dt_simd_set1(first_tap) + bootstrap * interval_v;
265 const int runs = (num_taps + 3) / 4;
267 for(
size_t i = 0;
i < runs;
i++)
269 const dt_aligned_pixel_simd_t a = dt_simd_abs(vt);
270 const dt_aligned_pixel_simd_t a2 = a * a;
271 const dt_aligned_pixel_simd_t a3 = a2 * a;
273 const dt_aligned_pixel_simd_t r01 = c7_6 * a3 -
c2 * a2 + c8_9;
274 const dt_aligned_pixel_simd_t r12 =
c2 * a2 - c7_18 * a3 - c10_3 * a + c16_9;
275 dt_aligned_pixel_simd_t taps4 = r12;
277 taps4[c] = (a[c] <= 1.0f) ? r01[c] : r12[c];
325 int f = (int)floorf(
t) - itor->
width + 1;
354 const float outoinratio,
358 const float w = (float)itor->
width;
362 const float xin =
ceil_fast(((
float)xout - w) / outoinratio);
369 float t = xin * outoinratio - (float)xout;
372 int num_taps = *taps = (int)((w -
t) / outoinratio);
378 for(
size_t i = 0;
i < num_taps;
i++)
388#define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
397 const int samplestride,
398 const int linestride)
406 float normh = _compute_upsampling_kernel(itor, kernelh, NULL,
x);
407 float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);
423 in = (
float *)in + linestride * iy + ix * samplestride;
424 in = in - (itor->
width - 1) * (samplestride + linestride);
428 for(
int i = 0;
i < 2 * itor->
width;
i++)
431 for(
int j = 0; j < 2 * itor->
width; j++)
433 h += kernelh[j] * in[j * samplestride];
438 r = fmaxf(0.0f, s / (normh * normv));
440 else if(ix >= 0 && iy >= 0 && ix <
width && iy <
height)
445 iy -= itor->
width - 1;
446 ix -= itor->
width - 1;
453 _prepare_tap_boundaries(&xtap_first, &xtap_last,
458 _prepare_tap_boundaries(&ytap_first, &ytap_last,
463 for(ssize_t
i = ytap_first;
i < ytap_last;
i++)
465 const ssize_t clip_y = _clip(iy +
i, 0,
height - 1, bordermode);
467 for(ssize_t j = xtap_first; j < xtap_last; j++)
469 const ssize_t clip_x = _clip(ix + j, 0,
width - 1, bordermode);
470 const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
471 h += kernelh[j] * ipixel[0];
476 r = fmaxf(0.0f, s / (normh * normv));
498 const int linestride)
507 float normh = _compute_upsampling_kernel(itor, kernelh, NULL,
x);
508 float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);
511 const float oonorm = (1.f / (normh * normv));
520 if(ix >= (itor->
width - 1)
521 && iy >= (itor->
width - 1)
528 in = (
float *)in + linestride * iy + ix * 4;
529 in = in - (itor->
width - 1) * (4 + linestride);
531 const size_t itor_width = 2 * itor->
width;
535 for(
size_t i = 0;
i < itor_width;
i++)
538 for(
size_t j = 0; j < itor_width; j++)
539 h += dt_load_simd_aligned(in + 4 * j) *
dt_simd_set1(kernelh[j]);
546 else if(ix >= 0 && iy >= 0 && ix <
width && iy <
height)
551 iy -= itor->
width - 1;
552 ix -= itor->
width - 1;
559 _prepare_tap_boundaries(&xtap_first, &xtap_last,
564 _prepare_tap_boundaries(&ytap_first, &ytap_last,
569 for(ssize_t
i = ytap_first;
i < ytap_last;
i++)
571 const ssize_t clip_y = _clip(iy +
i, 0,
height - 1, bordermode);
573 const float *ipixel = in + clip_y * linestride;
574 for(ssize_t j = xtap_first; j < xtap_last; j++)
576 const ssize_t clip_x = _clip(ix + j, 0,
width - 1, bordermode);
577 h += dt_load_simd_aligned(ipixel + 4 * clip_x) *
dt_simd_set1(kernelh[j]);
737 maxtapsapixel = 2 * itor->
width;
746 const int nindex = maxtapsapixel *
out;
747 const int nkernel = maxtapsapixel *
out;
755 const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
759 int *lengths = (
int *)blob;
760 blob = (
char *)blob + lengthreq;
761 int *index = (
int *)blob;
762 blob = (
char *)blob + indexreq;
763 float *
kernel = (
float *)blob;
764 blob = (
char *)blob + kernelreq;
765 float *scratchpad = scratchreq ? (
float *)blob : NULL;
766 blob = (
char *)blob + scratchreq;
767 int *
meta = metareq ? (
int *)blob : NULL;
783 for(
int x = 0;
x <
out;
x++)
793 float fx = (float)(out_x0 +
x) / scale - in_x0;
797 (
void)_compute_upsampling_kernel(itor, scratchpad, &first,
fx);
803 _prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->
width, first, in);
806 lengths[lidx++] = tap_last - tap_first;
810 for(
int tap = tap_first; tap < tap_last; tap++)
812 norm += scratchpad[tap];
821 for(
int tap = tap_first; tap < tap_last; tap++)
823 kernel[kidx++] = scratchpad[tap] * norm;
824 index[iidx++] = _clip(first++, 0, in - 1, bordermode);
834 for(
int x = 0;
x <
out;
x++)
852 _prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
855 lengths[lidx++] = tap_last - tap_first;
859 for(
int tap = tap_first; tap < tap_last; tap++)
861 norm += scratchpad[tap];
870 for(
int tap = tap_first; tap < tap_last; tap++)
872 kernel[kidx++] = scratchpad[tap] * norm;
873 index[iidx++] = _clip(first++, 0, in - 1, bordermode);
894 float *
const restrict
out,
896 const float *
const restrict in,
901 float *hkernel = NULL;
904 float *vkernel = NULL;
907 const int32_t in_stride_floats = roi_in->
width * 4;
908 const int32_t out_stride_floats = roi_out->
width * 4;
913 const size_t x0 = (roi_out->
x - roi_in->
x) * 4 *
sizeof(
float);
914 const size_t y0 = (roi_out->
y - roi_in->
y);
919 for(
int y = yt; y < y_end; y++)
920 memcpy((
char *)__builtin_assume_aligned(
out, 64) + (
size_t)out_stride_floats *
sizeof(
float) * y,
921 (
char *)__builtin_assume_aligned(in, 64) + (
size_t)in_stride_floats *
sizeof(
float) * (y + y0) + x0,
922 out_stride_floats *
sizeof(
float));
934 const float resample_scale = roi_out->
scale / roi_in->
scale;
937 roi_out->
width, roi_out->
x, resample_scale,
938 &hlength, &hkernel, &hindex, NULL))
942 roi_out->
height, roi_out->
y, resample_scale,
943 &vlength, &vkernel, &vindex, &vmeta))
951 for(
size_t oy = 0; oy <
height; oy++)
954 int vlidx = vmeta[3 * oy + 0];
955 int vkidx = vmeta[3 * oy + 1];
956 int viidx = vmeta[3 * oy + 2];
963 int vl = vlength[vlidx++];
966 for(
size_t ox = 0; ox <
width; ox++)
972 const int hl = hlength[hlidx++];
973 const int *
const column_hindex = hindex + hkidx;
974 const float *
const column_hkernel = hkernel + hkidx;
975 const int *
const column_vindex = vindex + viidx;
976 const float *
const column_vkernel = vkernel + vkidx;
978 for(
size_t iy = 0; iy < vl; iy++)
981 const size_t baseidx_vindex = (size_t)column_vindex[iy] * in_stride_floats;
985 for(
size_t ix = 0; ix < hl; ix++)
988 const size_t baseidx = baseidx_vindex + (size_t)column_hindex[ix] * 4;
989 const float htap = column_hkernel[ix];
990 vhs += dt_load_simd_aligned(in + baseidx) *
dt_simd_set1(htap);
994 const float vtap = column_vkernel[iy];
999 const size_t baseidx = (size_t)oy * out_stride_floats + (
size_t)ox * 4;
1030 const float *
const in,
1044 const float *
const in,
1062 const int program = 2;
1099 int *hlength = NULL;
1100 float *hkernel = NULL;
1103 int *vlength = NULL;
1104 float *vkernel = NULL;
1109 cl_mem dev_hindex = NULL;
1110 cl_mem dev_hlength = NULL;
1111 cl_mem dev_hkernel = NULL;
1112 cl_mem dev_hmeta = NULL;
1113 cl_mem dev_vindex = NULL;
1114 cl_mem dev_vlength = NULL;
1115 cl_mem dev_vkernel = NULL;
1116 cl_mem dev_vmeta = NULL;
1121 size_t iorigin[] = { roi_out->
x - roi_in->
x, roi_out->
y - roi_in->
y, 0 };
1122 size_t oorigin[] = { 0, 0, 0 };
1123 size_t region[] = { roi_out->
width, roi_out->
height, 1 };
1127 if(err != CL_SUCCESS)
goto error;
1137 const float resample_scale = roi_out->
scale / roi_in->
scale;
1140 roi_out->
width, roi_out->
x, resample_scale,
1141 &hlength, &hkernel, &hindex, &hmeta))
1145 roi_out->
height, roi_out->
y, resample_scale,
1146 &vlength, &vkernel, &vindex, &vmeta))
1149 int hmaxtaps = -1, vmaxtaps = -1;
1150 for(
int k = 0;
k < roi_out->
width;
k++) hmaxtaps =
MAX(hmaxtaps, hlength[
k]);
1151 for(
int k = 0;
k < roi_out->
height;
k++) vmaxtaps =
MAX(vmaxtaps, vlength[
k]);
1174 .cellsize = 4 *
sizeof(float),
1175 .overhead = hmaxtaps *
sizeof(
float) + hmaxtaps *
sizeof(int),
1177 .sizey = (1 << 16) * taps };
1180 vblocksize = locopt.
sizey;
1184 if(vblocksize < taps)
1190 err = CL_INVALID_WORK_GROUP_SIZE;
1195 size_t local[3] = { 1, vblocksize, 1 };
1285 const float *
const in,
1289 int *hlength = NULL;
1290 float *hkernel = NULL;
1292 int *vlength = NULL;
1293 float *vkernel = NULL;
1297 const size_t out_stride = roi_out->
width *
sizeof(float);
1298 const size_t in_stride = roi_in->
width *
sizeof(float);
1303 const size_t x0 = (roi_out->
x - roi_in->
x) *
sizeof(
float);
1304 const size_t y0 = (roi_out->
y - roi_in->
y);
1306 for(
int y = 0; y < roi_out->
height; y++)
1308 float *
i = (
float *)((
char *)in + in_stride * (y + y0) + x0);
1309 float *o = (
float *)((
char *)
out + out_stride * y);
1310 memcpy(o,
i, out_stride);
1321 &hlength, &hkernel, &hindex, NULL))
1326 &vlength, &vkernel, &vindex, &vmeta))
1331 for(
int oy = 0; oy < roi_out->
height; oy++)
1334 int vlidx = vmeta[3 * oy + 0];
1335 int vkidx = vmeta[3 * oy + 1];
1336 int viidx = vmeta[3 * oy + 2];
1344 int vl = vlength[vlidx++];
1347 for(
int ox = 0; ox < roi_out->
width; ox++)
1353 const int hl = hlength[hlidx++];
1354 for(
int iy = 0; iy < vl; iy++)
1357 const float *
i = (
float *)((
char *)in + in_stride * vindex[viidx++]);
1361 for(
int ix = 0; ix < hl; ix++)
1364 const size_t baseidx = (size_t)hindex[hiidx++];
1365 const float htap = hkernel[hkidx++];
1366 vhs +=
i[baseidx] * htap;
1370 const float vtap = vkernel[vkidx++];
1379 float *o = (
float *)((
char *)
out + (size_t)oy * out_stride
1380 + (
size_t)ox *
sizeof(float));
1407 const float *
const in,
1420 const float *
const in,
static void error(char *msg)
return vector dt_simd_set1(valid ?(scaling+NORM_MIN) :NORM_MIN)
const dt_aligned_pixel_t f
static const float const float const float min
const dt_colormatrix_t dt_aligned_pixel_t out
dt_store_simd_aligned(out, dt_mat3x4_mul_vec4(vin, dt_colormatrix_row_to_simd(matrix, 0), dt_colormatrix_row_to_simd(matrix, 1), dt_colormatrix_row_to_simd(matrix, 2)))
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
const char * dt_conf_get_string_const(const char *name)
dt_store_simd(out, value)
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Enable aggressive floating-point arithmetic optimizations, in denormals handling. Set through user pr...
static void copy_pixel_nontemporal(float *const __restrict__ out, const float *const __restrict__ in)
#define dt_pixelpipe_cache_free_align(mem)
static size_t dt_round_size(const size_t size, const size_t alignment)
#define __DT_CLONE_TARGETS__
#define for_four_channels(_var,...)
#define __OMP_PARALLEL_FOR__(...)
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
#define DT_CACHELINE_BYTES
static CameraMetaData * meta
#define dt_omploop_sfence()
const struct dt_interpolation * dt_interpolation_new(enum dt_interpolation_type type)
void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
#define MAX_HALF_FILTER_WIDTH
__DT_CLONE_TARGETS__ void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out, const float x, const float y, const int width, const int height, const int linestride)
static void _interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
static float _maketaps_mitchell(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
static uint32_t roundToNextPowerOfTwo(uint32_t x)
#define RESAMPLING_BORDER_MODE
void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
static __DT_CLONE_TARGETS__ gboolean _prepare_resampling_plan(const struct dt_interpolation *itor, const int in, const int in_x0, const int out, const int out_x0, const float scale, int **plength, float **pkernel, int **pindex, int **pmeta)
void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
void dt_interpolation_resample(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
int dt_interpolation_resample_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
dt_interpolation_cl_global_t * dt_interpolation_init_cl_global()
static float _maketaps_bilinear(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
static const struct dt_interpolation dt_interpolator[]
__DT_CLONE_TARGETS__ float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x, const float y, const int width, const int height, const int samplestride, const int linestride)
#define INTERPOLATION_BORDER_MODE
static __DT_CLONE_TARGETS__ void _interpolation_resample_plain(const struct dt_interpolation *itor, float *const restrict out, const dt_iop_roi_t *const roi_out, const float *const restrict in, const dt_iop_roi_t *const roi_in)
static void _compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first, float *kernel, float *norm, const float outoinratio, const int xout)
static float _maketaps_bicubic(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
@ DT_INTERPOLATION_BICUBIC
@ DT_INTERPOLATION_BILINEAR
@ DT_INTERPOLATION_DEFAULT
@ DT_INTERPOLATION_MITCHELL
@ DT_INTERPOLATION_USERPREF
@ DT_INTERPOLATION_DEFAULT_WARP
@ DT_INTERPOLATION_USERPREF_WARP
static float kernel(const float *x, const float *y)
float *const restrict const size_t k
static float ceil_fast(float x)
float dt_aligned_pixel_t[4]
int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
int dt_opencl_create_kernel(const int prog, const char *name)
void * dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host)
int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst, size_t *region)
void dt_opencl_free_kernel(const int kernel)
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size, const void *arg)
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes, const size_t *local)
void dt_opencl_release_mem_object(cl_mem mem)
#define DT_OPENCL_DEFAULT_ERROR
struct dt_opencl_t * opencl
int kernel_interpolation_resample
dt_interpolation_func maketaps
enum dt_interpolation_type id
Region of interest passed through the pixelpipe.
struct dt_interpolation_cl_global_t * interpolation