interpolation_8c_source.html

/*

    This file is part of darktable,

    Copyright (C) 2012 Christian Tellefsen.

    Copyright (C) 2012 Edouard Gomez.

    Copyright (C) 2012 Jérémy Rosen.

    Copyright (C) 2012 Richard Wonka.

    Copyright (C) 2012-2016, 2019 Tobias Ellinghaus.

    Copyright (C) 2012, 2014-2017 Ulrich Pegelow.

    Copyright (C) 2013 Simon Spannagel.

    Copyright (C) 2014-2016 Roman Lebedev.

    Copyright (C) 2017-2018 luzpaz.

    Copyright (C) 2019 Andreas Schneider.

    Copyright (C) 2019, 2021, 2024-2026 Aurélien PIERRE.

    Copyright (C) 2020-2021 Pascal Obry.

    Copyright (C) 2020-2021 Ralf Brown.

    Copyright (C) 2020-2021 Roman Khatko.

    Copyright (C) 2021-2022 Hanno Schwalm.

    Copyright (C) 2022 Martin Bařinka.

    Copyright (C) 2024 Alynx Zhou.


    darktable is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    darktable is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with darktable.  If not, see <http://www.gnu.org/licenses/>.

*/


#include "common/interpolation.h"

#include "common/darktable.h"

#include "common/math.h"

#include "control/conf.h"


#include <assert.h>

#include <glib.h>

#include <inttypes.h>

#include <stddef.h>

#include <stdint.h>


/* How a sample index that falls outside [min, max] is brought back into the
 * line. The diagrams show what gets read on each side of the 7-sample line
 * "abcdefg". */
enum border_mode
{
  BORDER_REPLICATE, // aaaa|abcdefg|gggg  repeat the edge sample
  BORDER_WRAP,      // defg|abcdefg|abcd  periodic repetition of the line
  BORDER_MIRROR,    // edcb|abcdefg|fedc  reflect around the edge sample (edge not duplicated)
  BORDER_CLAMP      // ....|abcdefg|....  outside samples are rejected
};

/* Supporting them all might be overkill; fixing the mode at compile time lets
 * the compiler trim all unnecessary modes from _clip() in the resampling
 * codepath */
#define RESAMPLING_BORDER_MODE BORDER_REPLICATE

/* Supporting them all might be overkill; fixing the mode at compile time lets
 * the compiler trim all unnecessary modes in the interpolation codepath */
#define INTERPOLATION_BORDER_MODE BORDER_MIRROR

// Defines the maximum kernel half length
// !! Make sure to sync this with the filter array !!
#define MAX_HALF_FILTER_WIDTH 3

// Add *verbose* (like one msg per pixel out) debug message to stderr
#define DEBUG_PRINT_VERBOSE 0

/* --------------------------------------------------------------------------
 * Debug helpers
 * ------------------------------------------------------------------------*/

/* --------------------------------------------------------------------------
 * Generic helpers
 * ------------------------------------------------------------------------*/

/**
 * Bring a sample index back into the inclusive range [min, max].
 *
 * @param i    index to clip, may lie arbitrarily far outside the range
 * @param min  first valid index
 * @param max  last valid index
 * @param mode border handling, see enum border_mode
 * @return an index within [min, max] for REPLICATE, WRAP and MIRROR. For
 *         BORDER_CLAMP an out-of-range index yields -1, which is not a valid
 *         position: callers are expected to filter those taps out beforehand.
 *
 * Indexes already in range are returned untouched. MIRROR and WRAP fold the
 * index periodically, so the result stays in range even when the overshoot is
 * larger than the line itself (e.g. a 1 or 2 samples wide image read through
 * a 4-tap kernel). For overshoots up to one line length the result is the
 * plain single reflection / single wrap.
 */
static inline __attribute__((always_inline)) ssize_t _clip(ssize_t i,
                            const ssize_t min,
                            const ssize_t max,
                            enum border_mode mode)
{
  // Nothing to do for in-range indexes, whatever the mode
  if(i >= min && i <= max) return i;

  switch(mode)
  {
    case BORDER_REPLICATE:
      i = (i < min) ? min : max;
      break;

    case BORDER_MIRROR:
    {
      // i == min - 1  -->  min + 1,  i == max + 1  -->  max - 1, etc.
      const ssize_t span = max - min;
      if(span <= 0)
      {
        // Single-sample (or empty) line: there is only one place to read from
        i = min;
        break;
      }
      // The reflected signal repeats every 2 * span samples
      const ssize_t period = 2 * span;
      ssize_t offset = (i - min) % period;
      if(offset < 0) offset += period; // C remainder takes the sign of the dividend
      i = min + ((offset <= span) ? offset : period - offset);
      break;
    }

    case BORDER_WRAP:
    {
      const ssize_t period = max - min + 1;
      if(period <= 0)
      {
        i = min;
        break;
      }
      ssize_t offset = (i - min) % period;
      if(offset < 0) offset += period;
      i = min + offset;
      break;
    }

    case BORDER_CLAMP:
      /* Should not be used as is, we prevent -1 usage, filtering the taps
       * we clip the sample indexes for. So understand this function is
       * specific to its caller. */
      i = -1;
      break;

    default:
      break;
  }

  return i;
}


/**
 * Compute the half-open range [*tap_first, *tap_last) of kernel taps to use
 * for a kernel of filterwidth taps whose first tap sits on sample index t.
 *
 * Only BORDER_CLAMP shortens the range: taps that would land before sample 0
 * or at/after sample max are skipped. Every other mode keeps the full
 * [0, filterwidth) range and relies on index clipping instead.
 *
 * @param tap_first   out: first tap to use
 * @param tap_last    out: one past the last tap to use
 * @param mode        border handling mode
 * @param filterwidth total number of taps in the kernel
 * @param t           sample index addressed by tap 0
 * @param max         number of samples in the line
 */
static inline __attribute__((always_inline)) void _prepare_tap_boundaries(int *tap_first,
                                           int *tap_last,
                                           const enum border_mode mode,
                                           const int filterwidth,
                                           const int t,
                                           const int max)
{
  const int clamping = (mode == BORDER_CLAMP);

  // Lower bound: drop the taps lying before the first sample
  *tap_first = (clamping && t < 0) ? -t : 0;

  // Upper bound: drop the taps lying past the last sample
  *tap_last = (clamping && t + filterwidth >= max) ? max - t : filterwidth;
}


/* --------------------------------------------------------------------------

 * Interpolation kernels

 * ------------------------------------------------------------------------*/


/* --------------------------------------------------------------------------

 * Bilinear interpolation

 * ------------------------------------------------------------------------*/


/**
 * Fill taps[] with the triangle kernel 1 - |t|, sampled at
 * t = first_tap + k * interval for k = 0, 1, 2...
 *
 * Taps are produced four at a time with aligned vector stores, so the number
 * of floats written is num_taps rounded up to a multiple of 4; the extra
 * entries are meaningless and taps must be sized and aligned accordingly.
 *
 * @param taps      destination buffer
 * @param num_taps  number of taps requested
 * @param width     kernel half-width; unused here, kept so that all tap
 *                  generators share one signature
 * @param first_tap kernel abscissa of the first tap
 * @param interval  abscissa step between consecutive taps
 * @return the kernel norm, always 1.0f
 */
static float _maketaps_bilinear(float *taps,
                                const size_t num_taps,
                                const float  width,
                                const float first_tap,
                                const float interval)
{
  static const dt_aligned_pixel_simd_t lane_index = { 0.0f, 1.0f, 2.0f, 3.0f };
  const dt_aligned_pixel_simd_t step = dt_simd_set1(interval);
  const dt_aligned_pixel_simd_t advance = dt_simd_set1(4.0f * interval);

  // Abscissae of the four taps handled by the current vector
  dt_aligned_pixel_simd_t pos = dt_simd_set1(first_tap) + lane_index * step;

  const int runs = (num_taps + 3) / 4;
  float *dst = taps;

  for(int run = 0; run < runs; run++, dst += 4)
  {
    dt_store_simd_aligned(dst, dt_simd_set1(1.0f) - dt_simd_abs(pos));
    pos += advance;
  }

  return 1.0f; // kernel norm is 1.0f by construction
}


/* --------------------------------------------------------------------------

 * Bicubic interpolation

 * ------------------------------------------------------------------------*/


static float _maketaps_bicubic(float *taps,

                               const size_t num_taps,

                               const float  width,

                               const float first_tap,

                               const float interval)

{

  static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };

  const dt_aligned_pixel_simd_t half = dt_simd_set1(0.5f);

  const dt_aligned_pixel_simd_t two = dt_simd_set1(2.0f);

  const dt_aligned_pixel_simd_t three = dt_simd_set1(3.0f);

  const dt_aligned_pixel_simd_t four = dt_simd_set1(4.0f);

  const dt_aligned_pixel_simd_t five = dt_simd_set1(5.0f);

  const dt_aligned_pixel_simd_t eight = dt_simd_set1(8.0f);

  const dt_aligned_pixel_simd_t interval_v = dt_simd_set1(interval);

  const dt_aligned_pixel_simd_t iter = dt_simd_set1(4.0f * interval);

  dt_aligned_pixel_simd_t vt = dt_simd_set1(first_tap) + bootstrap * interval_v;


  const int runs = (num_taps + 3) / 4;


  for(size_t i = 0; i < runs; i++)

  {

    const dt_aligned_pixel_simd_t vt_abs = dt_simd_abs(vt);

    const dt_aligned_pixel_simd_t t2 = vt * vt;

    const dt_aligned_pixel_simd_t t5 = five * vt_abs;

    const dt_aligned_pixel_simd_t r12 = (vt_abs * (t5 - eight - t2) + four) * half;

    const dt_aligned_pixel_simd_t r01 = ((three * t2 - t5) * vt_abs + two) * half;

    dt_aligned_pixel_simd_t taps4 = r12;

    for_four_channels(c)

      taps4[c] = (vt_abs[c] <= 1.0f) ? r01[c] : r12[c];

    dt_store_simd_aligned(taps + 4 * i, taps4);

    vt += iter;

  }

  return 1.0f; //kernel norm is 1.0f by construction

}


/* --------------------------------------------------------------------------

 * Mitchell-Netravali interpolation (B = C = 1/3)

 *

 * A separable cubic from the Mitchell-Netravali (B,C) family (SIGGRAPH 1988).

 * B = C = 1/3 is the classic general-purpose reconstruction filter: it trades a

 * hair of sharpness for drastically reduced ringing versus interpolating cubics

 * (Catmull-Rom) and windowed-sinc (Lanczos). Its negative excursion is tiny

 * (~3% vs Lanczos3's much larger overshoot), so it is effectively halo-free on

 * photographic content and never blows alpha/colour far out of range at edges.

 *

 * Piecewise weights (already divided by 6), support [-2, 2]:

 *   |t| < 1 : (7/6)|t|^3 - 2|t|^2 + 8/9

 *   1<=|t|<2: -(7/18)|t|^3 + 2|t|^2 - (10/3)|t| + 16/9

 * It is a partition of unity (taps sum to 1 on the integer grid), so the

 * upsampling norm is 1 by construction like bilinear/bicubic; the downsampling

 * path renormalizes from the summed taps separately.

 * ------------------------------------------------------------------------*/


static float _maketaps_mitchell(float *taps,

                                const size_t num_taps,

                                const float  width,

                                const float first_tap,

                                const float interval)

{

  static const dt_aligned_pixel_simd_t bootstrap = { 0.0f, 1.0f, 2.0f, 3.0f };

  const dt_aligned_pixel_simd_t c7_6 = dt_simd_set1(7.0f / 6.0f);

  const dt_aligned_pixel_simd_t c2 = dt_simd_set1(2.0f);

  const dt_aligned_pixel_simd_t c8_9 = dt_simd_set1(8.0f / 9.0f);

  const dt_aligned_pixel_simd_t c7_18 = dt_simd_set1(7.0f / 18.0f);

  const dt_aligned_pixel_simd_t c10_3 = dt_simd_set1(10.0f / 3.0f);

  const dt_aligned_pixel_simd_t c16_9 = dt_simd_set1(16.0f / 9.0f);

  const dt_aligned_pixel_simd_t interval_v = dt_simd_set1(interval);

  const dt_aligned_pixel_simd_t iter = dt_simd_set1(4.0f * interval);

  dt_aligned_pixel_simd_t vt = dt_simd_set1(first_tap) + bootstrap * interval_v;


  const int runs = (num_taps + 3) / 4;


  for(size_t i = 0; i < runs; i++)

  {

    const dt_aligned_pixel_simd_t a = dt_simd_abs(vt);

    const dt_aligned_pixel_simd_t a2 = a * a;

    const dt_aligned_pixel_simd_t a3 = a2 * a;

    // inner lobe (|t| < 1) and outer lobe (1 <= |t| < 2)

    const dt_aligned_pixel_simd_t r01 = c7_6 * a3 - c2 * a2 + c8_9;

    const dt_aligned_pixel_simd_t r12 = c2 * a2 - c7_18 * a3 - c10_3 * a + c16_9;

    dt_aligned_pixel_simd_t taps4 = r12;

    for_four_channels(c)

      taps4[c] = (a[c] <= 1.0f) ? r01[c] : r12[c];

    dt_store_simd_aligned(taps + 4 * i, taps4);

    vt += iter;

  }

  return 1.0f; // kernel norm is 1.0f by construction (partition of unity)

}


/* --------------------------------------------------------------------------

 * All our known interpolators

 * ------------------------------------------------------------------------*/


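/* !!! !!! !!!
 * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the largest .width
 * in this filter list. The on-stack kernel buffers of the per-sample and
 * per-pixel interpolation functions are sized from it (see MAX_KERNEL_REQ),
 * so a wider filter would write past their end.
 * !!! !!! !!!
 */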


/* Table of the available interpolators.
 * dt_interpolation_new() walks it by position, from DT_INTERPOLATION_FIRST up
 * to DT_INTERPOLATION_LAST, and matches entries on .name (user preference) or
 * on .id, so the number of entries has to cover that index range.
 * .width is the kernel half-width in samples: the upsampling path asks
 * .maketaps for 2 * width taps per axis. */
static const struct dt_interpolation dt_interpolator[] = {
  {.id = DT_INTERPOLATION_BILINEAR,
   .name = "bilinear",
   .width = 1,
   .maketaps = &_maketaps_bilinear,
  },
  {.id = DT_INTERPOLATION_BICUBIC,
   .name = "bicubic",
   .width = 2,
   .maketaps = &_maketaps_bicubic,
  },
  {.id = DT_INTERPOLATION_MITCHELL,
   .name = "mitchell",
   .width = 2,
   .maketaps = &_maketaps_mitchell,
  },
};


/* --------------------------------------------------------------------------

 * Kernel utility methods

 * ------------------------------------------------------------------------*/


/**
 * Build the 2 * width taps used to interpolate at fractional position t.
 *
 * kernel[j] receives the weight of input sample (origin + j), where
 * origin = floor(t) - width + 1 is the first sample under the kernel.
 *
 * @param itor   interpolator providing the half-width and the tap generator
 * @param kernel destination for the taps; the tap generators store four
 *               floats at a time, so it must hold 2 * width rounded up to a
 *               multiple of 4
 * @param first  optional out: index of the first contributing sample
 * @param t      position to interpolate at, in input sample units
 * @return the kernel norm reported by the tap generator
 */
static inline __attribute__((always_inline)) float _compute_upsampling_kernel(const struct dt_interpolation *itor,
                                               float *kernel,
                                               int *first,
                                               float t)
{
  // floorf() rather than an int cast: the cast truncates toward zero, which
  // picks the wrong sample for the slightly negative positions that show up
  // at the top and left edges when doing perspective correction
  const int origin = (int)floorf(t) - itor->width + 1;
  if(first) *first = origin;

  // Express the position relative to the first sample under the kernel
  const float phase = t - (float)origin;

  // Taps are one input sample apart, walking away from the first one
  return itor->maketaps(kernel, 2*itor->width, itor->width, phase, -1.0f);
}


/**
 * Build the taps used to produce output sample xout when shrinking by
 * outoinratio (output size / input size).
 *
 * The kernel is laid out in output space: consecutive input samples are
 * outoinratio apart, so the kernel spans about 2 * width / outoinratio input
 * samples.
 *
 * @param itor        interpolator providing the half-width and tap generator
 * @param taps        out (mandatory): number of taps generated
 * @param first       optional out: index of the first contributing input sample
 * @param kernel      destination for the taps; the tap generators store four
 *                    floats at a time, so leave room for rounding the tap
 *                    count up to a multiple of 4
 * @param norm        optional out: sum of the generated taps
 * @param outoinratio output/input sampling ratio
 * @param xout        output sample index
 */
static inline void _compute_downsampling_kernel(const struct dt_interpolation *itor,
                                                int *taps,
                                                int *first,
                                                float *kernel,
                                                float *norm,
                                                const float outoinratio,
                                                const int xout)
{
  const float half_width = (float)itor->width;

  // First input sample whose projection falls at or after (xout - half width)
  const float xin = ceil_fast(((float)xout - half_width) / outoinratio);
  if(first) *first = (int)xin;

  // Kernel abscissa of that first sample, relative to the output sample
  const float phase = xin * outoinratio - (float)xout;

  // Number of input samples from there up to the right edge of the support
  const int count = (int)((half_width - phase) / outoinratio);
  *taps = count;

  itor->maketaps(kernel, count, itor->width, phase, outoinratio);

  // The norm is only summed up on request
  if(!norm) return;

  float sum = 0.0f;
  for(int k = 0; k < count; k++) sum += kernel[k];
  *norm = sum;
}


/* --------------------------------------------------------------------------

 * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)

 * ------------------------------------------------------------------------*/


// Size of a per-axis kernel buffer: 2 * MAX_HALF_FILTER_WIDTH taps, rounded up
// to a multiple of 4 because the tap generators store four floats at a time
#define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))

__DT_CLONE_TARGETS__

/**
 * Interpolate one scalar sample at fractional position (x, y).
 *
 * @param itor         interpolator to use
 * @param in           buffer holding the samples
 * @param x            horizontal position, in samples
 * @param y            vertical position, in lines
 * @param width        number of samples per line
 * @param height       number of lines
 * @param samplestride distance in floats between two horizontally adjacent samples
 * @param linestride   distance in floats between two vertically adjacent samples
 * @return the interpolated value, never negative; 0 when the integer part of
 *         (x, y) lies outside the image
 */
float dt_interpolation_compute_sample(const struct dt_interpolation *itor,
                                      const float *in,
                                      const float x,
                                      const float y,
                                      const int width,
                                      const int height,
                                      const int samplestride,
                                      const int linestride)
{
  assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));

  float DT_ALIGNED_ARRAY kernelh[MAX_KERNEL_REQ];
  float DT_ALIGNED_ARRAY kernelv[MAX_KERNEL_REQ];

  // Compute both horizontal and vertical kernels
  const float normh = _compute_upsampling_kernel(itor, kernelh, NULL, x);
  const float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);

  // NOTE(review): the cast truncates toward zero whereas the kernels above
  // are phased on floorf(). For -1 < x < 0 (same for y) this gives ix == 0
  // while the kernel starts one sample further left, so the taps look shifted
  // by one sample in that case — confirm whether this is intended.
  int ix = (int)x;
  int iy = (int)y;

  /* Two cases: either the whole kernel footprint lies inside the image and
   * the samples can be read directly (fast path), or part of it falls outside
   * and every index has to be clipped first (slow path) */
  float r;
  if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
     && iy < (height - itor->width))
  {
    // Inside image boundary case

    // Go to the top left sample of the footprint. The offset is computed in
    // ssize_t, like in the clipped path below, so that it cannot overflow an
    // int on very large buffers.
    const float *row = in + (ssize_t)linestride * iy + (ssize_t)ix * samplestride;
    row = row - (itor->width - 1) * (samplestride + linestride);

    // Apply the kernel
    float s = 0.f;
    for(int i = 0; i < 2 * itor->width; i++)
    {
      float h = 0.0f;
      for(int j = 0; j < 2 * itor->width; j++)
      {
        h += kernelh[j] * row[j * samplestride];
      }
      s += kernelv[i] * h;
      row += linestride;
    }
    r = fmaxf(0.0f, s / (normh * normv));
  }
  else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
  {
    // At least a valid coordinate

    // Point to the upper left sample, index wise
    iy -= itor->width - 1;
    ix -= itor->width - 1;

    static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
    assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong

    int xtap_first;
    int xtap_last;
    _prepare_tap_boundaries(&xtap_first, &xtap_last,
                           bordermode, 2 * itor->width, ix, width);

    int ytap_first;
    int ytap_last;
    _prepare_tap_boundaries(&ytap_first, &ytap_last,
                           bordermode, 2 * itor->width, iy, height);

    // Apply the kernel, clipping every index back into the image
    float s = 0.f;
    for(ssize_t i = ytap_first; i < ytap_last; i++)
    {
      const ssize_t clip_y = _clip(iy + i, 0, height - 1, bordermode);
      float h = 0.0f;
      for(ssize_t j = xtap_first; j < xtap_last; j++)
      {
        const ssize_t clip_x = _clip(ix + j, 0, width - 1, bordermode);
        const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
        h += kernelh[j] * ipixel[0];
      }
      s += kernelv[i] * h;
    }

    r = fmaxf(0.0f, s / (normh * normv));
  }
  else
  {
    // invalid coordinate
    r = 0.0f;
  }
  return r;
}


/* --------------------------------------------------------------------------

 * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)

 * ------------------------------------------------------------------------*/


__DT_CLONE_TARGETS__

/**
 * Interpolate one 4-channel pixel at fractional position (x, y).
 *
 * @param itor       interpolator to use
 * @param in         buffer of 4-float pixels; pixels are read with aligned
 *                   vector loads
 * @param out        destination for the 4 interpolated channels
 * @param x          horizontal position, in pixels
 * @param y          vertical position, in lines
 * @param width      number of pixels per line
 * @param height     number of lines
 * @param linestride distance in floats between two vertically adjacent pixels
 *
 * Negative results are clipped to zero. When the integer part of (x, y) lies
 * outside the image, all four channels are set to zero.
 */
void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor,
                                      const float *in,
                                      float *out,
                                      const float x,
                                      const float y,
                                      const int width,
                                      const int height,
                                      const int linestride)
{
  assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));

  // Quite a bit of space for kernels
  float DT_ALIGNED_ARRAY kernelh[MAX_KERNEL_REQ];
  float DT_ALIGNED_ARRAY kernelv[MAX_KERNEL_REQ];

  // Compute both horizontal and vertical kernels
  const float normh = _compute_upsampling_kernel(itor, kernelh, NULL, x);
  const float normv = _compute_upsampling_kernel(itor, kernelv, NULL, y);

  // Precompute the inverse of the filter norm for later use
  const float oonorm = (1.f / (normh * normv));

  // NOTE(review): the cast truncates toward zero whereas the kernels above
  // are phased on floorf(). For -1 < x < 0 (same for y) this gives ix == 0
  // while the kernel starts one pixel further left, so the taps look shifted
  // by one pixel in that case — confirm whether this is intended.
  int ix = (int)x;
  int iy = (int)y;

  /* Two cases: either the whole kernel footprint lies inside the image and
   * the pixels can be read directly (fast path), or part of it falls outside
   * and every index has to be clipped first (slow path) */
  if(ix >= (itor->width - 1)
    && iy >= (itor->width - 1)
    && ix < (width - itor->width)
    && iy < (height - itor->width))
  {
    // Inside image boundary case

    // Go to the top left pixel of the footprint. The offset is computed in
    // ssize_t, like in the clipped path below, so that it cannot overflow an
    // int on very large buffers.
    const float *row = in + (ssize_t)linestride * iy + (ssize_t)ix * 4;
    row = row - (itor->width - 1) * (4 + linestride);

    const size_t itor_width = 2 * itor->width;

    // Apply the kernel
    dt_aligned_pixel_simd_t pixel = dt_simd_set1(0.0f);
    for(size_t i = 0; i < itor_width; i++)
    {
      dt_aligned_pixel_simd_t h = dt_simd_set1(0.0f);
      for(size_t j = 0; j < itor_width; j++)
        h += dt_load_simd_aligned(row + 4 * j) * dt_simd_set1(kernelh[j]);
      pixel += h * dt_simd_set1(kernelv[i]);
      row += linestride;
    }

    dt_store_simd(out, dt_simd_max_zero(pixel * dt_simd_set1(oonorm)));
  }
  else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
  {
    // At least a valid coordinate

    // Point to the upper left pixel, index wise
    iy -= itor->width - 1;
    ix -= itor->width - 1;

    static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
    assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong

    int xtap_first;
    int xtap_last;
    _prepare_tap_boundaries(&xtap_first, &xtap_last,
                           bordermode, 2 * itor->width, ix, width);

    int ytap_first;
    int ytap_last;
    _prepare_tap_boundaries(&ytap_first, &ytap_last,
                           bordermode, 2 * itor->width, iy, height);

    // Apply the kernel, clipping every index back into the image
    dt_aligned_pixel_simd_t pixel = dt_simd_set1(0.0f);
    for(ssize_t i = ytap_first; i < ytap_last; i++)
    {
      const ssize_t clip_y = _clip(iy + i, 0, height - 1, bordermode);
      dt_aligned_pixel_simd_t h = dt_simd_set1(0.0f);
      const float *ipixel = in + clip_y * linestride;
      for(ssize_t j = xtap_first; j < xtap_last; j++)
      {
        const ssize_t clip_x = _clip(ix + j, 0, width - 1, bordermode);
        h += dt_load_simd_aligned(ipixel + 4 * clip_x) * dt_simd_set1(kernelh[j]);
      }
      pixel += h * dt_simd_set1(kernelv[i]);
    }

    dt_store_simd(out, dt_simd_max_zero(pixel * dt_simd_set1(oonorm)));
  }
  else
  {
    // data for *out has no valid *in location so just set to zero.
    dt_store_simd(out, dt_simd_set1(0.0f));
  }
}


/* --------------------------------------------------------------------------

 * Interpolation factory

 * ------------------------------------------------------------------------*/


const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)

{

  const struct dt_interpolation *itor = NULL;


  if(type == DT_INTERPOLATION_USERPREF)

  {

    // Find user preferred interpolation method

    const char *uipref =

      dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator");


    for(int i = DT_INTERPOLATION_FIRST;

        uipref && i < DT_INTERPOLATION_LAST;

        i++)

    {

      if(!strcmp(uipref, dt_interpolator[i].name))

      {

        // Found the one

        itor = &dt_interpolator[i];

        break;

      }

    }


    /* In the case the search failed (!uipref or name not found),

     * prepare later search pass with default fallback */

    type = DT_INTERPOLATION_DEFAULT;

  }

  else if(type == DT_INTERPOLATION_USERPREF_WARP)

  {

    // Find user preferred interpolation method

    const char *uipref =

      dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator_warp");

    for(int i = DT_INTERPOLATION_FIRST;

        uipref && i < DT_INTERPOLATION_LAST;

        i++)

    {

      if(!strcmp(uipref, dt_interpolator[i].name))

      {

        // Found the one

        itor = &dt_interpolator[i];

        break;

      }

    }


    /* In the case the search failed (!uipref or name not found),

     * prepare later search pass with default fallback */

    type = DT_INTERPOLATION_DEFAULT_WARP;

  }

  if(IS_NULL_PTR(itor))

  {

    // Did not find the userpref one or we've been asked for a specific one

    for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)

    {

      if(dt_interpolator[i].id == type)

      {

        itor = &dt_interpolator[i];

        break;

      }

      if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)

      {

        itor = &dt_interpolator[i];

      }

    }

  }


  return itor;

}


/* --------------------------------------------------------------------------

 * Image resampling

 * ------------------------------------------------------------------------*/


__DT_CLONE_TARGETS__

/**
 * Precompute, for one axis, everything needed to resample a line of `in`
 * samples into `out` samples.
 *
 * For every output sample the plan stores, back to back:
 *  - in *plength: the number of contributing input samples,
 *  - in *pkernel: their weights, already normalized to sum to 1,
 *  - in *pindex:  their indexes in the input line, clipped to [0, in - 1],
 *  - in *pmeta (optional): three ints giving where this output sample starts
 *    in the length, kernel and index arrays, allowing random access.
 *
 * All arrays live in one single allocation that starts at *plength; freeing
 * *plength releases the whole plan.
 *
 * @param itor    interpolator to use
 * @param in      number of input samples
 * @param in_x0   origin of the input line
 * @param out     number of output samples
 * @param out_x0  origin of the output line
 * @param scale   output/input sampling ratio (> 1 enlarges)
 * @param plength out: length array (and owner of the allocation)
 * @param pkernel out: kernel array
 * @param pindex  out: index array
 * @param pmeta   optional out: meta array, pass NULL if not needed
 * @return TRUE if the allocation failed, FALSE otherwise. When scale is
 *         exactly 1 no plan is needed: FALSE is returned and all outputs
 *         are left NULL.
 */
static gboolean _prepare_resampling_plan(const struct dt_interpolation *itor,
                                         const int in,
                                         const int in_x0,
                                         const int out,
                                         const int out_x0,
                                         const float scale,
                                         int **plength,
                                         float **pkernel,
                                         int **pindex,
                                         int **pmeta)
{
  // Start from an empty plan so that the outputs are defined on every return
  *plength = NULL;
  *pkernel = NULL;
  *pindex = NULL;
  if(pmeta) *pmeta = NULL;

  // No resampling required
  if(scale == 1.f) return FALSE;

  const gboolean upscale = (scale > 1.f);

  // Worst case number of taps per output sample
  int maxtapsapixel;
  if(upscale)
  {
    // Upscale... the easy one. The values are exact
    maxtapsapixel = 2 * itor->width;
  }
  else
  {
    // Downscale... going for worst case values memory wise
    maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
  }

  // Size every section, each one padded to a whole number of cache lines
  const int nlengths = out;
  const int nindex = maxtapsapixel * out;
  const int nkernel = maxtapsapixel * out;
  const size_t lengthreq = dt_round_size(nlengths * sizeof(int), DT_CACHELINE_BYTES);
  const size_t indexreq = dt_round_size(nindex * sizeof(int), DT_CACHELINE_BYTES);
  const size_t kernelreq = dt_round_size(nkernel * sizeof(float), DT_CACHELINE_BYTES);
  // 4 extra floats because the tap generators compute four taps at a time
  const size_t scratchreq = dt_round_size(maxtapsapixel * sizeof(float) + 4 * sizeof(float), DT_CACHELINE_BYTES);
  const size_t metareq = dt_round_size(pmeta ? 4 * sizeof(int) * out : 0, DT_CACHELINE_BYTES);

  const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
  void *blob = dt_pixelpipe_cache_alloc_align_cache(totalreq, 0);
  if(IS_NULL_PTR(blob)) return TRUE;

  // Carve the sections out of the blob: lengths, indexes, kernels, scratch, meta
  char *cursor = (char *)blob;
  int *lengths = (int *)cursor;
  cursor += lengthreq;
  int *index = (int *)cursor;
  cursor += indexreq;
  float *kernel = (float *)cursor;
  cursor += kernelreq;
  float *scratchpad = scratchreq ? (float *)cursor : NULL;
  cursor += scratchreq;
  int *meta = metareq ? (int *)cursor : NULL;

  /* setting this as a const should help the compilers trim all unnecessary
   * codepaths */
  const enum border_mode bordermode = RESAMPLING_BORDER_MODE;

  // Write cursors in the kernel, index, length and meta arrays
  int kidx = 0;
  int iidx = 0;
  int lidx = 0;
  int midx = 0;

  for(int x = 0; x < out; x++)
  {
    if(meta)
    {
      // Remember where this output sample starts in each array
      meta[midx++] = lidx;
      meta[midx++] = kidx;
      meta[midx++] = iidx;
    }

    // Raw (not yet normalized) kernel for this output sample
    int first;
    int taps;
    if(upscale)
    {
      // Projected position in input samples
      float fx = (float)(out_x0 + x) / scale - in_x0;

      // Compute the filter kernel at that position
      (void)_compute_upsampling_kernel(itor, scratchpad, &first, fx);
      taps = 2 * itor->width;
    }
    else
    {
      // Compute downsampling kernel centered on output position
      _compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
    }

    /* Check lower and higher bound sample index and skip as many samples as
     * necessary to fall into range */
    int tap_first;
    int tap_last;
    _prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);

    // Track number of taps that will be used
    lengths[lidx++] = tap_last - tap_first;

    // Precompute the inverse of the norm
    float sum = 0.f;
    for(int tap = tap_first; tap < tap_last; tap++) sum += scratchpad[tap];
    const float inv_norm = 1.f / sum;

    /* Store the normalized kernel so that consumers never have to divide by
     * the norm, and build the input index list in the same pass */
    int src = first + tap_first;
    for(int tap = tap_first; tap < tap_last; tap++)
    {
      kernel[kidx++] = scratchpad[tap] * inv_norm;
      index[iidx++] = _clip(src++, 0, in - 1, bordermode);
    }
  }

  // Hand the plan over to the caller
  *plength = lengths;
  *pindex = index;
  *pkernel = kernel;
  if(pmeta) *pmeta = meta;

  return FALSE;
}


// Number of output rows handled per iteration of the parallel 1:1 copy loop
#define TILE_ROWS 128

__DT_CLONE_TARGETS__

/**
 * Resample a 4-float-per-pixel buffer from roi_in to roi_out.
 *
 * @param itor    interpolator to use
 * @param out     output buffer, roi_out->width x roi_out->height pixels
 * @param roi_out output region: size, origin and scale
 * @param in      input buffer, roi_in->width pixels per line
 * @param roi_in  input region: size, origin and scale
 *
 * When no scaling is detected the requested window is simply copied.
 * Otherwise one resampling plan per axis is built and applied to each output
 * pixel; negative results are clipped to zero. If a plan cannot be allocated,
 * the output is left untouched.
 */
static void _interpolation_resample_plain(const struct dt_interpolation *itor,
                                          float *const restrict out,
                                          const dt_iop_roi_t *const roi_out,
                                          const float *const restrict in,
                                          const dt_iop_roi_t *const roi_in)
{
  // Horizontal plan
  int *hlength = NULL;
  float *hkernel = NULL;
  int *hindex = NULL;
  // Vertical plan, with meta data for random access to any output row
  int *vlength = NULL;
  float *vkernel = NULL;
  int *vindex = NULL;
  int *vmeta = NULL;

  const int32_t in_stride_floats = roi_in->width * 4;
  const int32_t out_stride_floats = roi_out->width * 4;

  // Fast code path for 1:1 copy, only cropping area can change
  // NOTE(review): this path is also taken when roi_out->scale == 1.f whatever
  // roi_in->scale is, although the generic path below resamples by
  // roi_out->scale / roi_in->scale — confirm callers never rely on a
  // differing roi_in->scale in that case.
  if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
  {
    // Byte offset of the first column and line offset of the first row to copy
    const size_t x0 = (roi_out->x - roi_in->x) * 4 * sizeof(float);
    const size_t y0 = (roi_out->y - roi_in->y);
    const size_t out_row_bytes = (size_t)out_stride_floats * sizeof(float);
    const size_t in_row_bytes = (size_t)in_stride_floats * sizeof(float);
    __OMP_PARALLEL_FOR__()
    for(int yt = 0; yt < roi_out->height; yt += TILE_ROWS)
    {
      const int y_end = MIN(yt + TILE_ROWS, roi_out->height);
      for(int y = yt; y < y_end; y++)
      {
        char *const dst = (char *)__builtin_assume_aligned(out, 64) + out_row_bytes * y;
        const char *const src = (char *)__builtin_assume_aligned(in, 64) + in_row_bytes * (y + y0) + x0;
        memcpy(dst, src, out_row_bytes);
      }
    }

    // All done, so easy case
    return;
  }

  // Generic non 1:1 case

  // The actual resampling ratio between the two buffers,
  // not the absolute pipeline scale
  const float resample_scale = roi_out->scale / roi_in->scale;

  if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
                              roi_out->width, roi_out->x, resample_scale,
                              &hlength, &hkernel, &hindex, NULL))
    goto exit;

  if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
                              roi_out->height, roi_out->y, resample_scale,
                              &vlength, &vkernel, &vindex, &vmeta))
    goto exit;

  const size_t height = roi_out->height;
  const size_t width = roi_out->width;

  // Process each output line
  __OMP_PARALLEL_FOR__()
  for(size_t oy = 0; oy < height; oy++)
  {
    // The meta array tells where this row starts in the vertical plan:
    // [0] length entry, [1] first kernel tap, [2] first input line index
    const int *const row_meta = vmeta + 3 * oy;
    const int vl = vlength[row_meta[0]]; // number of input lines contributing
    const float *const row_vkernel = vkernel + row_meta[1];
    const int *const row_vindex = vindex + row_meta[2];

    float *const out_row = out + (size_t)oy * out_stride_floats;

    // The horizontal plan is walked sequentially from the start of the row
    int hkidx = 0;

    // Process each output column
    for(size_t ox = 0; ox < width; ox++)
    {
      // Number of horizontal samples contributing to this output pixel
      const int hl = hlength[ox];
      const int *const col_hindex = hindex + hkidx;
      const float *const col_hkernel = hkernel + hkidx;

      // This will hold the resulting pixel
      dt_aligned_pixel_simd_t vs = dt_simd_set1(0.0f);

      for(int iy = 0; iy < vl; iy++)
      {
        // Start of the contributing input line
        const size_t line_base = (size_t)row_vindex[iy] * in_stride_floats;

        // Horizontal pass over that line with the precomputed kernel
        dt_aligned_pixel_simd_t vhs = dt_simd_set1(0.0f);
        for(int ix = 0; ix < hl; ix++)
        {
          const size_t pixel_base = line_base + (size_t)col_hindex[ix] * 4;
          vhs += dt_load_simd_aligned(in + pixel_base) * dt_simd_set1(col_hkernel[ix]);
        }

        // Accumulate contribution from this line
        vs += vhs * dt_simd_set1(row_vkernel[iy]);
      }

      // Clip the negative values that kernels with negative lobes can
      // produce: negative RGB is invalid no matter the RGB space
      // (light is positive)
      dt_aligned_pixel_t pixel;
      dt_store_simd_aligned(pixel, dt_simd_max_zero(vs));
      copy_pixel_nontemporal(out_row + (size_t)ox * 4, pixel);

      // The vertical support is fixed for the whole output row. Only the
      // horizontal plan advances from one output column to the next.
      hkidx += hl;
    }
  }

  dt_omploop_sfence();

exit:
  /* Free the resampling plans. Each plan is a single allocation whose start
   * is its length array, so freeing that one pointer releases it all. */
  dt_pixelpipe_cache_free_align(hlength);
  dt_pixelpipe_cache_free_align(vlength);
}


/**
 * Resample a float image (4 floats per pixel) from one region of interest
 * to another.
 *
 * Public entry point: every argument is forwarded unchanged to
 * _interpolation_resample_plain(), which does the actual work.
 *
 * @param itor     interpolator used to build the resampling kernels
 * @param out      destination buffer, described by roi_out
 * @param roi_out  region of interest of the destination buffer
 * @param in       source buffer, described by roi_in
 * @param roi_in   region of interest of the source buffer
 */
void dt_interpolation_resample(const struct dt_interpolation *itor,
                               float *out,
                               const dt_iop_roi_t *const roi_out,
                               const float *const in,
                               const dt_iop_roi_t *const roi_in)
{
  // Plain call: ISO C does not allow `return <expression>;` in a function
  // returning void, even when the expression itself has type void.
  _interpolation_resample_plain(itor, out, roi_out, in, roi_in);
}


/**
 * ROI variant of dt_interpolation_resample().
 *
 * Both regions of interest are duplicated on the stack and the duplicates
 * are handed to dt_interpolation_resample(). Their x/y origins are
 * forwarded as they are: no reset to zero is performed.
 *
 * @param itor     interpolator used to build the resampling kernels
 * @param out      destination buffer
 * @param roi_out  region of interest of the destination buffer
 * @param in       source buffer
 * @param roi_in   region of interest of the source buffer
 */
void dt_interpolation_resample_roi(const struct dt_interpolation *itor,
                                   float *out,
                                   const dt_iop_roi_t *const roi_out,
                                   const float *const in,
                                   const dt_iop_roi_t *const roi_in)
{
  // Private copies, so the callee never sees the caller's structures
  dt_iop_roi_t region_in = *roi_in;
  dt_iop_roi_t region_out = *roi_out;

  dt_interpolation_resample(itor, out, &region_out, in, &region_in);
}


#ifdef HAVE_OPENCL


/**
 * Allocate the global OpenCL state of the interpolation code and create the
 * "interpolation_resample" kernel.
 *
 * @return a newly allocated structure, to be released with
 *         dt_interpolation_free_cl_global(), or NULL when the allocation
 *         failed.
 */
dt_interpolation_cl_global_t *dt_interpolation_init_cl_global()
{
  dt_interpolation_cl_global_t *g = malloc(sizeof *g);

  // Out of memory: report it to the caller instead of dereferencing NULL below
  if(IS_NULL_PTR(g)) return NULL;

  const int program = 2; // basic.cl, from programs.conf
  g->kernel_interpolation_resample =
    dt_opencl_create_kernel(program, "interpolation_resample");
  return g;
}


/**
 * Release the global OpenCL state created by
 * dt_interpolation_init_cl_global().
 *
 * @param g  state to release; a NULL pointer is accepted and ignored
 */
void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
{
  // Nothing was ever created: nothing to tear down
  if(IS_NULL_PTR(g))
  {
    return;
  }

  // Give the kernel back first, then the structure that held its handle
  const int resample_kernel = g->kernel_interpolation_resample;
  dt_opencl_free_kernel(resample_kernel);
  dt_free(g);
}


/**
 * Round a 32-bit value up to a power of two.
 *
 * For 1 <= x <= 2^31 the result is the smallest power of two that is not
 * below x (powers of two are returned unchanged). The arithmetic wraps
 * modulo 2^32, so both x == 0 and x > 2^31 yield 0.
 *
 * @param x  value to round up
 * @return the rounded value, or 0 on wrap-around
 */
static uint32_t roundToNextPowerOfTwo(uint32_t x)
{
  // Start one below x so that exact powers of two map onto themselves
  uint32_t v = x - 1u;

  // Smear the highest set bit into every lower position (shifts 1, 2, 4, 8, 16)
  for(unsigned shift = 1; shift < 32; shift <<= 1) v |= v >> shift;

  // All-ones below the top bit, plus one, is the next power of two
  return v + 1u;
}


/**
 * Resample an image from dev_in to dev_out on an OpenCL device.
 *
 * When no rescaling is requested the input region is simply copied with
 * dt_opencl_enqueue_copy_image(). Otherwise a horizontal and a vertical
 * resampling plan are built on the host with _prepare_resampling_plan(),
 * uploaded to the device, and the "interpolation_resample" kernel is
 * enqueued.
 *
 * @param itor     interpolator used to build the resampling plans
 * @param devid    OpenCL device to run on
 * @param dev_out  destination image
 * @param roi_out  region of interest of the destination
 * @param dev_in   source image
 * @param roi_in   region of interest of the source
 * @return CL_SUCCESS when the 1:1 copy succeeded; otherwise the status of
 *         the last operation attempted: the result of the kernel enqueue,
 *         CL_INVALID_WORK_GROUP_SIZE when the work group cannot hold all
 *         vertical taps, DT_OPENCL_DEFAULT_ERROR when building a plan
 *         failed, or -999 when uploading a plan buffer failed.
 */
int dt_interpolation_resample_cl(const struct dt_interpolation *itor,
                                 const int devid,
                                 cl_mem dev_out,
                                 const dt_iop_roi_t *const roi_out,
                                 cl_mem dev_in,
                                 const dt_iop_roi_t *const roi_in)
{
  // Host-side resampling plans (h* = horizontal, v* = vertical), filled in
  // by _prepare_resampling_plan() further down
  int *hindex = NULL;
  int *hlength = NULL;
  float *hkernel = NULL;
  int *hmeta = NULL;
  int *vindex = NULL;
  int *vlength = NULL;
  float *vkernel = NULL;
  int *vmeta = NULL;

  cl_int err = DT_OPENCL_DEFAULT_ERROR;

  // Device-side copies of the plans. They start out NULL so that the
  // cleanup code at the `error` label can be reached from any point.
  cl_mem dev_hindex = NULL;
  cl_mem dev_hlength = NULL;
  cl_mem dev_hkernel = NULL;
  cl_mem dev_hmeta = NULL;
  cl_mem dev_vindex = NULL;
  cl_mem dev_vlength = NULL;
  cl_mem dev_vkernel = NULL;
  cl_mem dev_vmeta = NULL;

  // Fast code path for 1:1 copy, only cropping area can change
  if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
  {
    // The source origin is the position of the output region relative to the input region
    size_t iorigin[] = { roi_out->x - roi_in->x, roi_out->y - roi_in->y, 0 };
    size_t oorigin[] = { 0, 0, 0 };
    size_t region[] = { roi_out->width, roi_out->height, 1 };

    // copy original input from dev_in -> dev_out as starting point
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
    if(err != CL_SUCCESS) goto error;

    // All done, so easy case
    return CL_SUCCESS;
  }

  // Generic non 1:1 case... much more complicated :D

  // The actual resampling ratio between the two buffers,
  // not the absolute pipeline scale
  const float resample_scale = roi_out->scale / roi_in->scale;

  // Horizontal plan; a non-zero return value is treated as failure
  if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
                              roi_out->width, roi_out->x, resample_scale,
                              &hlength, &hkernel, &hindex, &hmeta))
    goto error;

  // Vertical plan, built the same way from the heights and y origins
  if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
                              roi_out->height, roi_out->y, resample_scale,
                              &vlength, &vkernel, &vindex, &vmeta))
    goto error;

  // Largest number of taps used by any output column, resp. any output row
  int hmaxtaps = -1, vmaxtaps = -1;
  for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
  for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);

  // strategy: process image column-wise (local[0] = 1). For each row generate
  // a number of parallel work items each taking care of one horizontal convolution,
  // then sum over work items to do the vertical convolution

  const int kernel = darktable.opencl->interpolation->kernel_interpolation_resample;
  const int width = roi_out->width;
  const int height = roi_out->height;

  // make sure blocksize is not too large
  const int taps = roundToNextPowerOfTwo(vmaxtaps);
  // the number of work items per row rounded up to a power of 2
  // (for quick recursive reduction)

  int vblocksize;

  // Local-memory request handed to dt_opencl_local_buffer_opt(): one cell of
  // 4 floats per work item, plus hmaxtaps floats and hmaxtaps ints of fixed
  // overhead. These match the sizes of kernel arguments 16, 14 and 15 below.
  // NOTE(review): sizey presumably is an upper bound that the helper shrinks
  // to what the device supports; confirm against dt_opencl_local_buffer_opt().
  dt_opencl_local_buffer_t locopt
    = (dt_opencl_local_buffer_t)
        { .xoffset = 0,
          .xfactor = 1,
          .yoffset = 0,
          .yfactor = 1,
          .cellsize = 4 * sizeof(float),
          .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
          .sizex = 1,
          .sizey = (1 << 16) * taps };

  // Use the size reported back in locopt.sizey when the helper returns
  // non-zero; otherwise fall back to a block size of 1
  if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
    vblocksize = locopt.sizey;
  else
    vblocksize = 1;

  if(vblocksize < taps)
  {
    // our strategy does not work: the vertical number of taps exceeds
    // the vertical workgroupsize; there is no point in continuing on
    // the GPU - that would be way too slow; let's delegate the stuff
    // to the CPU then.
    err = CL_INVALID_WORK_GROUP_SIZE;
    goto error;
  }

  // Global size: `taps` work items along y for every output row, passed
  // through ROUNDUP() with vblocksize; local size: one column wide,
  // vblocksize work items high
  size_t sizes[3] = { ROUNDUPDWD(width, devid), ROUNDUP(height * taps, vblocksize), 1 };
  size_t local[3] = { 1, vblocksize, 1 };

  // store resampling plan to device memory hindex, vindex, hkernel,
  // vkernel: (v|h)maxtaps might be too small, so store a bit more
  // than needed
  // Status reported if any of the uploads below returns NULL
  err = -999;

  dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
  if(IS_NULL_PTR(dev_hindex)) goto error;

  dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
  if(IS_NULL_PTR(dev_hlength)) goto error;

  dev_hkernel = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
  if(IS_NULL_PTR(dev_hkernel)) goto error;

  // Three ints of metadata per output column
  dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
  if(IS_NULL_PTR(dev_hmeta)) goto error;

  dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
  if(IS_NULL_PTR(dev_vindex)) goto error;

  dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
  if(IS_NULL_PTR(dev_vlength)) goto error;

  dev_vkernel = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
  if(IS_NULL_PTR(dev_vkernel)) goto error;

  // Three ints of metadata per output row
  dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
  if(IS_NULL_PTR(dev_vmeta)) goto error;

  // Kernel arguments: images, output dimensions, then the eight plan buffers
  dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
  dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
  dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
  dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
  dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
  dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
  dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
  dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
  dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
  dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
  dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
  dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
  dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
  dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
  // Arguments 14-16 carry a size but no value.
  // NOTE(review): presumably these are local-memory scratch buffers (horizontal
  // taps, horizontal indexes, one 4-float cell per work item); confirm
  // against the kernel signature in basic.cl.
  dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
  dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
  dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
  err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);

  // Common exit: reached on success as well as on failure
error:

  dt_opencl_release_mem_object(dev_hindex);
  dt_opencl_release_mem_object(dev_hlength);
  dt_opencl_release_mem_object(dev_hkernel);
  dt_opencl_release_mem_object(dev_hmeta);
  dt_opencl_release_mem_object(dev_vindex);
  dt_opencl_release_mem_object(dev_vlength);
  dt_opencl_release_mem_object(dev_vkernel);
  dt_opencl_release_mem_object(dev_vmeta);
  // Only the length arrays are freed on the host side.
  // NOTE(review): presumably the other plan arrays share that allocation, as
  // the CPU code paths of this file state; confirm in _prepare_resampling_plan().
  dt_pixelpipe_cache_free_align(hlength);
  dt_pixelpipe_cache_free_align(vlength);
  return err;
}


/**
 * ROI variant of dt_interpolation_resample_cl().
 *
 * Both regions of interest are duplicated on the stack and the duplicates
 * are handed to dt_interpolation_resample_cl(). Their x/y origins are
 * forwarded as they are: no reset to zero is performed.
 *
 * @param itor     interpolator used to build the resampling plans
 * @param devid    OpenCL device to run on
 * @param dev_out  destination image
 * @param roi_out  region of interest of the destination
 * @param dev_in   source image
 * @param roi_in   region of interest of the source
 * @return whatever dt_interpolation_resample_cl() returns
 */
int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor,
                                     const int devid,
                                     cl_mem dev_out,
                                     const dt_iop_roi_t *const roi_out,
                                     cl_mem dev_in,
                                     const dt_iop_roi_t *const roi_in)
{
  // Private copies, so the callee never sees the caller's structures
  dt_iop_roi_t region_in = *roi_in;
  dt_iop_roi_t region_out = *roi_out;

  const int status
      = dt_interpolation_resample_cl(itor, devid, dev_out, &region_out, dev_in, &region_in);
  return status;
}


#endif


/**
 * Resample a single-channel float image (one float per pixel).
 *
 * When no rescaling is requested, the requested window of the input is
 * copied line by line. Otherwise a horizontal and a vertical resampling
 * plan are built with _prepare_resampling_plan() and every output sample is
 * computed as a separable weighted sum: each contributing input line is
 * first filtered horizontally, and the per-line results are then combined
 * with the vertical taps. The result is stored as computed, without any
 * clamping.
 *
 * NOTE(review): the plans are built with roi_out->scale, whereas
 * dt_interpolation_resample_cl() uses roi_out->scale / roi_in->scale;
 * confirm that this difference is intended.
 *
 * @param itor     interpolator used to build the resampling plans
 * @param out      destination buffer, roi_out->width floats per line
 * @param roi_out  region of interest of the destination
 * @param in       source buffer, roi_in->width floats per line
 * @param roi_in   region of interest of the source
 */
static void _interpolation_resample_1c_plain(const struct dt_interpolation *itor,
                                             float *out,
                                             const dt_iop_roi_t *const roi_out,
                                             const float *const in,
                                             const dt_iop_roi_t *const roi_in)
{
  // Resampling plans: h* describe the horizontal pass, v* the vertical one
  int *hlength = NULL;
  int *hindex = NULL;
  float *hkernel = NULL;
  int *vlength = NULL;
  int *vindex = NULL;
  float *vkernel = NULL;
  int *vmeta = NULL;

  // Size in bytes of one line of each buffer
  const size_t in_stride = roi_in->width * sizeof(float);
  const size_t out_stride = roi_out->width * sizeof(float);

  // No rescaling needed: only the cropping window can differ, so copy lines
  if(roi_out->scale == 1.f || roi_out->scale == roi_in->scale)
  {
    // Position of the output window inside the input: bytes along x, lines along y
    const size_t x0 = (roi_out->x - roi_in->x) * sizeof(float);
    const size_t y0 = (roi_out->y - roi_in->y);
    __OMP_PARALLEL_FOR__()
    for(int y = 0; y < roi_out->height; y++)
    {
      const char *const src = (const char *)in + in_stride * (y + y0) + x0;
      char *const dst = (char *)out + out_stride * y;
      memcpy(dst, src, out_stride);
    }
    return;
  }

  // Real resampling: build both plans first; a non-zero return value aborts.
  // No metadata is requested for the horizontal plan.
  if(_prepare_resampling_plan(itor, roi_in->width, roi_in->x,
                              roi_out->width, roi_out->x, roi_out->scale,
                              &hlength, &hkernel, &hindex, NULL))
    goto exit;

  if(_prepare_resampling_plan(itor, roi_in->height, roi_in->y,
                              roi_out->height, roi_out->y, roi_out->scale,
                              &vlength, &vkernel, &vindex, &vmeta))
    goto exit;

  // One iteration per output line
  __OMP_PARALLEL_FOR__()
  for(int oy = 0; oy < roi_out->height; oy++)
  {
    // vmeta holds three ints per output line: where to find its length,
    // its first kernel tap and its first input line index
    const int vl = vlength[vmeta[3 * oy + 0]]; // number of contributing input lines
    const int vk0 = vmeta[3 * oy + 1];         // first vertical tap
    const int vi0 = vmeta[3 * oy + 2];         // first vertical index

    float *const out_line = (float *)((char *)out + (size_t)oy * out_stride);

    // Start of the current column's entries in hindex/hkernel; both arrays
    // are walked in lockstep, and the walk restarts on every output line
    int h0 = 0;

    for(int ox = 0; ox < roi_out->width; ox++)
    {
      // Number of horizontal samples contributing to this output column
      const int hl = hlength[ox];

      float vs = 0.0f;
      for(int iy = 0; iy < vl; iy++)
      {
        // Input line selected by the vertical plan
        const float *const line = (const float *)((const char *)in + in_stride * vindex[vi0 + iy]);

        // Horizontal filtering of that line
        float vhs = 0.0f;
        for(int ix = 0; ix < hl; ix++)
        {
          const size_t sample = (size_t)hindex[h0 + ix];
          const float htap = hkernel[h0 + ix];
          vhs += line[sample] * htap;
        }

        // Weight the filtered line by its vertical tap
        const float vtap = vkernel[vk0 + iy];
        vs += vhs * vtap;
      }

      out_line[ox] = vs;

      // Only the horizontal plan advances from one column to the next
      h0 += hl;
    }
  }

  exit:
  /* Release the plans. Only the two length arrays are freed: as far as this
   * function is concerned, they are the only memory that was allocated. */
  dt_pixelpipe_cache_free_align(hlength);
  dt_pixelpipe_cache_free_align(vlength);
}


/**
 * Resample a single-channel float image (one float per pixel) from one
 * region of interest to another.
 *
 * Public entry point: every argument is forwarded unchanged to
 * _interpolation_resample_1c_plain(), which does the actual work.
 *
 * @param itor     interpolator used to build the resampling kernels
 * @param out      destination buffer, described by roi_out
 * @param roi_out  region of interest of the destination buffer
 * @param in       source buffer, described by roi_in
 * @param roi_in   region of interest of the source buffer
 */
void dt_interpolation_resample_1c(const struct dt_interpolation *itor,
                                  float *out,
                                  const dt_iop_roi_t *const roi_out,
                                  const float *const in,
                                  const dt_iop_roi_t *const roi_in)
{
  // Plain call: ISO C does not allow `return <expression>;` in a function
  // returning void, even when the expression itself has type void.
  _interpolation_resample_1c_plain(itor, out, roi_out, in, roi_in);
}


/**
 * ROI variant of dt_interpolation_resample_1c().
 *
 * Both regions of interest are duplicated on the stack and the duplicates
 * are handed to dt_interpolation_resample_1c(). Their x/y origins are
 * forwarded as they are: no reset to zero is performed.
 *
 * @param itor     interpolator used to build the resampling kernels
 * @param out      destination buffer
 * @param roi_out  region of interest of the destination buffer
 * @param in       source buffer
 * @param roi_in   region of interest of the source buffer
 */
void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor,
                                      float *out,
                                      const dt_iop_roi_t *const roi_out,
                                      const float *const in,
                                      const dt_iop_roi_t *const roi_in)
{
  // Private copies, so the callee never sees the caller's structures
  dt_iop_roi_t region_in = *roi_in;
  dt_iop_roi_t region_out = *roi_out;

  dt_interpolation_resample_1c(itor, out, &region_out, in, &region_in);
}


// clang-format off

// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py

// vim: shiftwidth=2 expandtab tabstop=2 cindent

// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;

// clang-format on

error
static void error(char *msg)
Definition ashift_lsd.c:202

TRUE
#define TRUE
Definition ashift_lsd.c:162

FALSE
#define FALSE
Definition ashift_lsd.c:158

assert.h

width
int width
Definition bilateral.h:1

height
int height
Definition bilateral.h:1

dt_simd_set1
return vector dt_simd_set1(valid ?(scaling+NORM_MIN) :NORM_MIN)

i
const float i
Definition colorspaces_inline_conversions.h:440

g
const float g
Definition colorspaces_inline_conversions.h:674

fx
const float fx
Definition colorspaces_inline_conversions.h:100

f
const dt_aligned_pixel_t f
Definition colorspaces_inline_conversions.h:102

min
static const float const float const float min
Definition colorspaces_inline_conversions.h:438

max
const float max
Definition colorspaces_inline_conversions.h:490

out
const dt_colormatrix_t dt_aligned_pixel_t out
Definition colorspaces_inline_conversions.h:42

n
const float n
Definition colorspaces_inline_conversions.h:678

dt_store_simd_aligned
dt_store_simd_aligned(out, dt_mat3x4_mul_vec4(vin, dt_colormatrix_row_to_simd(matrix, 0), dt_colormatrix_row_to_simd(matrix, 1), dt_colormatrix_row_to_simd(matrix, 2)))

void
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))

type
int type
Definition common/metadata.c:62

name
char * name
Definition common/metadata.c:61

conf.h

dt_conf_get_string_const
const char * dt_conf_get_string_const(const char *name)
Definition control/conf.c:377

darktable
darktable_t darktable
Definition darktable.c:181

darktable.h

dt_store_simd
dt_store_simd(out, value)

DT_ALIGNED_ARRAY
#define DT_ALIGNED_ARRAY
Definition darktable.h:388

dt_pixelpipe_cache_alloc_align_cache
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
Definition darktable.h:433

__attribute__
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Enable aggressive floating-point arithmetic optimizations, in denormals handling. Set through user pr...
Definition darktable.h:524

copy_pixel_nontemporal
static void copy_pixel_nontemporal(float *const __restrict__ out, const float *const __restrict__ in)
Definition darktable.h:677

dt_free
#define dt_free(ptr)
Definition darktable.h:456

dt_pixelpipe_cache_free_align
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453

dt_round_size
static size_t dt_round_size(const size_t size, const size_t alignment)
Definition darktable.h:397

__DT_CLONE_TARGETS__
#define __DT_CLONE_TARGETS__
Definition darktable.h:367

for_four_channels
#define for_four_channels(_var,...)
Definition darktable.h:664

__OMP_PARALLEL_FOR__
#define __OMP_PARALLEL_FOR__(...)
Definition darktable.h:258

IS_NULL_PTR
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281

DT_CACHELINE_BYTES
#define DT_CACHELINE_BYTES
Definition darktable.h:380

meta
static CameraMetaData * meta
Definition imageio_rawspeed.cc:92

dt_omploop_sfence
#define dt_omploop_sfence()
Definition imageop.h:702

dt_interpolation_new
const struct dt_interpolation * dt_interpolation_new(enum dt_interpolation_type type)
Definition interpolation.c:595

dt_interpolation_free_cl_global
void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
Definition interpolation.c:1068

MAX_HALF_FILTER_WIDTH
#define MAX_HALF_FILTER_WIDTH
Definition interpolation.c:65

dt_interpolation_compute_pixel4c
__DT_CLONE_TARGETS__ void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out, const float x, const float y, const int width, const int height, const int linestride)
Definition interpolation.c:491

_interpolation_resample_1c_plain
static void _interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1282

_maketaps_mitchell
static float _maketaps_mitchell(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
Definition interpolation.c:248

roundToNextPowerOfTwo
static uint32_t roundToNextPowerOfTwo(uint32_t x)
Definition interpolation.c:1076

RESAMPLING_BORDER_MODE
#define RESAMPLING_BORDER_MODE
Definition interpolation.c:57

dt_interpolation_resample_roi
void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1041

border_mode
border_mode
Definition interpolation.c:48

BORDER_WRAP
@ BORDER_WRAP
Definition interpolation.c:50

BORDER_MIRROR
@ BORDER_MIRROR
Definition interpolation.c:51

BORDER_CLAMP
@ BORDER_CLAMP
Definition interpolation.c:52

BORDER_REPLICATE
@ BORDER_REPLICATE
Definition interpolation.c:49

_prepare_resampling_plan
static __DT_CLONE_TARGETS__ gboolean _prepare_resampling_plan(const struct dt_interpolation *itor, const int in, const int in_x0, const int out, const int out_x0, const float scale, int **plength, float **pkernel, int **pindex, int **pmeta)
Definition interpolation.c:706

dt_interpolation_resample_roi_1c
void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1417

dt_interpolation_resample
void dt_interpolation_resample(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1027

dt_interpolation_resample_cl
int dt_interpolation_resample_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1091

dt_interpolation_resample_1c
void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out, const dt_iop_roi_t *const roi_out, const float *const in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1404

dt_interpolation_init_cl_global
dt_interpolation_cl_global_t * dt_interpolation_init_cl_global()
Definition interpolation.c:1057

_maketaps_bilinear
static float _maketaps_bilinear(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
Definition interpolation.c:170

MAX_KERNEL_REQ
#define MAX_KERNEL_REQ
Definition interpolation.c:388

dt_interpolation_resample_roi_cl
int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, const int devid, cl_mem dev_out, const dt_iop_roi_t *const roi_out, cl_mem dev_in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:1265

dt_interpolator
static const struct dt_interpolation dt_interpolator[]
Definition interpolation.c:293

TILE_ROWS
#define TILE_ROWS
Definition interpolation.c:890

dt_interpolation_compute_sample
__DT_CLONE_TARGETS__ float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x, const float y, const int width, const int height, const int samplestride, const int linestride)
Definition interpolation.c:391

INTERPOLATION_BORDER_MODE
#define INTERPOLATION_BORDER_MODE
Definition interpolation.c:61

_interpolation_resample_plain
static __DT_CLONE_TARGETS__ void _interpolation_resample_plain(const struct dt_interpolation *itor, float *const restrict out, const dt_iop_roi_t *const roi_out, const float *const restrict in, const dt_iop_roi_t *const roi_in)
Definition interpolation.c:893

_compute_downsampling_kernel
static void _compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first, float *kernel, float *norm, const float outoinratio, const int xout)
Definition interpolation.c:349

_maketaps_bicubic
static float _maketaps_bicubic(float *taps, const size_t num_taps, const float width, const float first_tap, const float interval)
Definition interpolation.c:195

interpolation.h

dt_interpolation_type
dt_interpolation_type
Definition interpolation.h:38

DT_INTERPOLATION_BICUBIC
@ DT_INTERPOLATION_BICUBIC
Definition interpolation.h:41

DT_INTERPOLATION_BILINEAR
@ DT_INTERPOLATION_BILINEAR
Definition interpolation.h:40

DT_INTERPOLATION_DEFAULT
@ DT_INTERPOLATION_DEFAULT
Definition interpolation.h:44

DT_INTERPOLATION_LAST
@ DT_INTERPOLATION_LAST
Definition interpolation.h:43

DT_INTERPOLATION_MITCHELL
@ DT_INTERPOLATION_MITCHELL
Definition interpolation.h:42

DT_INTERPOLATION_USERPREF
@ DT_INTERPOLATION_USERPREF
Definition interpolation.h:46

DT_INTERPOLATION_DEFAULT_WARP
@ DT_INTERPOLATION_DEFAULT_WARP
Definition interpolation.h:45

DT_INTERPOLATION_FIRST
@ DT_INTERPOLATION_FIRST
Definition interpolation.h:39

DT_INTERPOLATION_USERPREF_WARP
@ DT_INTERPOLATION_USERPREF_WARP
Definition interpolation.h:47

kernel
static float kernel(const float *x, const float *y)
Definition iop/colorchecker.c:469

x
static const float x
Definition iop_profile.h:235

t
const int t
Definition iop_profile.h:225

k
float *const restrict const size_t k
Definition luminance_mask.h:78

math.h

ceil_fast
static float ceil_fast(float x)
Definition math.h:322

dt_aligned_pixel_t
float dt_aligned_pixel_t[4]
Definition noiseprofile.c:28

dt_opencl_local_buffer_opt
int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
Definition opencl.c:3156

dt_opencl_create_kernel
int dt_opencl_create_kernel(const int prog, const char *name)
Definition opencl.c:2030

dt_opencl_copy_host_to_device_constant
void * dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host)
Definition opencl.c:2332

dt_opencl_enqueue_copy_image
int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst, size_t *region)
Definition opencl.c:2261

dt_opencl_free_kernel
void dt_opencl_free_kernel(const int kernel)
Definition opencl.c:2073

dt_opencl_set_kernel_arg
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size, const void *arg)
Definition opencl.c:2127

dt_opencl_enqueue_kernel_2d_with_local
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes, const size_t *local)
Definition opencl.c:2142

dt_opencl_release_mem_object
void dt_opencl_release_mem_object(cl_mem mem)
Definition opencl.c:2383

DT_OPENCL_DEFAULT_ERROR
#define DT_OPENCL_DEFAULT_ERROR
Definition opencl.h:57

ROUNDUP
#define ROUNDUP(a, n)
Definition opencl.h:78

ROUNDUPDWD
#define ROUNDUPDWD(a, b)
Definition opencl.h:81

r
const float r
Definition src/develop/noise_generator.h:101

darktable_t::opencl
struct dt_opencl_t * opencl
Definition darktable.h:785

dt_interpolation_cl_global_t
Definition interpolation.h:150

dt_interpolation_cl_global_t::kernel_interpolation_resample
int kernel_interpolation_resample
Definition interpolation.h:151

dt_interpolation
Definition interpolation.h:59

dt_interpolation::maketaps
dt_interpolation_func maketaps
Definition interpolation.h:63

dt_interpolation::width
size_t width
Definition interpolation.h:62

dt_interpolation::id
enum dt_interpolation_type id
Definition interpolation.h:60

dt_iop_roi_t
Region of interest passed through the pixelpipe.
Definition imageop.h:72

dt_iop_roi_t::x
int x
Definition imageop.h:73

dt_iop_roi_t::scale
double scale
Definition imageop.h:74

dt_iop_roi_t::width
int width
Definition imageop.h:73

dt_iop_roi_t::height
int height
Definition imageop.h:73

dt_iop_roi_t::y
int y
Definition imageop.h:73

dt_opencl_local_buffer_t
Definition opencl.h:282

dt_opencl_local_buffer_t::xoffset
const int xoffset
Definition opencl.h:283

dt_opencl_local_buffer_t::sizey
int sizey
Definition opencl.h:290

dt_opencl_t::interpolation
struct dt_interpolation_cl_global_t * interpolation
Definition opencl.h:260

c2
#define c2
Definition colorspaces_inline_conversions.h:796

MIN
#define MIN(a, b)
Definition thinplate.c:32

MAX
#define MAX(a, b)
Definition thinplate.c:29