imagebuf_8c_source.html

/*

    This file is part of darktable,

    Copyright (C) 2020-2021 Ralf Brown.

    Copyright (C) 2021-2022 Pascal Obry.

    Copyright (C) 2022 Martin Bařinka.

    Copyright (C) 2023, 2025-2026 Aurélien PIERRE.


    darktable is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    darktable is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with darktable.  If not, see <http://www.gnu.org/licenses/>.

*/


#include <stdarg.h>

#include "common/imagebuf.h"


#ifdef _OPENMP

// Size threshold, counted in floats, used by the image operations in this file: a buffer with more
// elements than this is processed by the OpenMP-parallel code path, anything smaller is handled by a
// single thread (dt_iop_image_linear_blend switches over at half this value).  Only compiled when
// building with OpenMP, since nothing else references it.
static size_t parallel_imgop_minimum = 500000;

#endif


// Allocate one or more float buffers as described by the variable arguments.
//
// The variable arguments are a sequence of groups, each made of
//   int size            channel count (the DT_IMGSZ_CH_MASK bits) OR'ed with DT_IMGSZ_* flags that select
//                       the RoI (input/output) and the dimension (full/height/width/longest) used to
//                       compute the number of floats, plus optional DT_IMGSZ_PERTHREAD / DT_IMGSZ_CLEARBUF
//   float **bufptr      location that receives the pointer to the allocated buffer
//   size_t *paddedsize  present ONLY when DT_IMGSZ_PERTHREAD is set; it is handed to
//                       dt_pixelpipe_cache_alloc_perthread_float() as its padded_size argument
// The list is terminated by a size of 0; a NULL bufptr is also treated as the end of the list.
//
// All buffer pointers are reset to NULL before any allocation is attempted.  If an allocation fails,
// every buffer allocated so far is released and its pointer is reset to NULL.
//
// Returns 0 on success and 1 if any allocation failed.  `module` is currently not used.
int dt_iop_alloc_image_buffers(struct dt_iop_module_t *const module,
                               const struct dt_iop_roi_t *const roi_in,
                               const struct dt_iop_roi_t *const roi_out, ...)
{
  int err = 0;
  va_list args;

  // first pass: zero out all of the given buffer pointers, so that the cleanup pass below can
  // recognise the first buffer that was never allocated
  va_start(args, roi_out);
  while (TRUE)
  {
    const int size = va_arg(args, int);
    // Check the terminator before fetching anything else: calling va_arg when there is no next
    // argument is undefined behaviour, and a caller may legitimately end the list with a bare 0.
    if (size == 0)
      break;
    float **bufptr = va_arg(args, float**);
    if (size & DT_IMGSZ_PERTHREAD)
      (void)va_arg(args, size_t*);    // skip the extra pointer for per-thread allocations
    if (IS_NULL_PTR(bufptr))          // alternative end-of-list marker
      break;
    *bufptr = NULL;
  }
  va_end(args);

  // second pass: attempt to allocate the requested buffers, stopping at the first failure
  va_start(args, roi_out);
  while (!err)
  {
    const int size = va_arg(args, int);
    if (size == 0)                    // end of arg list; do not read past it
      break;
    float **bufptr = va_arg(args, float**);
    size_t *paddedsize = (size & DT_IMGSZ_PERTHREAD) ? va_arg(args, size_t*) : NULL;
    if (IS_NULL_PTR(bufptr))
      break;

    // number of floats = channels * (dimension selected by the RoI/dimension flags)
    const size_t channels = size & DT_IMGSZ_CH_MASK;
    size_t nfloats;
    switch (size & (DT_IMGSZ_ROI_MASK | DT_IMGSZ_DIM_MASK))
    {
    case DT_IMGSZ_OUTPUT | DT_IMGSZ_FULL:
      nfloats = channels * roi_out->width * roi_out->height;
      break;
    case DT_IMGSZ_OUTPUT | DT_IMGSZ_HEIGHT:
      nfloats = channels * roi_out->height;
      break;
    case DT_IMGSZ_OUTPUT | DT_IMGSZ_WIDTH:
      nfloats = channels * roi_out->width;
      break;
    case DT_IMGSZ_OUTPUT | DT_IMGSZ_LONGEST:
      nfloats = channels * MAX(roi_out->width, roi_out->height);
      break;
    case DT_IMGSZ_INPUT | DT_IMGSZ_FULL:
      nfloats = channels * roi_in->width * roi_in->height;
      break;
    case DT_IMGSZ_INPUT | DT_IMGSZ_HEIGHT:
      nfloats = channels * roi_in->height;
      break;
    case DT_IMGSZ_INPUT | DT_IMGSZ_WIDTH:
      nfloats = channels * roi_in->width;
      break;
    case DT_IMGSZ_INPUT | DT_IMGSZ_LONGEST:
      nfloats = channels * MAX(roi_in->width, roi_in->height);
      break;
    default:
      // unrecognised RoI/dimension combination: request an empty buffer
      nfloats = 0;
      break;
    }

    if (size & DT_IMGSZ_PERTHREAD)
    {
      *bufptr = dt_pixelpipe_cache_alloc_perthread_float(nfloats, paddedsize);
      // clearing covers the padded per-thread size for every OpenMP thread
      if ((size & DT_IMGSZ_CLEARBUF) && *bufptr)
        memset(*bufptr, 0, *paddedsize * darktable.num_openmp_threads * sizeof(float));
    }
    else
    {
      *bufptr = dt_pixelpipe_cache_alloc_align_float_cache(nfloats, 0);
      if ((size & DT_IMGSZ_CLEARBUF) && *bufptr)
        memset(*bufptr, 0, nfloats * sizeof(float));
    }

    if (!*bufptr)
    {
      err = 1;
      break;
    }
  }
  va_end(args);

  // finally, check whether successful and clean up if something went wrong
  if (err)
  {
    va_start(args, roi_out);
    while (TRUE)
    {
      const int size = va_arg(args, int);
      if (size == 0)                  // end of arg list; do not read past it
        break;
      float **bufptr = va_arg(args, float**);
      if (size & DT_IMGSZ_PERTHREAD)
        (void)va_arg(args, size_t*);  // skip the extra pointer for per-thread allocations
      // Buffers are allocated in order and allocation stops at the first failure, so the first
      // NULL buffer marks the end of what has to be released.
      if (IS_NULL_PTR(bufptr) || !*bufptr)
        break;
      dt_pixelpipe_cache_free_align(*bufptr);
      *bufptr = NULL;
    }
    va_end(args);
    // NOTE: no trouble flag is set on the module here; the failure is only reported through the
    // return value.
  }
  return err;
}


// Copy an image buffer, specifying the number of floats it contains.  Use of this function is to be preferred

// over a bare memcpy both because it helps document the purpose of the code and because it gives us a single

// point where we can optimize performance on different architectures.

__DT_CLONE_TARGETS__


// out     destination buffer, receives nfloats floats
// in      source buffer; both pointers are restrict-qualified, so the buffers must not overlap
// nfloats number of floats to copy
void dt_iop_image_copy(float *const __restrict__ out, const float *const __restrict__ in, const size_t nfloats)
{
#ifdef _OPENMP
  if (nfloats > parallel_imgop_minimum)
  {
    // Large enough that the threading overhead pays off.  The benefit is limited because the memory
    // bus saturates quickly: roughly one core per memory channel is all that can be put to use.
#pragma omp parallel for simd aligned(in, out : 16) default(firstprivate)
    for(size_t i = 0; i < nfloats; i++)
      out[i] = in[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the buffer is too small to be worth parallelizing
    memcpy(out, in, sizeof(float) * nfloats);
  }
}


// Copy an image buffer, given the regions of interest of the source and of the destination.
//
// Only the case where both RoIs have the same width and height is implemented; the whole buffer is then
// handed to dt_iop_image_copy_by_size().  In every other case a diagnostic is written to stderr and
// nothing is copied:
//   - output at least as large as the input in both dimensions (would need padding),
//   - output at most as large as the input in both dimensions (would copy a portion of the input),
//   - one dimension larger and the other smaller (inconsistent RoIs).
// `zero_pad` is not used yet.
void dt_iop_copy_image_roi(float *const __restrict__ out, const float *const __restrict__ in, const size_t ch,
                           const dt_iop_roi_t *const __restrict__ roi_in,
                           const dt_iop_roi_t *const __restrict__ roi_out, const int zero_pad)
{
  const int in_w = roi_in->width;
  const int in_h = roi_in->height;
  const int out_w = roi_out->width;
  const int out_h = roi_out->height;

  if (in_w == out_w && in_h == out_h)
  {
    // identical dimensions: copy the buffer contents wholesale
    dt_iop_image_copy_by_size(out, in, out_w, out_h, ch);
    return;
  }

  if (in_w <= out_w && in_h <= out_h)
  {
    // the output would have to be padded
    fprintf(stderr,"copy_image_roi with larger output not yet implemented\n");
    //TODO
    return;
  }

  if (in_w >= out_w && in_h >= out_h)
  {
    // only part of the input would be copied
    fprintf(stderr,"copy_image_roi with smaller output not yet implemented\n");
    //TODO
    return;
  }

  // neither RoI contains the other
  fprintf(stderr,"copy_image_roi called with inconsistent RoI!\n");
  //TODO
}


__DT_CLONE_TARGETS__


// Store scale * src[i] into buf[i] for each of the width * height * ch floats.
// Both pointers are restrict-qualified, so the two buffers must not overlap.
void dt_iop_image_scaled_copy(float *const restrict buf, const float *const restrict src, const float scale,
                              const size_t width, const size_t height, const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf, src : 16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] = scale * src[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf, src : 16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] = scale * src[i];
  }
}


__DT_CLONE_TARGETS__


// Set each of the width * height * ch floats in buf to fill_value.
void dt_iop_image_fill(float *const buf, const float fill_value, const size_t width, const size_t height,
                       const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum) // big enough to outweigh the threading overhead?
  {
    // use at most 16 threads
    const size_t workers = MIN(16,darktable.num_openmp_threads);
    // number of 4-float vectors handled by each thread, rounded up
    const size_t vecs_per_worker = (((total + workers - 1) / workers) + 3) / 4;
#pragma omp parallel for default(firstprivate)  num_threads(workers)
    for(size_t w = 0; w < workers; w++)
    {
      // each thread fills one contiguous slice; the slice end is clamped to the buffer size
      const size_t first = 4 * w * vecs_per_worker;
      const size_t last = MIN(4 * (w + 1) * vecs_per_worker, total);
#pragma omp simd aligned(buf:16)
      for(size_t i = first; i < last; i++)
        buf[i] = fill_value;
    }
    return;
  }
#endif // _OPENMP

  // built without OpenMP, or the image is too small to be worth parallelizing
  if (fill_value != 0.0f)
  {
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] = fill_value;
    return;
  }

  // zero fill: rely on memset, which is hopefully highly optimized
  memset(buf, 0, total * sizeof(float));
}


__DT_CLONE_TARGETS__


// Add add_value, in place, to each of the width * height * ch floats in buf.
void dt_iop_image_add_const(float *const buf, const float add_value, const size_t width, const size_t height,
                            const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf:16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] += add_value;
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] += add_value;
  }
}


__DT_CLONE_TARGETS__


// Elementwise in-place sum: buf[i] += other_image[i] for each of the width * height * ch floats.
void dt_iop_image_add_image(float *const buf, const float* const other_image,
                            const size_t width, const size_t height, const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf, other_image : 16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] += other_image[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf, other_image : 16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] += other_image[i];
  }
}


__DT_CLONE_TARGETS__


// Elementwise in-place difference: buf[i] -= other_image[i] for each of the width * height * ch floats.
void dt_iop_image_sub_image(float *const buf, const float* const other_image,
                            const size_t width, const size_t height, const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf, other_image : 16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] -= other_image[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf, other_image : 16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] -= other_image[i];
  }
}


__DT_CLONE_TARGETS__


// Replace, in place, each of the width * height * ch floats in buf by max_value minus that float.
void dt_iop_image_invert(float *const buf, const float max_value, const size_t width, const size_t height,
                         const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf:16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] = max_value - buf[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] = max_value - buf[i];
  }
}


__DT_CLONE_TARGETS__


// Multiply, in place, each of the width * height * ch floats in buf by mul_value.
void dt_iop_image_mul_const(float *const buf, const float mul_value, const size_t width, const size_t height,
                            const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf:16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] *= mul_value;
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] *= mul_value;
  }
}


__DT_CLONE_TARGETS__


// Divide, in place, each of the width * height * ch floats in buf by div_value.
void dt_iop_image_div_const(float *const buf, const float div_value, const size_t width, const size_t height,
                            const size_t ch)
{
  const size_t total = width * height * ch;
#ifdef _OPENMP
  if (total > parallel_imgop_minimum)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf:16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] /= div_value;
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] /= div_value;
  }
}


// elementwise: buf = lambda*buf + (1-lambda)*other

__DT_CLONE_TARGETS__


// In-place linear blend of two images over width * height * ch floats:
//   buf[i] = lambda * buf[i] + (1 - lambda) * other[i]
// Both pointers are restrict-qualified, so the two buffers must not overlap.
void dt_iop_image_linear_blend(float *const restrict buf, const float lambda, const float *const restrict other,
                               const size_t width, const size_t height, const size_t ch)
{
  const size_t total = width * height * ch;
  const float other_weight = 1.0f - lambda;
#ifdef _OPENMP
  // the parallel path starts at half the usual threshold for this operation
  if (total > parallel_imgop_minimum/2)
  {
    // Big enough to outweigh the threading overhead.  The gain is modest since the memory bus saturates
    // quickly (about one core per memory channel is all that helps).
#pragma omp parallel for simd aligned(buf:16) default(firstprivate)
    for(size_t i = 0; i < total; i++)
      buf[i] = lambda*buf[i] + other_weight*other[i];
  }
  else
#endif // _OPENMP
  {
    // built without OpenMP, or the image is too small to be worth parallelizing
#ifdef _OPENMP
#pragma omp simd aligned(buf:16)
#endif
    for(size_t i = 0; i < total; i++)
      buf[i] = lambda*buf[i] + other_weight*other[i];
  }
}


// clang-format off

// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py

// vim: shiftwidth=2 expandtab tabstop=2 cindent

// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;

// clang-format on

TRUE
#define TRUE
Definition ashift_lsd.c:162

width
int width
Definition bilateral.h:1

height
int height
Definition bilateral.h:1

out
const dt_colormatrix_t dt_aligned_pixel_t out
Definition colorspaces_inline_conversions.h:42

void
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))

darktable
darktable_t darktable
Definition darktable.c:181

dt_pixelpipe_cache_alloc_align_float_cache
#define dt_pixelpipe_cache_alloc_align_float_cache(pixels, id)
Definition darktable.h:447

dt_pixelpipe_cache_free_align
#define dt_pixelpipe_cache_free_align(mem)
Definition darktable.h:453

__DT_CLONE_TARGETS__
#define __DT_CLONE_TARGETS__
Definition darktable.h:367

dt_pixelpipe_cache_alloc_perthread_float
#define dt_pixelpipe_cache_alloc_perthread_float(n, padded_size)
Definition darktable.h:1030

IS_NULL_PTR
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
Definition darktable.h:281

dt_iop_image_add_image
__DT_CLONE_TARGETS__ void dt_iop_image_add_image(float *const buf, const float *const other_image, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:276

dt_iop_image_mul_const
__DT_CLONE_TARGETS__ void dt_iop_image_mul_const(float *const buf, const float mul_value, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:351

dt_iop_image_copy
__DT_CLONE_TARGETS__ void dt_iop_image_copy(float *const __restrict__ out, const float *const __restrict__ in, const size_t nfloats)
Definition imagebuf.c:138

dt_iop_image_sub_image
__DT_CLONE_TARGETS__ void dt_iop_image_sub_image(float *const buf, const float *const other_image, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:301

dt_iop_alloc_image_buffers
int dt_iop_alloc_image_buffers(struct dt_iop_module_t *const module, const struct dt_iop_roi_t *const roi_in, const struct dt_iop_roi_t *const roi_out,...)
Definition imagebuf.c:31

dt_iop_image_add_const
__DT_CLONE_TARGETS__ void dt_iop_image_add_const(float *const buf, const float add_value, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:251

dt_iop_copy_image_roi
void dt_iop_copy_image_roi(float *const __restrict__ out, const float *const __restrict__ in, const size_t ch, const dt_iop_roi_t *const __restrict__ roi_in, const dt_iop_roi_t *const __restrict__ roi_out, const int zero_pad)
Definition imagebuf.c:159

dt_iop_image_invert
__DT_CLONE_TARGETS__ void dt_iop_image_invert(float *const buf, const float max_value, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:326

dt_iop_image_fill
__DT_CLONE_TARGETS__ void dt_iop_image_fill(float *const buf, const float fill_value, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:214

dt_iop_image_linear_blend
__DT_CLONE_TARGETS__ void dt_iop_image_linear_blend(float *const restrict buf, const float lambda, const float *const restrict other, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:402

dt_iop_image_scaled_copy
__DT_CLONE_TARGETS__ void dt_iop_image_scaled_copy(float *const restrict buf, const float *const restrict src, const float scale, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:189

dt_iop_image_div_const
__DT_CLONE_TARGETS__ void dt_iop_image_div_const(float *const buf, const float div_value, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.c:376

imagebuf.h

DT_IMGSZ_ROI_MASK
#define DT_IMGSZ_ROI_MASK
Definition imagebuf.h:53

DT_IMGSZ_LONGEST
#define DT_IMGSZ_LONGEST
Definition imagebuf.h:64

DT_IMGSZ_INPUT
#define DT_IMGSZ_INPUT
Definition imagebuf.h:55

DT_IMGSZ_CLEARBUF
#define DT_IMGSZ_CLEARBUF
Definition imagebuf.h:58

dt_iop_image_copy_by_size
static void dt_iop_image_copy_by_size(float *const __restrict__ out, const float *const __restrict__ in, const size_t width, const size_t height, const size_t ch)
Definition imagebuf.h:87

DT_IMGSZ_OUTPUT
#define DT_IMGSZ_OUTPUT
Definition imagebuf.h:54

DT_IMGSZ_WIDTH
#define DT_IMGSZ_WIDTH
Definition imagebuf.h:63

DT_IMGSZ_PERTHREAD
#define DT_IMGSZ_PERTHREAD
Definition imagebuf.h:57

DT_IMGSZ_HEIGHT
#define DT_IMGSZ_HEIGHT
Definition imagebuf.h:62

DT_IMGSZ_CH_MASK
#define DT_IMGSZ_CH_MASK
Definition imagebuf.h:51

DT_IMGSZ_DIM_MASK
#define DT_IMGSZ_DIM_MASK
Definition imagebuf.h:60

DT_IMGSZ_FULL
#define DT_IMGSZ_FULL
Definition imagebuf.h:61

k
float *const restrict const size_t k
Definition luminance_mask.h:78

ch
float *const restrict const size_t const size_t ch
Definition luminance_mask.h:78

size
size_t size
Definition mipmap_cache.c:3

darktable_t::num_openmp_threads
int32_t num_openmp_threads
Definition darktable.h:758

dt_iop_module_t
Definition imageop.h:246

dt_iop_roi_t
Region of interest passed through the pixelpipe.
Definition imageop.h:72

dt_iop_roi_t::width
int width
Definition imageop.h:73

dt_iop_roi_t::height
int height
Definition imageop.h:73

MIN
#define MIN(a, b)
Definition thinplate.c:32

MAX
#define MAX(a, b)
Definition thinplate.c:29