Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
math.h
Go to the documentation of this file.
1/*
2 * This file is part of darktable,
3 * Copyright (C) 2016 johannes hanika.
4 * Copyright (C) 2016, 2018 Tobias Ellinghaus.
5 * Copyright (C) 2018-2019 Heiko Bauke.
6 * Copyright (C) 2020-2021 Pascal Obry.
7 * Copyright (C) 2020-2021 Ralf Brown.
8 * Copyright (C) 2021 Andreas Schneider.
9 * Copyright (C) 2021, 2023-2025 Aurélien PIERRE.
10 * Copyright (C) 2022 Martin Bařinka.
11 * Copyright (C) 2023 Luca Zulberti.
12 *
13 * darktable is free software: you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation, either version 3 of the License, or
16 * (at your option) any later version.
17 *
18 * darktable is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with darktable. If not, see <http://www.gnu.org/licenses/>.
25 */
26
27#pragma once
28
29#include <stddef.h>
30#include <math.h>
31#include <stdint.h>
32
33#include "common/darktable.h"
34
35#define NORM_MIN 1.52587890625e-05f // norm can't be < to 2^(-16)
36
37// work around missing standard math.h symbols
39#ifndef M_LN10
40#define M_LN10 2.30258509299404568402
41#endif /* !M_LN10 */
42
44#ifndef M_PI
45#define M_PI 3.14159265358979323846
46#endif /* !M_PI */
47#ifndef M_PI_F
48#define M_PI_F 3.14159265358979324f
49#endif /* !M_PI_F */
50
51
52#define DT_M_PI_F (3.14159265358979324f)
53#define DT_M_PI (3.14159265358979324)
54
55#define DT_M_LN2f (0.6931471805599453f)
56
57// If platform supports hardware-accelerated fused-multiply-add
58// This is not only faster but more accurate because rounding happens at the right place
59#ifdef FP_FAST_FMAF
60 #define DT_FMA(x, y, z) fmaf(x, y, z)
61#else
62 #define DT_FMA(x, y, z) ((x) * (y) + (z))
63#endif
64
65// Golden number (1+sqrt(5))/2
66#ifndef PHI
67#define PHI 1.61803398874989479F
68#endif
69
70// 1/PHI
71#ifndef INVPHI
72#define INVPHI 0.61803398874989479F
73#endif
74
75// NaN-safe clamping (NaN compares false, and will thus result in H)
76#define CLAMPS(A, L, H) ((A) > (L) ? ((A) < (H) ? (A) : (H)) : (L))
77
78// clip channel value to be between 0 and 1
79// NaN-safe: NaN compares false and will result in 0.0
80// also does not force promotion of floats to doubles, but will use the type of its argument
81#define CLIP(x) (((x) >= 0) ? ((x) <= 1 ? (x) : 1) : 0)
82#define MM_CLIP_PS(X) (_mm_min_ps(_mm_max_ps((X), _mm_setzero_ps()), _mm_set1_ps(1.0)))
83
84// clip luminance values to be between 0 and 100
85#define LCLIP(x) ((x < 0) ? 0.0 : (x > 100.0) ? 100.0 : x)
86
87// clamp value to lie between mn and mx
88// Nan-safe: NaN compares false and will result in mn
89#define CLAMPF(a, mn, mx) ((a) >= (mn) ? ((a) <= (mx) ? (a) : (mx)) : (mn))
90//#define CLAMPF(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
91
92#if defined(__SSE__)
93#define MMCLAMPPS(a, mn, mx) (_mm_min_ps((mx), _mm_max_ps((a), (mn))))
94#endif
95
96static inline float clamp_range_f(const float x, const float low, const float high)
97{
98 return x > high ? high : (x < low ? low : x);
99}
100
101// Kahan summation algorithm
102#ifdef _OPENMP
103#pragma omp declare simd aligned(c)
104#endif
105static inline float Kahan_sum(const float m, float *const __restrict__ c, const float add)
106{
107 const float t1 = add - (*c);
108 const float t2 = m + t1;
109 *c = (t2 - m) - t1;
110 return t2;
111}
112
113#ifdef __SSE2__
114// vectorized Kahan summation algorithm
115static inline __m128 Kahan_sum_sse(const __m128 m, __m128 *const __restrict__ c, const __m128 add)
116{
117 const __m128 t1 = add - (*c);
118 const __m128 t2 = m + t1;
119 *c = (t2 - m) - t1;
120 return t2;
121}
122#endif /* __SSE2__ */
123
124static inline float Log2(float x)
125{
126 return (x > 0.0f) ? (logf(x) / DT_M_LN2f) : x;
127}
128
129static inline float Log2Thres(float x, float Thres)
130{
131 return logf(x > Thres ? x : Thres) / DT_M_LN2f;
132}
133
134// ensure that any changes here are synchronized with data/kernels/extended.cl
135static inline float fastlog2(float x)
136{
137 union { float f; uint32_t i; } vx = { x };
138 union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
139
140 float y = vx.i;
141
142 y *= 1.1920928955078125e-7f;
143
144 return y - 124.22551499f
145 - 1.498030302f * mx.f
146 - 1.72587999f / (0.3520887068f + mx.f);
147}
148
149// ensure that any changes here are synchronized with data/kernels/extended.cl
150static inline float
151fastlog (float x)
152{
153 return DT_M_LN2f * fastlog2(x);
154}
155
156// multiply 3x3 matrix with 3x1 vector
157// dest needs to be different from v
158#ifdef _OPENMP
159#pragma omp declare simd
160#endif
161static inline void mat3mulv(float *const __restrict__ dest, const float *const mat, const float *const __restrict__ v)
162{
163 for(int k = 0; k < 3; k++)
164 {
165 float x = 0.0f;
166 for(int i = 0; i < 3; i++)
167 x += mat[3 * k + i] * v[i];
168 dest[k] = x;
169 }
170}
171
172// multiply two 3x3 matrices
173// dest needs to be different from m1 and m2
174// dest = m1 * m2 in this order
175#ifdef _OPENMP
176#pragma omp declare simd
177#endif
178static inline void mat3mul(float *const __restrict__ dest, const float *const __restrict__ m1, const float *const __restrict__ m2)
179{
180 for(int k = 0; k < 3; k++)
181 {
182 for(int i = 0; i < 3; i++)
183 {
184 float x = 0.0f;
185 for(int j = 0; j < 3; j++)
186 x += m1[3 * k + j] * m2[3 * j + i];
187 dest[3 * k + i] = x;
188 }
189 }
190}
191
192#ifdef _OPENMP
193#pragma omp declare simd
194#endif
195static inline void mul_mat_vec_2(const float *m, const float *p, float *o)
196{
197 o[0] = p[0] * m[0] + p[1] * m[1];
198 o[1] = p[0] * m[2] + p[1] * m[3];
199}
200
201#ifdef _OPENMP
202#pragma omp declare simd uniform(v_2) aligned(v_1, v_2:16)
203#endif
204static inline float scalar_product(const dt_aligned_pixel_t v_1, const dt_aligned_pixel_t v_2)
205{
206 // specialized 3x1 dot products 2 4x1 RGB-alpha pixels.
207 // v_2 needs to be uniform along loop increments, e.g. independent from current pixel values
208 // we force an order of computation similar to SSE4 _mm_dp_ps() hoping the compiler will get the clue
209 float acc = 0.f;
210
211#ifdef _OPENMP
212#pragma omp simd aligned(v_1, v_2:16) reduction(+:acc)
213#endif
214 for(size_t c = 0; c < 3; c++) acc += v_1[c] * v_2[c];
215
216 return acc;
217}
218
219
220#ifdef _OPENMP
221#pragma omp declare simd
222#endif
223static inline float sqf(const float x)
224{
225 return x * x;
226}
227
228
229#ifdef _OPENMP
230#pragma omp declare simd aligned(vector:16)
231#endif
232static inline float euclidean_norm(const dt_aligned_pixel_t vector)
233{
234 return fmaxf(sqrtf(sqf(vector[0]) + sqf(vector[1]) + sqf(vector[2])), NORM_MIN);
235}
236
237
238#ifdef _OPENMP
239#pragma omp declare simd aligned(vector:16)
240#endif
241static inline void downscale_vector(dt_aligned_pixel_t vector, const float scaling)
242{
243 // check zero or NaN
244 const int valid = (scaling > NORM_MIN) && !isnan(scaling);
245 for(size_t c = 0; c < 3; c++) vector[c] = (valid) ? vector[c] / (scaling + NORM_MIN) : vector[c] / NORM_MIN;
246}
247
248
249#ifdef _OPENMP
250#pragma omp declare simd aligned(vector:16)
251#endif
252static inline void upscale_vector(dt_aligned_pixel_t vector, const float scaling)
253{
254 const int valid = (scaling > NORM_MIN) && !isnan(scaling);
255 for(size_t c = 0; c < 3; c++) vector[c] = (valid) ? vector[c] * (scaling + NORM_MIN) : vector[c] * NORM_MIN;
256}
257
258
259#ifdef _OPENMP
260#pragma omp declare simd
261#endif
262static inline float dt_log2f(const float f)
263{
264#ifdef __GLIBC__
265 return log2f(f);
266#else
267 return logf(f) / logf(2.0f);
268#endif
269}
270
272 float f;
273 int k;
274};
275
276// a faster, vectorizable version of hypotf() when we know that there won't be overflow, NaNs, or infinities
277#ifdef _OPENMP
278#pragma omp declare simd
279#endif
280static inline float dt_fast_hypotf(const float x, const float y)
281{
282 return sqrtf(x * x + y * y);
283}
284
285// fast approximation of expf()
286/****** if you change this function, you need to make the same change in data/kernels/{basecurve,basic}.cl ***/
287#ifdef _OPENMP
288#pragma omp declare simd
289#endif
290static inline float dt_fast_expf(const float x)
291{
292 // meant for the range [-100.0f, 0.0f]. largest error ~ -0.06 at 0.0f.
293 // will get _a_lot_ worse for x > 0.0f (9000 at 10.0f)..
294 const int i1 = 0x3f800000u;
295 // e^x, the comment would be 2^x
296 const int i2 = 0x402DF854u; // 0x40000000u;
297 // const int k = CLAMPS(i1 + x * (i2 - i1), 0x0u, 0x7fffffffu);
298 // without max clamping (doesn't work for large x, but is faster):
299 const int k0 = i1 + x * (i2 - i1);
300 union float_int u;
301 u.k = k0 > 0 ? k0 : 0;
302 return u.f;
303}
304
305static inline void dt_fast_expf_4wide(const float x[4], float result[4])
306{
307 // meant for the range [-100.0f, 0.0f]. largest error ~ -0.06 at 0.0f.
308 // will get _a_lot_ worse for x > 0.0f (9000 at 10.0f)..
309 const int i1 = 0x3f800000u;
310 // e^x, the comment would be 2^x
311 const int i2 = 0x402DF854u; // 0x40000000u;
312 // const int k = CLAMPS(i1 + x * (i2 - i1), 0x0u, 0x7fffffffu);
313 // without max clamping (doesn't work for large x, but is faster):
314 union float_int u[4];
315#ifdef _OPENMP
316#pragma omp simd aligned(x, result)
317#endif
318 for(size_t c = 0; c < 4; c++)
319 {
320 const int k0 = i1 + (int)(x[c] * (i2 - i1));
321 u[c].k = k0 > 0 ? k0 : 0;
322 result[c] = u[c].f;
323 }
324}
325
326#if defined(__SSE2__)
327#define ALIGNED(a) __attribute__((aligned(a)))
328#define VEC4(a) \
329 { \
330 (a), (a), (a), (a) \
331 }
332
333/* SSE intrinsics version of dt_fast_expf */
334static const __m128 dt__fone ALIGNED(64) = VEC4(0x3f800000u);
335static const __m128 femo ALIGNED(64) = VEC4(0x00adf880u);
336static inline __m128 dt_fast_expf_sse2(const __m128 x)
337{
338 __m128 f = dt__fone + (x * femo); // f(n) = i1 + x(n)*(i2-i1)
339 __m128i i = _mm_cvtps_epi32(f); // i(n) = int(f(n))
340 __m128i mask = _mm_srai_epi32(i, 31); // mask(n) = 0xffffffff if i(n) < 0
341 i = _mm_andnot_si128(mask, i); // i(n) = 0 if i(n) < 0
342 return _mm_castsi128_ps(i); // return *(float*)&i
343}
344#undef ALIGNED
345#undef VEC4
346
347#endif // __SSE2__
348
349// fast approximation of 2^-x for 0<x<126
350/****** if you change this function, you need to make the same change in data/kernels/{denoiseprofile,nlmeans}.cl ***/
351static inline float dt_fast_mexp2f(const float x)
352{
353 const int i1 = 0x3f800000; // bit representation of 2^0
354 const int i2 = 0x3f000000; // bit representation of 2^-1
355 const int k0 = i1 + (int)(x * (i2 - i1));
356 union {
357 float f;
358 int i;
359 } k;
360 k.i = k0 >= 0x800000 ? k0 : 0;
361 return k.f;
362}
363
364// The below version is incorrect, suffering from reduced precision.
365// It is used by the non-local means code in both nlmeans.c and
366// denoiseprofile.c, and fixing it would cause a change in output.
367static inline float fast_mexp2f(const float x)
368{
369 const float i1 = (float)0x3f800000u; // 2^0
370 const float i2 = (float)0x3f000000u; // 2^-1
371 const float k0 = i1 + x * (i2 - i1);
372 union {
373 float f;
374 int i;
375 } k;
376 k.i = k0 >= (float)0x800000u ? k0 : 0;
377 return k.f;
378}
379
385static inline float ceil_fast(float x)
386{
387 if(x <= 0.f)
388 {
389 return (float)(int)x;
390 }
391 else
392 {
393 return -((float)(int)-x) + 1.f;
394 }
395}
396
397#if defined(__SSE2__)
static inline __m128 _mm_abs_ps(__m128 t)
402{
403 static const uint32_t signmask[4] __attribute__((aligned(64)))
404 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
405 return _mm_and_ps(*(__m128 *)signmask, t);
406}
407#endif
408
421static inline float sinf_fast(float t)
422{
423 /***** if you change this function, you must also change the copy in data/kernels/basic.cl *****/
424 static const float a = 4 / (M_PI * M_PI);
425 static const float p = 0.225f;
426
427 t = a * t * (M_PI_F - fabsf(t));
428
429 return t * (p * (fabsf(t) - 1) + 1);
430}
431
432#if defined(__SSE2__)
445static inline __m128 sinf_fast_sse(__m128 t)
446{
447 static const __m128 a
448 = { 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI) };
449 static const __m128 p = { 0.225f, 0.225f, 0.225f, 0.225f };
450 static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
451
452 // m4 = a*t*(M_PI - fabsf(t));
453 const __m128 m1 = _mm_abs_ps(t);
454 const __m128 m2 = _mm_sub_ps(pi, m1);
455 const __m128 m3 = _mm_mul_ps(t, m2);
456 const __m128 m4 = _mm_mul_ps(a, m3);
457
458 // p*(m4*fabsf(m4) - m4) + m4;
459 const __m128 n1 = _mm_abs_ps(m4);
460 const __m128 n2 = _mm_mul_ps(m4, n1);
461 const __m128 n3 = _mm_sub_ps(n2, m4);
462 const __m128 n4 = _mm_mul_ps(p, n3);
463
464 return _mm_add_ps(n4, m4);
465}
466#endif
467
468
476static inline int ipow(int base, int exp)
477{
478 int result = 1;
479 for(;;)
480 {
481 if (exp & 1)
482 result *= base;
483 exp >>= 1;
484 if (!exp)
485 break;
486 base *= base;
487 }
488 return result;
489}
490
503static inline void dt_vector_sin(const dt_aligned_pixel_t arg,
504 dt_aligned_pixel_t sine)
505{
506 static const dt_aligned_pixel_t pi = { M_PI_F, M_PI_F, M_PI_F, M_PI_F };
507 static const dt_aligned_pixel_t a
508 = { 4 / (M_PI_F * M_PI_F),
509 4 / (M_PI_F * M_PI_F),
510 4 / (M_PI_F * M_PI_F),
511 4 / (M_PI_F * M_PI_F) };
512 static const dt_aligned_pixel_t p = { 0.225f, 0.225f, 0.225f, 0.225f };
513 static const dt_aligned_pixel_t one = { 1.0f, 1.0f, 1.0f, 1.0f };
514
515 dt_aligned_pixel_t abs_arg;
517 abs_arg[c] = (arg[c] < 0.0f) ? -arg[c] : arg[c];
518 dt_aligned_pixel_t scaled;
520 scaled[c] = a[c] * arg[c] * (pi[c] - abs_arg[c]);
521 dt_aligned_pixel_t abs_scaled;
523 abs_scaled[c] = (scaled[c] < 0.0f) ? -scaled[c] : scaled[c];
525 sine[c] = scaled[c] * (p[c] * (abs_scaled[c] - one[c]) + one[c]);
526}
527
532static inline float f_inv_sqrtf(const float x)
533{
534 if(x <= 1e-16f) return 0.0f;
535
536 union
537 {
538 float f;
539 uint32_t i;
540 } conv = { x };
541
542 conv.i = 0x5f3759dfu - (conv.i >> 1);
543 float y = conv.f;
544 // One Newton-Raphson iteration is accurate enough for geometry offsets.
545 y = y * (1.5f - 0.5f * x * y * y);
546 return y;
547}
548
549// clang-format off
550// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
551// vim: shiftwidth=2 expandtab tabstop=2 cindent
552// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
553// clang-format on
#define m
Definition basecurve.c:277
static const dt_aligned_pixel_simd_t const dt_adaptation_t const float p
Definition chromatic_adaptation.h:315
static const float scaling
Definition chromatic_adaptation.h:299
const float i
Definition colorspaces_inline_conversions.h:669
const float c
Definition colorspaces_inline_conversions.h:1365
const dt_aligned_pixel_t f
Definition colorspaces_inline_conversions.h:256
const float a
Definition colorspaces_inline_conversions.h:1292
float dt_aligned_pixel_simd_t __attribute__((vector_size(16), aligned(16)))
Multi-tap smudge source sample with directional jitter.
Definition darktable.h:448
#define for_four_channels(_var,...)
Definition darktable.h:584
static const float x
Definition iop_profile.h:239
const int t
Definition iop_profile.h:227
static const float v
Definition iop_profile.h:223
static float Kahan_sum(const float m, float *const __restrict__ c, const float add)
Definition math.h:105
static float scalar_product(const dt_aligned_pixel_t v_1, const dt_aligned_pixel_t v_2)
Definition math.h:204
static float ceil_fast(float x)
Definition math.h:385
static float clamp_range_f(const float x, const float low, const float high)
Definition math.h:96
static float sqf(const float x)
Definition math.h:223
static float dt_log2f(const float f)
Definition math.h:262
#define DT_M_LN2f
Definition math.h:55
static int ipow(int base, int exp)
Fast integer power, computing base^exp.
Definition math.h:476
static void mul_mat_vec_2(const float *m, const float *p, float *o)
Definition math.h:195
#define NORM_MIN
Definition math.h:35
static void upscale_vector(dt_aligned_pixel_t vector, const float scaling)
Definition math.h:252
static float f_inv_sqrtf(const float x)
Definition math.h:532
static float fast_mexp2f(const float x)
Definition math.h:367
static float sinf_fast(float t)
Definition math.h:421
static float dt_fast_expf(const float x)
Definition math.h:290
static float dt_fast_hypotf(const float x, const float y)
Definition math.h:280
static float dt_fast_mexp2f(const float x)
Definition math.h:351
static void mat3mul(float *const __restrict__ dest, const float *const __restrict__ m1, const float *const __restrict__ m2)
Definition math.h:178
static float euclidean_norm(const dt_aligned_pixel_t vector)
Definition math.h:232
static void dt_vector_sin(const dt_aligned_pixel_t arg, dt_aligned_pixel_t sine)
Definition math.h:503
#define M_PI_F
Definition math.h:48
static float fastlog2(float x)
Definition math.h:135
static float fastlog(float x)
Definition math.h:151
static float Log2(float x)
Definition math.h:124
static float Log2Thres(float x, float Thres)
Definition math.h:129
static void dt_fast_expf_4wide(const float x[4], float result[4])
Definition math.h:305
#define M_PI
Definition math.h:45
static void mat3mulv(float *const __restrict__ dest, const float *const mat, const float *const __restrict__ v)
Definition math.h:161
static void downscale_vector(dt_aligned_pixel_t vector, const float scaling)
Definition math.h:241
mask
Definition dtstyle_to_xmp.py:79
Definition math.h:271
float f
Definition math.h:272
int k
Definition math.h:273