Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
math.h
Go to the documentation of this file.
1/*
2 * This file is part of darktable,
3 * Copyright (C) 2018-2021 darktable developers.
4 *
5 * darktable is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * darktable is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#pragma once
20
21#include <stddef.h>
22#include <math.h>
23#include <stdint.h>
24
25#include "common/darktable.h"
26
27#define NORM_MIN 1.52587890625e-05f // norm can't be < to 2^(-16)
28
29// work around missing standard math.h symbols
31#ifndef M_LN10
32#define M_LN10 2.30258509299404568402
33#endif /* !M_LN10 */
34
36#ifndef M_PI
37#define M_PI 3.14159265358979323846
38#endif /* !M_PI */
39#ifndef M_PI_F
40#define M_PI_F 3.14159265358979324f
41#endif /* !M_PI_F */
42
43
44#define DT_M_PI_F (3.14159265358979324f)
45#define DT_M_PI (3.14159265358979324)
46
47#define DT_M_LN2f (0.6931471805599453f)
48
49// If platform supports hardware-accelerated fused-multiply-add
50// This is not only faster but more accurate because rounding happens at the right place
51#ifdef FP_FAST_FMAF
52 #define DT_FMA(x, y, z) fmaf(x, y, z)
53#else
54 #define DT_FMA(x, y, z) ((x) * (y) + (z))
55#endif
56
57// Golden number (1+sqrt(5))/2
58#ifndef PHI
59#define PHI 1.61803398874989479F
60#endif
61
62// 1/PHI
63#ifndef INVPHI
64#define INVPHI 0.61803398874989479F
65#endif
66
67// NaN-safe clamping (NaN compares false, and will thus result in H)
68#define CLAMPS(A, L, H) ((A) > (L) ? ((A) < (H) ? (A) : (H)) : (L))
69
70// clip channel value to be between 0 and 1
71// NaN-safe: NaN compares false and will result in 0.0
72// also does not force promotion of floats to doubles, but will use the type of its argument
73#define CLIP(x) (((x) >= 0) ? ((x) <= 1 ? (x) : 1) : 0)
74#define MM_CLIP_PS(X) (_mm_min_ps(_mm_max_ps((X), _mm_setzero_ps()), _mm_set1_ps(1.0)))
75
76// clip luminance values to be between 0 and 100
77#define LCLIP(x) ((x < 0) ? 0.0 : (x > 100.0) ? 100.0 : x)
78
79// clamp value to lie between mn and mx
80// Nan-safe: NaN compares false and will result in mn
81#define CLAMPF(a, mn, mx) ((a) >= (mn) ? ((a) <= (mx) ? (a) : (mx)) : (mn))
82//#define CLAMPF(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
83
84#if defined(__SSE__)
85#define MMCLAMPPS(a, mn, mx) (_mm_min_ps((mx), _mm_max_ps((a), (mn))))
86#endif
87
88static inline float clamp_range_f(const float x, const float low, const float high)
89{
90 return x > high ? high : (x < low ? low : x);
91}
92
93// Kahan summation algorithm
94#ifdef _OPENMP
95#pragma omp declare simd aligned(c)
96#endif
97static inline float Kahan_sum(const float m, float *const __restrict__ c, const float add)
98{
99 const float t1 = add - (*c);
100 const float t2 = m + t1;
101 *c = (t2 - m) - t1;
102 return t2;
103}
104
105#ifdef __SSE2__
106// vectorized Kahan summation algorithm
107static inline __m128 Kahan_sum_sse(const __m128 m, __m128 *const __restrict__ c, const __m128 add)
108{
109 const __m128 t1 = add - (*c);
110 const __m128 t2 = m + t1;
111 *c = (t2 - m) - t1;
112 return t2;
113}
114#endif /* __SSE2__ */
115
116static inline float Log2(float x)
117{
118 return (x > 0.0f) ? (logf(x) / DT_M_LN2f) : x;
119}
120
121static inline float Log2Thres(float x, float Thres)
122{
123 return logf(x > Thres ? x : Thres) / DT_M_LN2f;
124}
125
126// ensure that any changes here are synchronized with data/kernels/extended.cl
127static inline float fastlog2(float x)
128{
129 union { float f; uint32_t i; } vx = { x };
130 union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
131
132 float y = vx.i;
133
134 y *= 1.1920928955078125e-7f;
135
136 return y - 124.22551499f
137 - 1.498030302f * mx.f
138 - 1.72587999f / (0.3520887068f + mx.f);
139}
140
141// ensure that any changes here are synchronized with data/kernels/extended.cl
142static inline float
143fastlog (float x)
144{
145 return DT_M_LN2f * fastlog2(x);
146}
147
148// multiply 3x3 matrix with 3x1 vector
149// dest needs to be different from v
150#ifdef _OPENMP
151#pragma omp declare simd
152#endif
153static inline void mat3mulv(float *const __restrict__ dest, const float *const mat, const float *const __restrict__ v)
154{
155 for(int k = 0; k < 3; k++)
156 {
157 float x = 0.0f;
158 for(int i = 0; i < 3; i++)
159 x += mat[3 * k + i] * v[i];
160 dest[k] = x;
161 }
162}
163
164// multiply two 3x3 matrices
165// dest needs to be different from m1 and m2
166// dest = m1 * m2 in this order
167#ifdef _OPENMP
168#pragma omp declare simd
169#endif
170static inline void mat3mul(float *const __restrict__ dest, const float *const __restrict__ m1, const float *const __restrict__ m2)
171{
172 for(int k = 0; k < 3; k++)
173 {
174 for(int i = 0; i < 3; i++)
175 {
176 float x = 0.0f;
177 for(int j = 0; j < 3; j++)
178 x += m1[3 * k + j] * m2[3 * j + i];
179 dest[3 * k + i] = x;
180 }
181 }
182}
183
184#ifdef _OPENMP
185#pragma omp declare simd
186#endif
187static inline void mul_mat_vec_2(const float *m, const float *p, float *o)
188{
189 o[0] = p[0] * m[0] + p[1] * m[1];
190 o[1] = p[0] * m[2] + p[1] * m[3];
191}
192
193#ifdef _OPENMP
194#pragma omp declare simd uniform(v_2) aligned(v_1, v_2:16)
195#endif
196static inline float scalar_product(const dt_aligned_pixel_t v_1, const dt_aligned_pixel_t v_2)
197{
198 // specialized 3x1 dot products 2 4x1 RGB-alpha pixels.
199 // v_2 needs to be uniform along loop increments, e.g. independent from current pixel values
200 // we force an order of computation similar to SSE4 _mm_dp_ps() hoping the compiler will get the clue
201 float acc = 0.f;
202
203#ifdef _OPENMP
204#pragma omp simd aligned(v_1, v_2:16) reduction(+:acc)
205#endif
206 for(size_t c = 0; c < 3; c++) acc += v_1[c] * v_2[c];
207
208 return acc;
209}
210
211
212#ifdef _OPENMP
213#pragma omp declare simd
214#endif
215static inline float sqf(const float x)
216{
217 return x * x;
218}
219
220
221#ifdef _OPENMP
222#pragma omp declare simd aligned(vector:16)
223#endif
224static inline float euclidean_norm(const dt_aligned_pixel_t vector)
225{
226 return fmaxf(sqrtf(sqf(vector[0]) + sqf(vector[1]) + sqf(vector[2])), NORM_MIN);
227}
228
229
230#ifdef _OPENMP
231#pragma omp declare simd aligned(vector:16)
232#endif
233static inline void downscale_vector(dt_aligned_pixel_t vector, const float scaling)
234{
235 // check zero or NaN
236 const int valid = (scaling > NORM_MIN) && !isnan(scaling);
237 for(size_t c = 0; c < 3; c++) vector[c] = (valid) ? vector[c] / (scaling + NORM_MIN) : vector[c] / NORM_MIN;
238}
239
240
241#ifdef _OPENMP
242#pragma omp declare simd aligned(vector:16)
243#endif
244static inline void upscale_vector(dt_aligned_pixel_t vector, const float scaling)
245{
246 const int valid = (scaling > NORM_MIN) && !isnan(scaling);
247 for(size_t c = 0; c < 3; c++) vector[c] = (valid) ? vector[c] * (scaling + NORM_MIN) : vector[c] * NORM_MIN;
248}
249
250
251#ifdef _OPENMP
252#pragma omp declare simd
253#endif
254static inline float dt_log2f(const float f)
255{
256#ifdef __GLIBC__
257 return log2f(f);
258#else
259 return logf(f) / logf(2.0f);
260#endif
261}
262
264 float f;
265 int k;
266};
267
268// a faster, vectorizable version of hypotf() when we know that there won't be overflow, NaNs, or infinities
269#ifdef _OPENMP
270#pragma omp declare simd
271#endif
272static inline float dt_fast_hypotf(const float x, const float y)
273{
274 return sqrtf(x * x + y * y);
275}
276
277// fast approximation of expf()
278/****** if you change this function, you need to make the same change in data/kernels/{basecurve,basic}.cl ***/
279#ifdef _OPENMP
280#pragma omp declare simd
281#endif
282static inline float dt_fast_expf(const float x)
283{
284 // meant for the range [-100.0f, 0.0f]. largest error ~ -0.06 at 0.0f.
285 // will get _a_lot_ worse for x > 0.0f (9000 at 10.0f)..
286 const int i1 = 0x3f800000u;
287 // e^x, the comment would be 2^x
288 const int i2 = 0x402DF854u; // 0x40000000u;
289 // const int k = CLAMPS(i1 + x * (i2 - i1), 0x0u, 0x7fffffffu);
290 // without max clamping (doesn't work for large x, but is faster):
291 const int k0 = i1 + x * (i2 - i1);
292 union float_int u;
293 u.k = k0 > 0 ? k0 : 0;
294 return u.f;
295}
296
297static inline void dt_fast_expf_4wide(const float x[4], float result[4])
298{
299 // meant for the range [-100.0f, 0.0f]. largest error ~ -0.06 at 0.0f.
300 // will get _a_lot_ worse for x > 0.0f (9000 at 10.0f)..
301 const int i1 = 0x3f800000u;
302 // e^x, the comment would be 2^x
303 const int i2 = 0x402DF854u; // 0x40000000u;
304 // const int k = CLAMPS(i1 + x * (i2 - i1), 0x0u, 0x7fffffffu);
305 // without max clamping (doesn't work for large x, but is faster):
306 union float_int u[4];
307#ifdef _OPENMP
308#pragma omp simd aligned(x, result)
309#endif
310 for(size_t c = 0; c < 4; c++)
311 {
312 const int k0 = i1 + (int)(x[c] * (i2 - i1));
313 u[c].k = k0 > 0 ? k0 : 0;
314 result[c] = u[c].f;
315 }
316}
317
318#if defined(__SSE2__)
319#define ALIGNED(a) __attribute__((aligned(a)))
320#define VEC4(a) \
321 { \
322 (a), (a), (a), (a) \
323 }
324
325/* SSE intrinsics version of dt_fast_expf */
326static const __m128 dt__fone ALIGNED(64) = VEC4(0x3f800000u);
327static const __m128 femo ALIGNED(64) = VEC4(0x00adf880u);
328static inline __m128 dt_fast_expf_sse2(const __m128 x)
329{
330 __m128 f = dt__fone + (x * femo); // f(n) = i1 + x(n)*(i2-i1)
331 __m128i i = _mm_cvtps_epi32(f); // i(n) = int(f(n))
332 __m128i mask = _mm_srai_epi32(i, 31); // mask(n) = 0xffffffff if i(n) < 0
333 i = _mm_andnot_si128(mask, i); // i(n) = 0 if i(n) < 0
334 return _mm_castsi128_ps(i); // return *(float*)&i
335}
336#undef ALIGNED
337#undef VEC4
338
339#endif // __SSE2__
340
341// fast approximation of 2^-x for 0<x<126
342/****** if you change this function, you need to make the same change in data/kernels/{denoiseprofile,nlmeans}.cl ***/
343static inline float dt_fast_mexp2f(const float x)
344{
345 const int i1 = 0x3f800000; // bit representation of 2^0
346 const int i2 = 0x3f000000; // bit representation of 2^-1
347 const int k0 = i1 + (int)(x * (i2 - i1));
348 union {
349 float f;
350 int i;
351 } k;
352 k.i = k0 >= 0x800000 ? k0 : 0;
353 return k.f;
354}
355
356// The below version is incorrect, suffering from reduced precision.
357// It is used by the non-local means code in both nlmeans.c and
358// denoiseprofile.c, and fixing it would cause a change in output.
359static inline float fast_mexp2f(const float x)
360{
361 const float i1 = (float)0x3f800000u; // 2^0
362 const float i2 = (float)0x3f000000u; // 2^-1
363 const float k0 = i1 + x * (i2 - i1);
364 union {
365 float f;
366 int i;
367 } k;
368 k.i = k0 >= (float)0x800000u ? k0 : 0;
369 return k.f;
370}
371
377static inline float ceil_fast(float x)
378{
379 if(x <= 0.f)
380 {
381 return (float)(int)x;
382 }
383 else
384 {
385 return -((float)(int)-x) + 1.f;
386 }
387}
388
389#if defined(__SSE2__)
static inline __m128 _mm_abs_ps(__m128 t)
394{
395 static const uint32_t signmask[4] __attribute__((aligned(64)))
396 = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
397 return _mm_and_ps(*(__m128 *)signmask, t);
398}
399#endif
400
413static inline float sinf_fast(float t)
414{
415 /***** if you change this function, you must also change the copy in data/kernels/basic.cl *****/
416 static const float a = 4 / (M_PI * M_PI);
417 static const float p = 0.225f;
418
419 t = a * t * (M_PI_F - fabsf(t));
420
421 return t * (p * (fabsf(t) - 1) + 1);
422}
423
424#if defined(__SSE2__)
437static inline __m128 sinf_fast_sse(__m128 t)
438{
439 static const __m128 a
440 = { 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI), 4.f / (M_PI * M_PI) };
441 static const __m128 p = { 0.225f, 0.225f, 0.225f, 0.225f };
442 static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
443
444 // m4 = a*t*(M_PI - fabsf(t));
445 const __m128 m1 = _mm_abs_ps(t);
446 const __m128 m2 = _mm_sub_ps(pi, m1);
447 const __m128 m3 = _mm_mul_ps(t, m2);
448 const __m128 m4 = _mm_mul_ps(a, m3);
449
450 // p*(m4*fabsf(m4) - m4) + m4;
451 const __m128 n1 = _mm_abs_ps(m4);
452 const __m128 n2 = _mm_mul_ps(m4, n1);
453 const __m128 n3 = _mm_sub_ps(n2, m4);
454 const __m128 n4 = _mm_mul_ps(p, n3);
455
456 return _mm_add_ps(n4, m4);
457}
458#endif
459
460
468static inline int ipow(int base, int exp)
469{
470 int result = 1;
471 for(;;)
472 {
473 if (exp & 1)
474 result *= base;
475 exp >>= 1;
476 if (!exp)
477 break;
478 base *= base;
479 }
480 return result;
481}
482
495static inline void dt_vector_sin(const dt_aligned_pixel_t arg,
496 dt_aligned_pixel_t sine)
497{
498 static const dt_aligned_pixel_t pi = { M_PI_F, M_PI_F, M_PI_F, M_PI_F };
499 static const dt_aligned_pixel_t a
500 = { 4 / (M_PI_F * M_PI_F),
501 4 / (M_PI_F * M_PI_F),
502 4 / (M_PI_F * M_PI_F),
503 4 / (M_PI_F * M_PI_F) };
504 static const dt_aligned_pixel_t p = { 0.225f, 0.225f, 0.225f, 0.225f };
505 static const dt_aligned_pixel_t one = { 1.0f, 1.0f, 1.0f, 1.0f };
506
507 dt_aligned_pixel_t abs_arg;
509 abs_arg[c] = (arg[c] < 0.0f) ? -arg[c] : arg[c];
510 dt_aligned_pixel_t scaled;
512 scaled[c] = a[c] * arg[c] * (pi[c] - abs_arg[c]);
513 dt_aligned_pixel_t abs_scaled;
515 abs_scaled[c] = (scaled[c] < 0.0f) ? -scaled[c] : scaled[c];
517 sine[c] = scaled[c] * (p[c] * (abs_scaled[c] - one[c]) + one[c]);
518}
519
520// clang-format off
521// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
522// vim: shiftwidth=2 expandtab tabstop=2 cindent
523// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
524// clang-format on
#define m
Definition basecurve.c:231
#define for_four_channels(_var,...)
Definition darktable.h:413
static float f(const float t, const float c, const float x)
Definition graduatednd.c:173
static float Kahan_sum(const float m, float *const __restrict__ c, const float add)
Definition math.h:97
static float scalar_product(const dt_aligned_pixel_t v_1, const dt_aligned_pixel_t v_2)
Definition math.h:196
static float ceil_fast(float x)
Definition math.h:377
static float clamp_range_f(const float x, const float low, const float high)
Definition math.h:88
static float sqf(const float x)
Definition math.h:215
static float dt_log2f(const float f)
Definition math.h:254
#define DT_M_LN2f
Definition math.h:47
static int ipow(int base, int exp)
Fast integer power, computing base^exp.
Definition math.h:468
static void mul_mat_vec_2(const float *m, const float *p, float *o)
Definition math.h:187
#define NORM_MIN
Definition math.h:27
static void upscale_vector(dt_aligned_pixel_t vector, const float scaling)
Definition math.h:244
static float fast_mexp2f(const float x)
Definition math.h:359
static float sinf_fast(float t)
Definition math.h:413
static float dt_fast_expf(const float x)
Definition math.h:282
static float dt_fast_hypotf(const float x, const float y)
Definition math.h:272
static float dt_fast_mexp2f(const float x)
Definition math.h:343
static void mat3mul(float *const __restrict__ dest, const float *const __restrict__ m1, const float *const __restrict__ m2)
Definition math.h:170
static float euclidean_norm(const dt_aligned_pixel_t vector)
Definition math.h:224
static void dt_vector_sin(const dt_aligned_pixel_t arg, dt_aligned_pixel_t sine)
Definition math.h:495
#define M_PI_F
Definition math.h:40
static float fastlog2(float x)
Definition math.h:127
static float fastlog(float x)
Definition math.h:143
static float Log2(float x)
Definition math.h:116
static float Log2Thres(float x, float Thres)
Definition math.h:121
static void dt_fast_expf_4wide(const float x[4], float result[4])
Definition math.h:297
#define M_PI
Definition math.h:37
static void mat3mulv(float *const __restrict__ dest, const float *const mat, const float *const __restrict__ v)
Definition math.h:153
static void downscale_vector(dt_aligned_pixel_t vector, const float scaling)
Definition math.h:233
c
Definition derive_filmic_v6_gamut_mapping.py:11
mask
Definition dtstyle_to_xmp.py:54
str p
Definition extract_wb.py:157
static float __attribute__((__unused__))
Definition thinplate.c:39
Definition math.h:263
float f
Definition math.h:264
int k
Definition math.h:265