Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
sse.h
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2017-2020 darktable developers.
4 darktable is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8 darktable is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License
13 along with darktable. If not, see <http://www.gnu.org/licenses/>.
14*/
15#pragma once
16
17#include <xmmintrin.h>
18
19
/*
 * Horner-scheme polynomial evaluation on four packed floats.
 *
 * POLYn(x, c0, ..., cn) expands to c0 + x * (c1 + x * (... + x * cn)), where
 * every scalar coefficient is broadcast to all lanes with _mm_set1_ps().
 *
 * These are macros, not functions: `x` is expanded once per multiplication, so
 * pass a plain variable rather than an expression with side effects.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) \
  _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) \
  _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Compile-time selectors for the coefficient sets used by _mm_exp2_ps() and _mm_log2_ps(). */
#define EXP_POLY_DEGREE 4
#define LOG_POLY_DEGREE 5
33
37static inline __m128 _mm_exp2_ps(__m128 x)
38{
39 __m128i ipart;
40 __m128 fpart, expipart, expfpart;
41
42 x = _mm_min_ps(x, _mm_set1_ps(129.00000f));
43 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
44
45 /* ipart = int(x - 0.5) */
46 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
47
48 /* fpart = x - ipart */
49 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
50
51 /* expipart = (float) (1 << ipart) */
52 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
53
54/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
55#if EXP_POLY_DEGREE == 5
56 expfpart
57 = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
58#elif EXP_POLY_DEGREE == 4
59 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
60#elif EXP_POLY_DEGREE == 3
61 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
62#elif EXP_POLY_DEGREE == 2
63 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
64#else
65#error
66#endif
67
68 return _mm_mul_ps(expipart, expfpart);
69}
70
71
75static inline __m128 _mm_log2_ps(__m128 x)
76{
77 __m128i expmask = _mm_set1_epi32(0x7f800000);
78 __m128i mantmask = _mm_set1_epi32(0x007fffff);
79 __m128 one = _mm_set1_ps(1.0f);
80
81 __m128i i = _mm_castps_si128(x);
82
83 /* exp = (float) exponent(x) */
84 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
85
86 /* mant = (float) mantissa(x) */
87 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
88
89 __m128 logmant;
90
91/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
92 * These coefficients can be generate with
93 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
94 */
95#if LOG_POLY_DEGREE == 6
96 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f,
97 -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
98#elif LOG_POLY_DEGREE == 5
99 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f,
100 -0.465725644288844778798f, 0.0596515482674574969533f);
101#elif LOG_POLY_DEGREE == 4
102 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f,
103 -0.107254423828329604454f);
104#elif LOG_POLY_DEGREE == 3
105 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
106#else
107#error
108#endif
109
110 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
111 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
112
113 return _mm_add_ps(logmant, exp);
114}
115
116static inline __m128 _mm_pow_ps(__m128 x, __m128 y)
117{
118 return _mm_exp2_ps(_mm_mul_ps(_mm_log2_ps(x), y));
119}
120
121static inline __m128 _mm_pow_ps1(__m128 x, float y)
122{
123 return _mm_exp2_ps(_mm_mul_ps(_mm_log2_ps(x), _mm_set1_ps(y)));
124}
125
126
/**
 * Read one lane of a packed-float vector.
 *
 * @param V  vector to read from
 * @param i  lane index; must be in 0..3 -- it is not range-checked, and a
 *           larger value reads past the four-element buffer
 * @return   the float stored in lane `i` of `V`
 */
static inline float _mm_vectorGetByIndex( __m128 V, unsigned int i)
{
  /* spill the four lanes to memory so they can be indexed at run time */
  float lanes[4];
  _mm_storeu_ps(lanes, V);
  return lanes[i];
}
141
142// clang-format off
143// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
144// vim: shiftwidth=2 expandtab tabstop=2 cindent
145// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
146// clang-format on
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition sse.h:29
static __m128 _mm_log2_ps(__m128 x)
Definition sse.h:75
#define POLY3(x, c0, c1, c2, c3)
Definition sse.h:27
static __m128 _mm_pow_ps1(__m128 x, float y)
Definition sse.h:121
static __m128 _mm_exp2_ps(__m128 x)
Definition sse.h:37
#define POLY2(x, c0, c1, c2)
Definition sse.h:26
#define POLY4(x, c0, c1, c2, c3, c4)
Definition sse.h:28
static float _mm_vectorGetByIndex(__m128 V, unsigned int i)
Definition sse.h:131
static __m128 _mm_pow_ps(__m128 x, __m128 y)
Definition sse.h:116