Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
choleski.h
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2019, 2026 Aurélien PIERRE.
4 Copyright (C) 2019 Heiko Bauke.
5 Copyright (C) 2019 luzpaz.
6 Copyright (C) 2019-2020 Pascal Obry.
7 Copyright (C) 2020 Ralf Brown.
8 Copyright (C) 2022 Martin Bařinka.
9 Copyright (C) 2022 Sakari Kapanen.
10 Copyright (C) 2023 Luca Zulberti.
11
12 darktable is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 darktable is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with darktable. If not, see <http://www.gnu.org/licenses/>.
24*/
25
26#include <math.h>
27#include <stdlib.h>
28#include <stdio.h>
29#include <string.h>
30#include <time.h>
31
32#include "common/darktable.h"
33#include "common/imagebuf.h"
35
36
37/* DOCUMENTATION
38 *
39 * Choleski decomposition is a fast way to solve linear systems of equations
40 * described by a positive definite hermitian (= square and symmetrical) matrix.
41 * It is a special case of LU decompositions enabling special optimizations.
42 * For matrices not matching this requirement,
43 * you need to use the Gauss-Jordan elimination in iop/gaussian_elemination.h,
44 * which is about twice as slow but more general.
45 *
46 * To solve A x = y, for x, with A a positive definite hermitian real matrix :
47 *
48 * 1) find L such that A = L x L' (Choleski decomposition)
49 * 2) Second step : solve L x b = y for b (triangular descent)
50 * 3) Third step : solve L' x x = b for x (triangular ascent)
51 *
52 * L is a lower diagonal matrix such that :
53 *
54 * [ l11 0 0 ]
55 * L = [ l12 l22 0 ] (if n = 3)
56 * [ l13 l23 l33 ]
57 *
58 * and L' is its transpose :
59 *
60 * [ l11 l12 l13 ]
61 * L' = [ 0 l22 l23 ] (if n = 3)
62 * [ 0 0 l33 ]
63 *
64 * We use the Cholesky-Banachiewicz algorithm for the decomposition because it
65 * operates row by row, which is more suitable with C memory layout.
66 *
67 * The return codes are in sync with /iop/gaussian_elimination.h, in order, maybe one day,
68 * to have a general linear solving lib that could for example iterate over methods until
69 * one succeed.
70 *
71 * We didn't bother to parallelize the code nor to make it use double floating point precision
72 * because it's already fast enough (2 to 45 ms for 16x16 matrix on Xeon) and used for properly
73 * conditioned matrices.
74 *
75 * Vectorization leads to slow-downs here since we access matrices row-wise and column-wise,
76 * in a non-contiguous fashion.
77 *
78 * References :
79 * "Analyse numérique pour ingénieurs", 4e edition, André Fortin,
80 * Presses Internationales de Polytechnique Montréal, 2011.
81 *
82 * Cholesky method,
83 * https://algowiki-project.org/en/Cholesky_method#The_.5Bmath.5DLL.5ET.5B.2Fmath.5D_decomposition
84 * https://en.wikipedia.org/wiki/Cholesky_decomposition
85 * https://rosettacode.org/wiki/Cholesky_decomposition#C
86 *
87 */
88
89
90static inline int choleski_decompose_fast(const float *const restrict A,
91 float *const restrict L, size_t n)
92{
93 // A is input nxn matrix, decompose it into L such that A = L x L'
94 // fast variant : we don't check values for negatives in sqrt,
95 // ensure you know the properties of your matrix.
96
97 if(A[0] <= 0.0f) return 0; // failure : non positive definite matrice
98
99 for(size_t i = 0; i < n; i++)
100 for(size_t j = 0; j < (i + 1); j++)
101 {
102 float sum = 0.0f;
103
104 for(size_t k = 0; k < j; k++)
105 sum += L[i * n + k] * L[j * n + k];
106
107 L[i * n + j] = (i == j) ?
108 sqrtf(A[i * n + i] - sum) :
109 (A[i * n + j] - sum) / L[j * n + j];
110 }
111
112 return 1; // success
113}
114
115
116static inline int choleski_decompose_safe(const float *const restrict A,
117 float *const restrict L, size_t n)
118{
119 // A is input nxn matrix, decompose it into L such that A = L x L'
120 // slow and safe variant : we check values for negatives in sqrt and divisions by 0.
121
122 if(A[0] <= 0.0f) return 0; // failure : non positive definite matrice
123
124 int valid = 1;
125
126 for(size_t i = 0; i < n; i++)
127 for(size_t j = 0; j < (i + 1); j++)
128 {
129 float sum = 0.0f;
130
131 for(size_t k = 0; k < j; k++)
132 sum += L[i * n + k] * L[j * n + k];
133
134 if(i == j)
135 {
136 const float temp = A[i * n + i] - sum;
137
138 if(temp < 0.0f)
139 {
140 valid = 0;
141 L[i * n + j] = NAN;
142 }
143 else
144 L[i * n + j] = sqrtf(A[i * n + i] - sum);
145 }
146 else
147 {
148 const float temp = L[j * n + j];
149
150 if(temp == 0.0f)
151 {
152 valid = 0;
153 L[i * n + j] = NAN;
154 }
155 else
156 L[i * n + j] = (A[i * n + j] - sum) / temp;
157 }
158 }
159
160 return valid; // success ?
161}
162
163
164static inline int triangular_descent_fast(const float *const restrict L,
165 const float *const restrict y, float *const restrict b,
166 const size_t n)
167{
168 // solve L x b = y for b
169 // use the lower triangular part of L from top to bottom
170
171 for(size_t i = 0; i < n; ++i)
172 {
173 float sum = y[i];
174 for(size_t j = 0; j < i; ++j)
175 sum -= L[i * n + j] * b[j];
176
177 b[i] = sum / L[i * n + i];
178 }
179
180 return 1; // success !
181}
182
183
184static inline int triangular_descent_safe(const float *const restrict L,
185 const float *const restrict y, float *const restrict b,
186 const size_t n)
187{
188 // solve L x b = y for b
189 // use the lower triangular part of L from top to bottom
190
191 int valid = 1;
192
193 for(size_t i = 0; i < n; ++i)
194 {
195 float sum = y[i];
196 for(size_t j = 0; j < i; ++j)
197 sum -= L[i * n + j] * b[j];
198
199 const float temp = L[i * n + i];
200
201 if(temp != 0.0f)
202 b[i] = sum / temp;
203 else
204 {
205 b[i] = NAN;
206 valid = 0;
207 }
208 }
209
210 return valid; // success ?
211}
212
213
214static inline int triangular_ascent_fast(const float *const restrict L,
215 const float *const restrict b, float *const restrict x,
216 const size_t n)
217{
218 // solve L' x x = b for x
219 // use the lower triangular part of L transposed from bottom to top
220
221 for(int i = (n - 1); i > -1 ; --i)
222 {
223 float sum = b[i];
224 for(int j = (n - 1); j > i; --j)
225 sum -= L[j * n + i] * x[j];
226
227 x[i] = sum / L[i * n + i];
228 }
229
230 return 1; // success !
231}
232
233
234static inline int triangular_ascent_safe(const float *const restrict L,
235 const float *const restrict b, float *const restrict x,
236 const size_t n)
237{
238 // solve L' x x = b for x
239 // use the lower triangular part of L transposed from bottom to top
240
241 int valid = 1;
242
243 for(int i = (n - 1); i > -1 ; --i)
244 {
245 float sum = b[i];
246 for(int j = (n - 1); j > i; --j)
247 sum -= L[j * n + i] * x[j];
248
249 const float temp = L[i * n + i];
250 if(temp != 0.0f)
251 x[i] = sum / temp;
252 else
253 {
254 x[i] = NAN;
255 valid = 0;
256 }
257 }
258
259 return valid; // success ?
260}
261
262
263static inline int solve_hermitian(const float *const restrict A,
264 float *const restrict y,
265 const size_t n, const int checks)
266{
267 // Solve A x = y where A an hermitian positive definite matrix n x n
268 // x and y are n vectors. Output the result in y
269
270 // A and y need to be 64-bits aligned, which is darktable's default memory alignment
271 // if you used DT_ALIGNED_ARRAY and dt_alloc_align_float(...) to declare arrays and pointers
272
273 // If you are sure about the properties of the matrix A (symmetrical square definite positive)
274 // because you built it yourself, set checks == FALSE to branch to the fast track that
275 // skips validity checks.
276
277 // If you are unsure about A, because it is user-set, set checks == TRUE to branch
278 // to the safe but slower path.
279
280 // clock_t start = clock();
281
282 int valid = 0;
283 int err = 0;
284 float *const restrict x = dt_alloc_align_float(n);
285 float *const restrict L = dt_alloc_align_float(n * n);
286
287 if(!x || !L)
288 {
289 dt_control_log(_("Choleski decomposition failed to allocate memory, check your RAM settings"));
290 fprintf(stdout, "Choleski decomposition failed to allocate memory, check your RAM settings\n");
291 err = 1;
292 goto error;
293 }
294
295 // LU decomposition
296 valid = (checks) ? choleski_decompose_safe(A, L, n) :
298 if(!valid) fprintf(stdout, "Cholesky decomposition returned NaNs\n");
299
300 // Triangular descent
301 if(valid)
302 valid = (checks) ? triangular_descent_safe(L, y, x, n) :
304 if(!valid) fprintf(stdout, "Cholesky LU triangular descent returned NaNs\n");
305
306 // Triangular ascent
307 if(valid)
308 valid = (checks) ? triangular_ascent_safe(L, x, y, n) :
310 if(!valid) fprintf(stdout, "Cholesky LU triangular ascent returned NaNs\n");
311
312 if(!valid) err = 1;
313
314error:
317
318 //clock_t end = clock();
319 //fprintf(stdout, "hermitian matrix solving took : %f s\n", ((float) (end - start)) / CLOCKS_PER_SEC);
320
321 return err;
322}
323
324
325static inline int transpose_dot_matrix(float *const restrict A, // input
326 float *const restrict A_square, // output
327 const size_t m, const size_t n)
328{
329 // Construct the square symmetrical definite positive matrix A' A,
330 // BUT only compute the lower triangle part for performance
331
332 for(size_t i = 0; i < n; ++i)
333 for(size_t j = 0; j < (i + 1); ++j)
334 {
335 float sum = 0.0f;
336 for(size_t k = 0; k < m; ++k)
337 sum += A[k * n + i] * A[k * n + j];
338
339 A_square[i * n + j] = sum;
340 }
341
342 return 0;
343}
344
345
346static inline int transpose_dot_vector(float *const restrict A, // input
347 float *const restrict y, // input
348 float *const restrict y_square, // output
349 const size_t m, const size_t n)
350{
351 // Construct the vector A' y
352
353 for(size_t i = 0; i < n; ++i)
354 {
355 float sum = 0.0f;
356 for(size_t k = 0; k < m; ++k)
357 sum += A[k * n + i] * y[k];
358
359 y_square[i] = sum;
360 }
361
362 return 0;
363}
364
365
366static inline int pseudo_solve(float *const restrict A,
367 float *const restrict y,
368 const size_t m, const size_t n, const int checks)
369{
370 // Solve the linear problem A x = y with the over-constrained rectanguler matrice A
371 // of dimension m x n (m >= n) by the least squares method
372
373 //clock_t start = clock();
374
375 int err = 0;
376 if(m < n)
377 {
378 fprintf(stdout, "Pseudo solve: cannot cast %zu \303\227 %zu matrice\n", m, n);
379 return 1;
380 }
381
382 float *const restrict A_square = dt_alloc_align_float(n * n);
383 float *const restrict y_square = dt_alloc_align_float(n);
384
385 if(!A_square || !y_square)
386 {
387 dt_control_log(_("Choleski decomposition failed to allocate memory, check your RAM settings"));
388 err = 1;
389 goto error;
390 }
391
392 #ifdef _OPENMP
393 #pragma omp parallel sections
394 #endif
395 {
396 #ifdef _OPENMP
397 #pragma omp section
398 #endif
399 {
400 // Prepare the least squares matrix = A' A
401 transpose_dot_matrix(A, A_square, m, n);
402 }
403
404 #ifdef _OPENMP
405 #pragma omp section
406 #endif
407 {
408 // Prepare the y square vector = A' y
409 transpose_dot_vector(A, y, y_square, m, n);
410 }
411 }
412
413
414 // Solve A' A x = A' y for x
415 if(solve_hermitian(A_square, y_square, n, checks))
416 {
417 err = 1;
418 goto error;
419 }
420 dt_simd_memcpy(y_square, y, n);
421
422error:
423 dt_free_align(y_square);
424 dt_free_align(A_square);
425
426 //clock_t end = clock();
427 //fprintf(stdout, "hermitian matrix solving took : %f s\n", ((float) (end - start)) / CLOCKS_PER_SEC);
428
429 return err;
430}
431
432
433// clang-format off
434// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
435// vim: shiftwidth=2 expandtab tabstop=2 cindent
436// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
437// clang-format on
static void error(char *msg)
Definition ashift_lsd.c:202
#define m
Definition basecurve.c:277
static int transpose_dot_vector(float *const restrict A, float *const restrict y, float *const restrict y_square, const size_t m, const size_t n)
Definition choleski.h:346
static int transpose_dot_matrix(float *const restrict A, float *const restrict A_square, const size_t m, const size_t n)
Definition choleski.h:325
static int choleski_decompose_safe(const float *const restrict A, float *const restrict L, size_t n)
Definition choleski.h:116
static int solve_hermitian(const float *const restrict A, float *const restrict y, const size_t n, const int checks)
Definition choleski.h:263
static int pseudo_solve(float *const restrict A, float *const restrict y, const size_t m, const size_t n, const int checks)
Definition choleski.h:366
static int choleski_decompose_fast(const float *const restrict A, float *const restrict L, size_t n)
Definition choleski.h:90
static int triangular_ascent_safe(const float *const restrict L, const float *const restrict b, float *const restrict x, const size_t n)
Definition choleski.h:234
static int triangular_ascent_fast(const float *const restrict L, const float *const restrict b, float *const restrict x, const size_t n)
Definition choleski.h:214
static int triangular_descent_fast(const float *const restrict L, const float *const restrict y, float *const restrict b, const size_t n)
Definition choleski.h:164
static int triangular_descent_safe(const float *const restrict L, const float *const restrict y, float *const restrict b, const size_t n)
Definition choleski.h:184
#define A(y, x)
Definition colorspaces.c:186
const float i
Definition colorspaces_inline_conversions.h:669
const float L
Definition colorspaces_inline_conversions.h:724
const float b
Definition colorspaces_inline_conversions.h:1326
const float n
Definition colorspaces_inline_conversions.h:929
void dt_control_log(const char *msg,...)
Definition control.c:530
#define dt_free_align(ptr)
Definition darktable.h:405
static float * dt_alloc_align_float(size_t pixels)
Definition darktable.h:418
static __DT_CLONE_TARGETS__ void dt_simd_memcpy(const float *const __restrict__ in, float *const __restrict__ out, const size_t num_elem)
Definition imagebuf.h:68
static const float x
Definition iop_profile.h:239