Ansel 0.0
A darktable fork - bloat + design vision
Loading...
Searching...
No Matches
choleski.h
Go to the documentation of this file.
1/*
2 This file is part of darktable,
3 Copyright (C) 2019-2020 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17*/
18
19#include <math.h>
20#include <stdlib.h>
21#include <stdio.h>
22#include <string.h>
23#include <time.h>
24
25#include "common/darktable.h"
26#include "common/imagebuf.h"
28
29
30/* DOCUMENTATION
31 *
32 * Choleski decomposition is a fast way to solve linear systems of equations
33 * described by a positive definite hermitian (= square and symmetrical) matrix.
34 * It is a special case of LU decompositions enabling special optimizations.
35 * For matrices not matching this requirement,
36 * you need to use the Gauss-Jordan elimination in iop/gaussian_elemination.h,
37 * which is about twice as slow but more general.
38 *
39 * To solve A x = y, for x, with A a positive definite hermitian real matrix :
40 *
41 * 1) find L such that A = L x L' (Choleski decomposition)
42 * 2) Second step : solve L x b = y for b (triangular descent)
43 * 3) Third step : solve L' x x = b for x (triangular ascent)
44 *
45 * L is a lower diagonal matrix such that :
46 *
47 * [ l11 0 0 ]
48 * L = [ l12 l22 0 ] (if n = 3)
49 * [ l13 l23 l33 ]
50 *
51 * and L' is its transpose :
52 *
53 * [ l11 l12 l13 ]
54 * L' = [ 0 l22 l23 ] (if n = 3)
55 * [ 0 0 l33 ]
56 *
57 * We use the Cholesky-Banachiewicz algorithm for the decomposition because it
58 * operates row by row, which is more suitable with C memory layout.
59 *
60 * The return codes are in sync with /iop/gaussian_elimination.h, in order, maybe one day,
61 * to have a general linear solving lib that could for example iterate over methods until
62 * one succeed.
63 *
64 * We didn't bother to parallelize the code nor to make it use double floating point precision
65 * because it's already fast enough (2 to 45 ms for 16x16 matrix on Xeon) and used for properly
66 * conditioned matrices.
67 *
68 * Vectorization leads to slow-downs here since we access matrices row-wise and column-wise,
69 * in a non-contiguous fashion.
70 *
71 * References :
72 * "Analyse numérique pour ingénieurs", 4e edition, André Fortin,
73 * Presses Internationales de Polytechnique Montréal, 2011.
74 *
75 * Cholesky method,
76 * https://algowiki-project.org/en/Cholesky_method#The_.5Bmath.5DLL.5ET.5B.2Fmath.5D_decomposition
77 * https://en.wikipedia.org/wiki/Cholesky_decomposition
78 * https://rosettacode.org/wiki/Cholesky_decomposition#C
79 *
80 */
81
82
84static inline int choleski_decompose_fast(const float *const restrict A,
85 float *const restrict L, size_t n)
86{
87 // A is input nxn matrix, decompose it into L such that A = L x L'
88 // fast variant : we don't check values for negatives in sqrt,
89 // ensure you know the properties of your matrix.
90
91 if(A[0] <= 0.0f) return 0; // failure : non positive definite matrice
92
93 for(size_t i = 0; i < n; i++)
94 for(size_t j = 0; j < (i + 1); j++)
95 {
96 float sum = 0.0f;
97
98 for(size_t k = 0; k < j; k++)
99 sum += L[i * n + k] * L[j * n + k];
100
101 L[i * n + j] = (i == j) ?
102 sqrtf(A[i * n + i] - sum) :
103 (A[i * n + j] - sum) / L[j * n + j];
104 }
105
106 return 1; // success
107}
108
109
111static inline int choleski_decompose_safe(const float *const restrict A,
112 float *const restrict L, size_t n)
113{
114 // A is input nxn matrix, decompose it into L such that A = L x L'
115 // slow and safe variant : we check values for negatives in sqrt and divisions by 0.
116
117 if(A[0] <= 0.0f) return 0; // failure : non positive definite matrice
118
119 int valid = 1;
120
121 for(size_t i = 0; i < n; i++)
122 for(size_t j = 0; j < (i + 1); j++)
123 {
124 float sum = 0.0f;
125
126 for(size_t k = 0; k < j; k++)
127 sum += L[i * n + k] * L[j * n + k];
128
129 if(i == j)
130 {
131 const float temp = A[i * n + i] - sum;
132
133 if(temp < 0.0f)
134 {
135 valid = 0;
136 L[i * n + j] = NAN;
137 }
138 else
139 L[i * n + j] = sqrtf(A[i * n + i] - sum);
140 }
141 else
142 {
143 const float temp = L[j * n + j];
144
145 if(temp == 0.0f)
146 {
147 valid = 0;
148 L[i * n + j] = NAN;
149 }
150 else
151 L[i * n + j] = (A[i * n + j] - sum) / temp;
152 }
153 }
154
155 return valid; // success ?
156}
157
158
160static inline int triangular_descent_fast(const float *const restrict L,
161 const float *const restrict y, float *const restrict b,
162 const size_t n)
163{
164 // solve L x b = y for b
165 // use the lower triangular part of L from top to bottom
166
167 for(size_t i = 0; i < n; ++i)
168 {
169 float sum = y[i];
170 for(size_t j = 0; j < i; ++j)
171 sum -= L[i * n + j] * b[j];
172
173 b[i] = sum / L[i * n + i];
174 }
175
176 return 1; // success !
177}
178
179
181static inline int triangular_descent_safe(const float *const restrict L,
182 const float *const restrict y, float *const restrict b,
183 const size_t n)
184{
185 // solve L x b = y for b
186 // use the lower triangular part of L from top to bottom
187
188 int valid = 1;
189
190 for(size_t i = 0; i < n; ++i)
191 {
192 float sum = y[i];
193 for(size_t j = 0; j < i; ++j)
194 sum -= L[i * n + j] * b[j];
195
196 const float temp = L[i * n + i];
197
198 if(temp != 0.0f)
199 b[i] = sum / temp;
200 else
201 {
202 b[i] = NAN;
203 valid = 0;
204 }
205 }
206
207 return valid; // success ?
208}
209
210
212static inline int triangular_ascent_fast(const float *const restrict L,
213 const float *const restrict b, float *const restrict x,
214 const size_t n)
215{
216 // solve L' x x = b for x
217 // use the lower triangular part of L transposed from bottom to top
218
219 for(int i = (n - 1); i > -1 ; --i)
220 {
221 float sum = b[i];
222 for(int j = (n - 1); j > i; --j)
223 sum -= L[j * n + i] * x[j];
224
225 x[i] = sum / L[i * n + i];
226 }
227
228 return 1; // success !
229}
230
231
233static inline int triangular_ascent_safe(const float *const restrict L,
234 const float *const restrict b, float *const restrict x,
235 const size_t n)
236{
237 // solve L' x x = b for x
238 // use the lower triangular part of L transposed from bottom to top
239
240 int valid = 1;
241
242 for(int i = (n - 1); i > -1 ; --i)
243 {
244 float sum = b[i];
245 for(int j = (n - 1); j > i; --j)
246 sum -= L[j * n + i] * x[j];
247
248 const float temp = L[i * n + i];
249 if(temp != 0.0f)
250 x[i] = sum / temp;
251 else
252 {
253 x[i] = NAN;
254 valid = 0;
255 }
256 }
257
258 return valid; // success ?
259}
260
261
263static inline int solve_hermitian(const float *const restrict A,
264 float *const restrict y,
265 const size_t n, const int checks)
266{
267 // Solve A x = y where A an hermitian positive definite matrix n x n
268 // x and y are n vectors. Output the result in y
269
270 // A and y need to be 64-bits aligned, which is darktable's default memory alignment
271 // if you used DT_ALIGNED_ARRAY and dt_alloc_sse_ps(...) to declare arrays and pointers
272
273 // If you are sure about the properties of the matrix A (symmetrical square definite positive)
274 // because you built it yourself, set checks == FALSE to branch to the fast track that
275 // skips validity checks.
276
277 // If you are unsure about A, because it is user-set, set checks == TRUE to branch
278 // to the safe but slower path.
279
280 // clock_t start = clock();
281
282 int valid = 0;
283 float *const restrict x = dt_alloc_sse_ps(n);
284 float *const restrict L = dt_alloc_sse_ps(n * n);
285
286 if(!x || !L)
287 {
288 dt_control_log(_("Choleski decomposition failed to allocate memory, check your RAM settings"));
289 fprintf(stdout, "Choleski decomposition failed to allocate memory, check your RAM settings\n");
290 return 0;
291 }
292
293 // LU decomposition
294 valid = (checks) ? choleski_decompose_safe(A, L, n) :
296 if(!valid) fprintf(stdout, "Cholesky decomposition returned NaNs\n");
297
298 // Triangular descent
299 if(valid)
300 valid = (checks) ? triangular_descent_safe(L, y, x, n) :
301 triangular_descent_fast(L, y, x, n) ;
302 if(!valid) fprintf(stdout, "Cholesky LU triangular descent returned NaNs\n");
303
304 // Triangular ascent
305 if(valid)
306 valid = (checks) ? triangular_ascent_safe(L, x, y, n) :
307 triangular_ascent_fast(L, x, y, n);
308 if(!valid) fprintf(stdout, "Cholesky LU triangular ascent returned NaNs\n");
309
310 dt_free_align(x);
311 dt_free_align(L);
312
313 //clock_t end = clock();
314 //fprintf(stdout, "hermitian matrix solving took : %f s\n", ((float) (end - start)) / CLOCKS_PER_SEC);
315
316 return valid;
317}
318
319
321static inline int transpose_dot_matrix(float *const restrict A, // input
322 float *const restrict A_square, // output
323 const size_t m, const size_t n)
324{
325 // Construct the square symmetrical definite positive matrix A' A,
326 // BUT only compute the lower triangle part for performance
327
328 for(size_t i = 0; i < n; ++i)
329 for(size_t j = 0; j < (i + 1); ++j)
330 {
331 float sum = 0.0f;
332 for(size_t k = 0; k < m; ++k)
333 sum += A[k * n + i] * A[k * n + j];
334
335 A_square[i * n + j] = sum;
336 }
337
338 return 1;
339}
340
341
343static inline int transpose_dot_vector(float *const restrict A, // input
344 float *const restrict y, // input
345 float *const restrict y_square, // output
346 const size_t m, const size_t n)
347{
348 // Construct the vector A' y
349
350 for(size_t i = 0; i < n; ++i)
351 {
352 float sum = 0.0f;
353 for(size_t k = 0; k < m; ++k)
354 sum += A[k * n + i] * y[k];
355
356 y_square[i] = sum;
357 }
358
359 return 1;
360}
361
362
364static inline int pseudo_solve(float *const restrict A,
365 float *const restrict y,
366 const size_t m, const size_t n, const int checks)
367{
368 // Solve the linear problem A x = y with the over-constrained rectanguler matrice A
369 // of dimension m x n (m >= n) by the least squares method
370
371 //clock_t start = clock();
372
373 int valid = 1;
374 if(m < n)
375 {
376 valid = 0;
377 fprintf(stdout, "Pseudo solve: cannot cast %zu \303\227 %zu matrice\n", m, n);
378 return valid;
379 }
380
381 float *const restrict A_square = dt_alloc_sse_ps(n * n);
382 float *const restrict y_square = dt_alloc_sse_ps(n);
383
384 if(!A_square || !y_square)
385 {
386 dt_control_log(_("Choleski decomposition failed to allocate memory, check your RAM settings"));
387 return 0;
388 }
389
390 #ifdef _OPENMP
391 #pragma omp parallel sections
392 #endif
393 {
394 #ifdef _OPENMP
395 #pragma omp section
396 #endif
397 {
398 // Prepare the least squares matrix = A' A
399 valid = transpose_dot_matrix(A, A_square, m, n);
400 }
401
402 #ifdef _OPENMP
403 #pragma omp section
404 #endif
405 {
406 // Prepare the y square vector = A' y
407 valid = transpose_dot_vector(A, y, y_square, m, n);
408 }
409 }
410
411
412 // Solve A' A x = A' y for x
413 valid = solve_hermitian(A_square, y_square, n, checks);
414 dt_simd_memcpy(y_square, y, n);
415
416 dt_free_align(y_square);
417 dt_free_align(A_square);
418
419 //clock_t end = clock();
420 //fprintf(stdout, "hermitian matrix solving took : %f s\n", ((float) (end - start)) / CLOCKS_PER_SEC);
421
422 return valid;
423}
424
425
426// clang-format off
427// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
428// vim: shiftwidth=2 expandtab tabstop=2 cindent
429// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
430// clang-format on
#define m
Definition basecurve.c:231
static __DT_CLONE_TARGETS__ int choleski_decompose_fast(const float *const restrict A, float *const restrict L, size_t n)
Definition choleski.h:84
static __DT_CLONE_TARGETS__ int triangular_descent_safe(const float *const restrict L, const float *const restrict y, float *const restrict b, const size_t n)
Definition choleski.h:181
static __DT_CLONE_TARGETS__ int transpose_dot_vector(float *const restrict A, float *const restrict y, float *const restrict y_square, const size_t m, const size_t n)
Definition choleski.h:343
static __DT_CLONE_TARGETS__ int triangular_descent_fast(const float *const restrict L, const float *const restrict y, float *const restrict b, const size_t n)
Definition choleski.h:160
static __DT_CLONE_TARGETS__ int solve_hermitian(const float *const restrict A, float *const restrict y, const size_t n, const int checks)
Definition choleski.h:263
static __DT_CLONE_TARGETS__ int triangular_ascent_fast(const float *const restrict L, const float *const restrict b, float *const restrict x, const size_t n)
Definition choleski.h:212
static __DT_CLONE_TARGETS__ int choleski_decompose_safe(const float *const restrict A, float *const restrict L, size_t n)
Definition choleski.h:111
static __DT_CLONE_TARGETS__ int transpose_dot_matrix(float *const restrict A, float *const restrict A_square, const size_t m, const size_t n)
Definition choleski.h:321
static __DT_CLONE_TARGETS__ int triangular_ascent_safe(const float *const restrict L, const float *const restrict b, float *const restrict x, const size_t n)
Definition choleski.h:233
static __DT_CLONE_TARGETS__ int pseudo_solve(float *const restrict A, float *const restrict y, const size_t m, const size_t n, const int checks)
Definition choleski.h:364
#define A(y, x)
Definition colorspaces.c:148
void dt_control_log(const char *msg,...)
Definition control.c:424
#define __DT_CLONE_TARGETS__
Definition darktable.h:249
#define dt_free_align(A)
Definition darktable.h:334
static void * dt_alloc_sse_ps(size_t pixels)
Definition darktable.h:356
static __DT_CLONE_TARGETS__ void dt_simd_memcpy(const float *const __restrict__ in, float *const __restrict__ out, const size_t num_elem)
Definition imagebuf.h:64