49#define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
55#define CL_ALIGNMENT ((piece->dsc_in.filters != 9u) ? 4 : 1)
63static unsigned _gcd(
unsigned a,
unsigned b)
76static unsigned _lcm(
unsigned a,
unsigned b)
78 return (((
unsigned long)a * b) /
_gcd(a, b));
82static inline int _min(
int a,
int b)
87static inline int _max(
int a,
int b)
95 return n + a - (
n % a);
103 const int off =
n % a;
104 const int shift = (off > a/2) ? a - off : -off;
119 fprintf(stderr,
" {%5d %5d ->%5d %5d (%5dx%5d) %.6f } %s\n",
126_nm_constraints(
double x[],
int n)
133 if(
x[0] > 1.0)
x[0] = 1.0 -
x[0];
134 if(
x[1] > 1.0)
x[1] = 1.0 -
x[1];
135 if(
x[2] > 1.0)
x[2] = 1.0 -
x[2];
136 if(
x[3] > 1.0)
x[3] = 1.0 -
x[3];
157 self->modify_roi_in(self, pipe, &piece_copy, &oroi_test, &iroi_probe);
159 double fitness = 0.0;
161 fitness += (
double)(iroi_probe.
x - iroi->
x) * (iroi_probe.
x - iroi->
x);
162 fitness += (
double)(iroi_probe.
y - iroi->
y) * (iroi_probe.
y - iroi->
y);
208static int _simplex(
double (*objfunc)(
double[],
void *[]),
double start[],
int n,
double EPSILON,
209 double scale,
int maxiter,
void (*constrain)(
double[],
int n),
void *rest[])
216 int i, j = 0,
m,
row;
230 double fsum, favg, s, cent;
235 v = (
double **)malloc(
sizeof(
double *) * (
n + 1));
236 f = (
double *)malloc(
sizeof(
double) * (
n + 1));
237 vr = (
double *)malloc(
sizeof(
double) *
n);
238 ve = (
double *)malloc(
sizeof(
double) *
n);
239 vc = (
double *)malloc(
sizeof(
double) *
n);
240 vm = (
double *)malloc(
sizeof(
double) *
n);
243 for(
i = 0;
i <=
n;
i++)
245 v[
i] = (
double *)malloc(
sizeof(
double) *
n);
251 pn = scale * (sqrt(
n + 1) - 1 +
n) / (
n * sqrt(2));
252 qn = scale * (sqrt(
n + 1) - 1) / (
n * sqrt(2));
254 for(
i = 0;
i <
n;
i++)
259 for(
i = 1;
i <=
n;
i++)
261 for(j = 0; j <
n; j++)
265 v[
i][j] = pn + start[j];
269 v[
i][j] = qn + start[j];
279 for(j = 0; j <=
n; j++)
281 f[j] = objfunc(
v[j], rest);
286 printf (
"Initial Values\n");
287 for (j = 0; j <=
n; j++)
289 for (
i = 0;
i <
n;
i++)
291 printf (
"%f %f\n",
v[j][
i],
f[j]);
297 for(itr = 1; itr <= maxiter; itr++)
301 for(j = 0; j <=
n; j++)
311 for(j = 0; j <=
n; j++)
321 for(j = 0; j <=
n; j++)
323 if(
f[j] >
f[vh] &&
f[j] <
f[vg])
330 for(j = 0; j <=
n - 1; j++)
333 for(
m = 0;
m <=
n;
m++)
344 for(j = 0; j <=
n - 1; j++)
347 vr[j] = vm[j] +
ALPHA * (vm[j] -
v[vg][j]);
353 fr = objfunc(vr, rest);
355 if(fr <
f[vh] && fr >=
f[vs])
357 for(j = 0; j <=
n - 1; j++)
367 for(j = 0; j <=
n - 1; j++)
370 ve[j] = vm[j] +
GAMMA * (vr[j] - vm[j]);
376 fe = objfunc(ve, rest);
384 for(j = 0; j <=
n - 1; j++)
392 for(j = 0; j <=
n - 1; j++)
403 if(fr <
f[vg] && fr >=
f[vh])
406 for(j = 0; j <=
n - 1; j++)
409 vc[j] = vm[j] +
BETA * (vr[j] - vm[j]);
415 fc = objfunc(vc, rest);
420 for(j = 0; j <=
n - 1; j++)
423 vc[j] = vm[j] -
BETA * (vm[j] -
v[vg][j]);
429 fc = objfunc(vc, rest);
435 for(j = 0; j <=
n - 1; j++)
452 for(j = 0; j <=
n - 1; j++)
454 v[
row][j] =
v[vs][j] + (
v[
row][j] -
v[vs][j]) / 2.0;
462 f[vg] = objfunc(
v[vg], rest);
467 f[vh] = objfunc(
v[vh], rest);
473 printf (
"Iteration %d\n", itr);
474 for (j = 0; j <=
n; j++)
476 for (
i = 0;
i <
n;
i++)
478 printf (
"%f %f\n",
v[j][
i],
f[j]);
485 for(j = 0; j <=
n; j++)
489 favg = fsum / (
n + 1);
491 for(j = 0; j <=
n; j++)
493 s += pow((
f[j] - favg), 2.0) / (
n);
502 for(j = 0; j <=
n; j++)
511 printf (
"The minimum was found at\n");
512 for (j = 0; j <
n; j++)
514 printf (
"%e\n",
v[vs][j]);
517 double min = objfunc (
v[vs], rest);
518 printf (
"Function value at minimum %f\n",
min);
520 printf (
"%d Function Evaluations\n",
k);
521 printf (
"%d Iterations through program\n", itr);
529 for(
i = 0;
i <=
n;
i++)
542 void *rest[5] = { (
void *)self, (
void *)piece, (
void *)iroi, (
void *)oroi, (
void *)pipe };
543 double start[4] = { (float)oroi->
x / piece->
iwidth, (
float)oroi->
y / piece->
iheight,
552 oroi->
x = start[0] * piece->
iwidth;
553 oroi->
y = start[1] * piece->
iheight;
557 return (iter <= maxiter);
575 self->modify_roi_in(self, pipe, &piece_copy, oroi, &iroi_probe);
576 while((abs((
int)iroi_probe.
x - (
int)iroi->
x) >
delta || abs((
int)iroi_probe.
y - (
int)iroi->
y) >
delta
583 oroi->
x += (iroi->
x - iroi_probe.
x) * oroi->
scale / iroi->
scale;
584 oroi->
y += (iroi->
y - iroi_probe.
y) * oroi->
scale / iroi->
scale;
591 self->modify_roi_in(self, pipe, &piece_copy, oroi, &iroi_probe);
595 if(iter > 0)
return TRUE;
611 const void *
const ivoid,
void *
const ovoid,
618 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_ptp] **** tiling module '%s' for image with size %dx%d --> %dx%d\n",
622 const int ipitch = roi_in->
width * in_bpp;
623 const int opitch = roi_out->
width * out_bpp;
624 const int max_bpp =
_max(in_bpp, out_bpp);
628 self->tiling_callback(self, pipe, piece, &
tiling);
636 "memory saving to be expected\n", self->
op);
642 assert(available >= 500.0f * 1024.0f * 1024.0f);
644 available = fmaxf(available - ((
float)roi_out->
width * roi_out->
height * out_bpp)
653 const float maxbuf = fmaxf(
tiling.maxbuf, 1.0f);
654 const float singlebuffer = available /
factor;
660 if((
float)
width *
height * max_bpp * maxbuf > singlebuffer)
662 const float scale = singlebuffer / ((float)
width *
height * max_bpp * maxbuf);
665 if(width < height && scale >= 0.333f)
669 else if(height <= width && scale >= 0.333f)
699 assert(xyalign != 0);
706 const int overlap =
tiling.overlap % xyalign != 0 ? (
tiling.overlap / xyalign + 1) * xyalign
710 const int tile_wd =
width - 2 * overlap > 0 ?
width - 2 * overlap : 1;
711 const int tile_ht =
height - 2 * overlap > 0 ?
height - 2 * overlap : 1;
714 const int tiles_x =
width < roi_in->
width ? ceilf(roi_in->
width / (
float)tile_wd) : 1;
715 const int tiles_y =
height < roi_in->
height ? ceilf(roi_in->
height / (
float)tile_ht) : 1;
720 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
721 self->
op, tiles_x, tiles_y);
725 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_ptp] (%dx%d) tiles with max dimensions %dx%d and overlap %d\n",
749 for(
size_t tx = 0; tx < tiles_x; tx++)
752 for(
size_t ty = 0; ty < tiles_y; ty++)
759 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0))
continue;
762 size_t origin[] = { 0, 0, 0 };
763 size_t region[] = { wd, ht, 1 };
766 dt_iop_roi_t iroi = { roi_in->
x + tx * tile_wd, roi_in->
y + ty * tile_ht, wd, ht, roi_in->
scale };
767 dt_iop_roi_t oroi = { roi_out->
x + tx * tile_wd, roi_out->
y + ty * tile_ht, wd, ht, roi_out->
scale };
770 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
771 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
773 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_ptp] tile (%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
") with %" G_GSIZE_FORMAT
"x%" G_GSIZE_FORMAT
" at origin [%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
"]\n",
774 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
778 for(
size_t j = 0; j < ht; j++)
779 memcpy((
char *)input + j * wd * in_bpp, (
char *)ivoid + ioffs + j * ipitch, (
size_t)wd * in_bpp);
785 int err = self->process(self, pipe, &piece_tile, input, output);
798 origin[0] += overlap;
799 region[0] -= overlap;
800 ooffs += (size_t)overlap * out_bpp;
804 origin[1] += overlap;
805 region[1] -= overlap;
806 ooffs += (size_t)overlap * opitch;
811 for(
size_t j = 0; j < region[1]; j++)
812 memcpy((
char *)
ovoid + ooffs + j * opitch,
813 (
char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (
size_t)region[0] * out_bpp);
823 dt_control_log(_(
"tiling failed for module '%s'. output might be garbled."), self->
op);
832 int err = self->process(self, pipe, piece, ivoid,
ovoid);
842 const void *
const ivoid,
void *
const ovoid,
850 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_roi] **** tiling module '%s' for image input size %dx%d --> %dx%d\n",
857 const int ipitch = roi_in->
width * in_bpp;
858 const int opitch = roi_out->
width * out_bpp;
859 const int max_bpp =
_max(in_bpp, out_bpp);
861 float fullscale = fmaxf(roi_in->
scale / roi_out->
scale, sqrtf(((
float)roi_in->
width * roi_in->
height)
865 const int delta = ceilf(fullscale);
872 self->tiling_callback(self, pipe, piece, &
tiling);
878 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_roi] no need to use tiling for module '%s' as no memory saving is expected\n",
885 assert(available >= 500.0f * 1024.0f * 1024.0f);
887 available = fmaxf(available - ((
float)roi_out->
width * roi_out->
height * out_bpp)
896 const float maxbuf = fmaxf(
tiling.maxbuf, 1.0f);
897 const float singlebuffer = available /
factor;
910 assert(xyalign != 0);
913 if((
float)
width *
height * max_bpp * maxbuf > singlebuffer)
915 const float scale = singlebuffer / ((float)
width *
height * max_bpp * maxbuf);
918 if(width < height && scale >= 0.333f)
922 else if(height <= width && scale >= 0.333f)
946 const int overlap_out = ceilf((
float)overlap_in / fullscale);
948 int tiles_x = 1, tiles_y = 1;
954 ? ceilf((
float)roi_in->
width / (
float)
_max(
width - 2 * overlap_in - inacc, 1))
962 ? ceilf((
float)roi_in->
height / (
float)
_max(
height - 2 * overlap_in - inacc, 1))
972 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
973 self->
op, tiles_x, tiles_y);
981 roi_out->
width % tiles_x == 0 ? roi_out->
width / tiles_x : roi_out->
width / tiles_x + 1, xyalign);
983 roi_out->
height % tiles_y == 0 ? roi_out->
height / tiles_y : roi_out->
height / tiles_y + 1, xyalign);
985 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_roi] (%dx%d) tiles with max dimensions %dx%d, good %dx%d, overlap %d->%d\n",
986 tiles_x, tiles_y,
width,
height, tile_wd, tile_ht, overlap_in, overlap_out);
989 for(
size_t tx = 0; tx < tiles_x; tx++)
990 for(
size_t ty = 0; ty < tiles_y; ty++)
995 const size_t wd = (tx + 1) * tile_wd > roi_out->
width ? (
size_t)roi_out->
width - tx * tile_wd : tile_wd;
996 const size_t ht = (ty + 1) * tile_ht > roi_out->
height ? (
size_t)roi_out->
height - ty * tile_ht : tile_ht;
1000 dt_iop_roi_t iroi_good = { roi_in->
x + tx * tile_wd, roi_in->
y + ty * tile_ht, wd, ht, roi_in->
scale };
1001 dt_iop_roi_t oroi_good = { roi_out->
x + tx * tile_wd, roi_out->
y + ty * tile_ht, wd, ht, roi_out->
scale };
1004 self->modify_roi_in(self, pipe, &piece_copy, &oroi_good, &iroi_good);
1007 iroi_good.
x =
_max(iroi_good.
x, roi_in->
x);
1008 iroi_good.
y =
_max(iroi_good.
y, roi_in->
y);
1019 const int x_in = iroi_good.
x;
1020 const int y_in = iroi_good.
y;
1021 const int width_in = iroi_good.
width;
1022 const int height_in = iroi_good.
height;
1025 const int new_width_in =
_min(
_align_up(width_in + overlap_in +
delta + (x_in - new_x_in), xyalign),
1026 roi_in->
width + roi_in->
x - new_x_in);
1027 const int new_height_in =
_min(
_align_up(height_in + overlap_in +
delta + (y_in - new_y_in), xyalign),
1028 roi_in->
height + roi_in->
y - new_y_in);
1032 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.
scale };
1035 _print_roi(&iroi_full,
"tile iroi_full before optimization");
1036 _print_roi(&oroi_full,
"tile oroi_full before optimization");
1042 "module '%s' not possible.\n",
1047 _print_roi(&iroi_full,
"tile iroi_full after optimization");
1048 _print_roi(&oroi_full,
"tile oroi_full after optimization");
1052 oroi_full.
x =
_min(oroi_full.
x, oroi_good.
x);
1053 oroi_full.
y =
_min(oroi_full.
y, oroi_good.
y);
1058 oroi_full.
x =
_max(oroi_full.
x, roi_out->
x);
1059 oroi_full.
y =
_max(oroi_full.
y, roi_out->
y);
1065 self->modify_roi_in(self, pipe, &piece_full, &oroi_full, &iroi_full);
1068 iroi_full.
x =
_max(iroi_full.
x, roi_in->
x);
1069 iroi_full.
y =
_max(iroi_full.
y, roi_in->
y);
1073 _print_roi(&iroi_full,
"tile iroi_full final");
1074 _print_roi(&oroi_full,
"tile oroi_full final");
1077 const size_t ioffs = ((size_t)iroi_full.
y - roi_in->
y) * ipitch + ((size_t)iroi_full.
x - roi_in->
x) * in_bpp;
1078 size_t ooffs = ((size_t)oroi_good.
y - roi_out->
y) * opitch + ((size_t)oroi_good.
x - roi_out->
x) * out_bpp;
1080 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_roi] process tile (%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
") size %dx%d at origin [%d,%d]\n",
1081 tx, ty, iroi_full.
width, iroi_full.
height, iroi_full.
x, iroi_full.
y);
1085 (
size_t)iroi_full.
width * iroi_full.
height * in_bpp,
1094 (
size_t)oroi_full.
width * oroi_full.
height * out_bpp,
1103 for(
size_t j = 0; j < iroi_full.
height; j++)
1104 memcpy((
char *)input + j * iroi_full.
width * in_bpp, (
char *)ivoid + ioffs + j * ipitch,
1105 (
size_t)iroi_full.
width * in_bpp);
1109 piece_tile.
roi_in = iroi_full;
1110 piece_tile.
roi_out = oroi_full;
1111 int err = self->process(self, pipe, &piece_tile, input, output);
1116 mutable_pipe->
tiling = 0;
1121 const int origin_x = oroi_good.
x - oroi_full.
x;
1122 const int origin_y = oroi_good.
y - oroi_full.
y;
1124 for(
size_t j = 0; j < oroi_good.
height; j++)
1125 memcpy((
char *)
ovoid + ooffs + j * opitch,
1126 (
char *)output + ((j + origin_y) * oroi_full.
width + origin_x) * out_bpp,
1127 (
size_t)oroi_good.
width * out_bpp);
1131 input = output = NULL;
1136 mutable_pipe->
tiling = 0;
1140 dt_control_log(_(
"tiling failed for module '%s'. output might be garbled."), self->
op);
1146 mutable_pipe->
tiling = 0;
1149 int err = self->process(self, pipe, piece, ivoid,
ovoid);
1162 const void *
const ivoid,
void *
const ovoid,
const int in_bpp)
1178 const void *
const ivoid,
void *
const ovoid,
1184 cl_mem input = NULL;
1185 cl_mem output = NULL;
1187 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_cl_ptp] **** tiling module '%s' for image with size %dx%d --> %dx%d\n",
1193 const int ipitch = roi_in->
width * in_bpp;
1194 const int opitch = roi_out->
width * out_bpp;
1195 const int max_bpp =
_max(in_bpp, out_bpp);
1199 self->tiling_callback(self, pipe, piece, &
tiling);
1204 const float singlebuffer = fminf(fmaxf((available -
tiling.overhead) /
factor, 0.0f),
1206 const float maxbuf = fmaxf(
tiling.maxbuf_cl, 1.0f);
1211 if((
float)
width *
height * max_bpp * maxbuf > singlebuffer)
1213 const float scale = singlebuffer / ((float)
width *
height * max_bpp * maxbuf);
1215 if(width < height && scale >= 0.333f)
1219 else if(height <= width && scale >= 0.333f)
1253 const unsigned int halign = xyalign;
1255 assert(xyalign != 0 && walign != 0 && halign != 0);
1269 if(
width <= (
int)walign &&
height <= (int)halign)
break;
1270 if(width < height && height > (
int)halign)
1272 else if(
width > (
int)walign)
1279 const int overlap =
tiling.overlap % xyalign != 0 ? (
tiling.overlap / xyalign + 1) * xyalign
1284 const int tile_wd =
width - 2 * overlap > 0 ?
width - 2 * overlap : 1;
1285 const int tile_ht =
height - 2 * overlap > 0 ?
height - 2 * overlap : 1;
1289 const int tiles_x =
width < roi_in->
width ? ceilf(roi_in->
width / (
float)tile_wd) : 1;
1290 const int tiles_y =
height < roi_in->
height ? ceilf(roi_in->
height / (
float)tile_ht) : 1;
1295 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1296 self->
op, tiles_x, tiles_y);
1300 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_cl_ptp] (%dx%d) tiles with max dimensions %dx%d, good %dx%d and overlap %d\n",
1301 tiles_x, tiles_y,
width,
height, tile_wd, tile_ht, overlap);
1304 for(
size_t tx = 0; tx < tiles_x; tx++)
1305 for(
size_t ty = 0; ty < tiles_y; ty++)
1307 mutable_pipe->
tiling = 1;
1313 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0))
continue;
1316 size_t origin[] = { 0, 0, 0 };
1317 size_t region[] = { wd, ht, 1 };
1320 dt_iop_roi_t iroi = { roi_in->
x + tx * tile_wd, roi_in->
y + ty * tile_ht, wd, ht, roi_in->
scale };
1321 dt_iop_roi_t oroi = { roi_out->
x + tx * tile_wd, roi_out->
y + ty * tile_ht, wd, ht, roi_out->
scale };
1325 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1326 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1329 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_cl_ptp] tile (%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
") size %" G_GSIZE_FORMAT
"x%" G_GSIZE_FORMAT
" at origin [%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
"]\n",
1330 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
1341 if(err != CL_SUCCESS)
goto error;
1345 piece_tile.
roi_in = iroi;
1347 if(!self->process_cl(self, pipe, &piece_tile, input, output))
goto error;
1353 origin[0] += overlap;
1354 region[0] -= overlap;
1355 ooffs += (size_t)overlap * out_bpp;
1359 origin[1] += overlap;
1360 region[1] -= overlap;
1361 ooffs += (size_t)overlap * opitch;
1367 if(err != CL_SUCCESS)
goto error;
1381 mutable_pipe->
tiling = 0;
1387 mutable_pipe->
tiling = 0;
1389 "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %i\n",
1399 const void *
const ivoid,
void *
const ovoid,
1405 cl_mem input = NULL;
1406 cl_mem output = NULL;
1409 "[default_process_tiling_cl_roi] **** tiling module '%s' for image with input size %dx%d --> %dx%d\n",
1417 const int ipitch = roi_in->
width * in_bpp;
1418 const int opitch = roi_out->
width * out_bpp;
1419 const int max_bpp =
_max(in_bpp, out_bpp);
1421 const float fullscale = fmaxf(roi_in->
scale / roi_out->
scale, sqrtf(((
float)roi_in->
width * roi_in->
height)
1425 const int delta = ceilf(fullscale);
1432 self->tiling_callback(self, pipe, piece, &
tiling);
1437 const float singlebuffer = fminf(fmaxf((available -
tiling.overhead) /
factor, 0.0f),
1439 const float maxbuf = fmaxf(
tiling.maxbuf_cl, 1.0f);
1453 assert(xyalign != 0);
1456 if((
float)
width *
height * max_bpp * maxbuf > singlebuffer)
1458 const float scale = singlebuffer / ((float)
width *
height * max_bpp * maxbuf);
1460 if(width < height && scale >= 0.333f)
1464 else if(height <= width && scale >= 0.333f)
1488 const int overlap_out = ceilf((
float)overlap_in / fullscale);
1496 if(
width <= (
int)xyalign &&
height <= (
int)xyalign)
break;
1497 if(width < height && height > (
int)xyalign)
1499 else if(
width > (
int)xyalign)
1505 int tiles_x = 1, tiles_y = 1;
1511 ? ceilf((
float)roi_in->
width / (
float)
_max(
width - 2 * overlap_in - inacc, 1))
1519 ? ceilf((
float)roi_in->
height / (
float)
_max(
height - 2 * overlap_in - inacc, 1))
1530 "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %dx%d\n",
1531 self->
op, tiles_x, tiles_y);
1538 roi_out->
width % tiles_x == 0 ? roi_out->
width / tiles_x : roi_out->
width / tiles_x + 1, xyalign);
1540 roi_out->
height % tiles_y == 0 ? roi_out->
height / tiles_y : roi_out->
height / tiles_y + 1, xyalign);
1543 "[default_process_tiling_cl_roi] (%dx%d) tiles with max input dimensions %dx%d, good %ix%i\n",
1544 tiles_x, tiles_y,
width,
height, tile_wd, tile_ht);
1547 for(
size_t tx = 0; tx < tiles_x; tx++)
1548 for(
size_t ty = 0; ty < tiles_y; ty++)
1550 mutable_pipe->
tiling = 1;
1553 const size_t wd = (tx + 1) * tile_wd > roi_out->
width ? (
size_t)roi_out->
width - tx * tile_wd : tile_wd;
1554 const size_t ht = (ty + 1) * tile_ht > roi_out->
height ? (
size_t)roi_out->
height - ty * tile_ht : tile_ht;
1558 dt_iop_roi_t iroi_good = { roi_in->
x + tx * tile_wd, roi_in->
y + ty * tile_ht, wd, ht, roi_in->
scale };
1559 dt_iop_roi_t oroi_good = { roi_out->
x + tx * tile_wd, roi_out->
y + ty * tile_ht, wd, ht, roi_out->
scale };
1562 self->modify_roi_in(self, pipe, &piece_copy, &oroi_good, &iroi_good);
1565 iroi_good.
x =
_max(iroi_good.
x, roi_in->
x);
1566 iroi_good.
y =
_max(iroi_good.
y, roi_in->
y);
1577 const int x_in = iroi_good.
x;
1578 const int y_in = iroi_good.
y;
1579 const int width_in = iroi_good.
width;
1580 const int height_in = iroi_good.
height;
1583 const int new_width_in =
_min(
_align_up(width_in + overlap_in +
delta + (x_in - new_x_in), xyalign),
1584 roi_in->
width + roi_in->
x - new_x_in);
1585 const int new_height_in =
_min(
_align_up(height_in + overlap_in +
delta + (y_in - new_y_in), xyalign),
1586 roi_in->
height + roi_in->
y - new_y_in);
1590 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.
scale };
1593 _print_roi(&iroi_full,
"tile iroi_full before optimization");
1594 _print_roi(&oroi_full,
"tile oroi_full before optimization");
1600 "for module '%s' not possible.\n",
1608 oroi_full.
x =
_min(oroi_full.
x, oroi_good.
x);
1609 oroi_full.
y =
_min(oroi_full.
y, oroi_good.
y);
1614 oroi_full.
x =
_max(oroi_full.
x, roi_out->
x);
1615 oroi_full.
y =
_max(oroi_full.
y, roi_out->
y);
1622 self->modify_roi_in(self, pipe, &piece_full, &oroi_full, &iroi_full);
1625 iroi_full.
x =
_max(iroi_full.
x, roi_in->
x);
1626 iroi_full.
y =
_max(iroi_full.
y, roi_in->
y);
1634 const int in_dx = iroi_full.
x - roi_in->
x;
1635 const int in_dy = iroi_full.
y - roi_in->
y;
1636 const int out_dx = oroi_good.
x - roi_out->
x;
1637 const int out_dy = oroi_good.
y - roi_out->
y;
1638 const size_t ioffs = (size_t)(in_dy * ipitch) + (size_t)(in_dx * in_bpp);
1639 const size_t ooffs = (size_t)(out_dy * opitch) + (size_t)(out_dx * out_bpp);
1642 size_t iorigin[] = { 0, 0, 0 };
1643 size_t iregion[] = { iroi_full.
width, iroi_full.
height, 1 };
1646 size_t oorigin[] = { oroi_good.
x - oroi_full.
x, oroi_good.
y - oroi_full.
y, 0 };
1647 size_t oregion[] = { oroi_good.
width, oroi_good.
height, 1 };
1649 dt_print(
DT_DEBUG_TILING,
"[default_process_tiling_cl_roi] process tile (%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
") size %dx%d at origin [%d,%d]\n",
1650 tx, ty, iroi_full.
width, iroi_full.
height, iroi_full.
x, iroi_full.
y);
1651 dt_vprint(
DT_DEBUG_TILING,
"[default_process_tiling_cl_roi] dest [%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
"] at [%" G_GSIZE_FORMAT
",%" G_GSIZE_FORMAT
"], offsets [%i,%i] -> [%i,%i], delta=%i\n\n",
1652 oregion[0], oregion[1], oorigin[0], oorigin[1], in_dx, in_dy, out_dx, out_dy,
delta);
1664 if(err != CL_SUCCESS)
goto error;
1668 piece_tile.
roi_in = iroi_full;
1669 piece_tile.
roi_out = oroi_full;
1670 if(!self->process_cl(self, pipe, &piece_tile, input, output))
goto error;
1675 if(err != CL_SUCCESS)
goto error;
1689 mutable_pipe->
tiling = 0;
1695 mutable_pipe->
tiling = 0;
1697 "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %i\n",
1709 const void *
const ivoid,
void *
const ovoid,
const int in_bpp)
1722 const void *
const ivoid,
void *
const ovoid,
const int in_bpp)
1745 tiling->factor = 1.0f + ioratio;
1779 const float factor,
const size_t overhead)
1786 while(!
error && available < total)
1792 if(total <= available)
static void error(char *msg)
const dt_aligned_pixel_t f
static const float const float const float min
typedef void((*dt_cache_allocate_t)(void *userdata, dt_cache_entry_t *entry))
void dt_control_log(const char *msg,...)
void dt_vprint(dt_debug_thread_t thread, const char *msg,...)
size_t dt_get_available_mem()
void dt_print(dt_debug_thread_t thread, const char *msg,...)
#define dt_pixelpipe_cache_alloc_align_cache(size, id)
#define dt_pixelpipe_cache_free_align(mem)
#define __OMP_PARALLEL_FOR__(...)
#define IS_NULL_PTR(p)
C is way too permissive with !=, == and if(var) checks, which can mean too many things depending on w...
@ IOP_FLAGS_TILING_FULL_ROI
int dt_ioppr_get_iop_order(GList *iop_order_list, const char *op_name, const int multi_priority)
Return the iop_order for a given operation/instance pair.
float *const restrict const size_t k
cl_ulong dt_opencl_get_device_available(const int devid)
void * dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp)
int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin, const size_t *region, const int rowpitch, const int blocking)
cl_ulong dt_opencl_get_device_memalloc(const int devid)
gboolean dt_opencl_finish(const int devid)
int dt_opencl_write_host_to_device_raw(const int devid, const void *host, void *device, const size_t *origin, const size_t *region, const int rowpitch, const int blocking)
void dt_opencl_release_mem_object(cl_mem mem)
int dt_dev_pixel_pipe_cache_remove_lru(dt_dev_pixelpipe_cache_t *cache)
struct dt_dev_pixelpipe_cache_t * pixelpipe_cache
struct dt_opencl_t * opencl
dt_iop_buffer_dsc_t dsc_out
dt_iop_buffer_dsc_t dsc_in
dt_dev_pixelpipe_type_t type
GModule *dt_dev_operation_t op
Region of interest passed through the pixelpipe.
typedef double((*spd)(unsigned long int wavelength, double TempK))
int default_process_tiling(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const int in_bpp)
static int _fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
static double _nm_fitness(double x[], void *rest[])
void default_tiling_callback(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, struct dt_develop_tiling_t *tiling)
static int _default_process_tiling_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
static int _align_up(int n, int a)
static int _default_process_tiling_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
static void _print_roi(const dt_iop_roi_t *roi, const char *label)
int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp, const float factor, const size_t overhead)
static int _max(int a, int b)
static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
static int _simplex(double(*objfunc)(double[], void *[]), double start[], int n, double EPSILON, double scale, int maxiter, void(*constrain)(double[], int n), void *rest[])
static int _align_close(int n, int a)
static unsigned _lcm(unsigned a, unsigned b)
static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const int in_bpp)
static int _min(int a, int b)
int default_process_tiling_cl(struct dt_iop_module_t *self, const struct dt_dev_pixelpipe_t *pipe, const struct dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const int in_bpp)
static int _align_down(int n, int a)
static int _maximum_number_tiles()
static unsigned _gcd(unsigned a, unsigned b)