Skip to content

Commit 4832605

Browse files
committed
Reduce the number of worker jobs
Using fewer, larger jobs reduces the total scheduling overhead and also means less merging at the end. I also merged blur and threshold into a single task.
1 parent 442491d commit 4832605

File tree

3 files changed

+52
-104
lines changed

3 files changed

+52
-104
lines changed

apriltag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1184,7 +1184,7 @@ zarray_t *apriltag_detector_detect(apriltag_detector_t *td, image_u8_t *im_orig)
11841184
if (1) {
11851185
image_u8_t *im_samples = td->debug ? image_u8_copy(im_orig) : NULL;
11861186

1187-
int chunksize = 1 + zarray_size(quads) / (APRILTAG_TASKS_PER_THREAD_TARGET * td->nthreads);
1187+
int chunksize = 1 + zarray_size(quads) / td->nthreads;
11881188

11891189
struct quad_decode_task *tasks = malloc(sizeof(struct quad_decode_task)*(zarray_size(quads) / chunksize + 1));
11901190

apriltag.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ extern "C" {
4040
#include "common/timeprofile.h"
4141
#include "common/pthreads_cross.h"
4242

43-
#define APRILTAG_TASKS_PER_THREAD_TARGET 10
44-
4543
struct quad
4644
{
4745
float p[4][2]; // corners

apriltag_quad_thresh.c

Lines changed: 51 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -108,31 +108,22 @@ struct cluster_task
108108
};
109109

110110
struct minmax_task {
111-
int ty;
111+
int ty0, ty1;
112112

113113
image_u8_t *im;
114114
uint8_t *im_max;
115115
uint8_t *im_min;
116116
};
117117

118-
struct blur_task {
119-
int ty;
120-
121-
image_u8_t *im;
122-
uint8_t *im_max;
123-
uint8_t *im_min;
124-
uint8_t *im_max_tmp;
125-
uint8_t *im_min_tmp;
126-
};
127-
128-
struct threshold_task {
129-
int ty;
118+
struct blur_threshold_task {
119+
int ty0, ty1;
130120

131121
apriltag_detector_t *td;
132122
image_u8_t *im;
133123
image_u8_t *threshim;
134124
uint8_t *im_max;
135125
uint8_t *im_min;
126+
int tw, th;
136127
};
137128

138129
struct remove_vertex
@@ -1104,10 +1095,10 @@ void do_minmax_task(void *p)
11041095
const int tilesz = 4;
11051096
struct minmax_task* task = (struct minmax_task*) p;
11061097
int s = task->im->stride;
1107-
int ty = task->ty;
11081098
int tw = task->im->width / tilesz;
11091099
image_u8_t *im = task->im;
11101100

1101+
for (int ty = task->ty0; ty < task->ty1; ty++)
11111102
for (int tx = 0; tx < tw; tx++) {
11121103
uint8_t max = 0, min = 255;
11131104

@@ -1128,65 +1119,42 @@ void do_minmax_task(void *p)
11281119
}
11291120
}
11301121

1131-
void do_blur_task(void *p)
1122+
1123+
void do_blur_threshold_task(void *p)
11321124
{
11331125
const int tilesz = 4;
1134-
struct blur_task* task = (struct blur_task*) p;
1135-
int ty = task->ty;
1136-
int tw = task->im->width / tilesz;
1137-
int th = task->im->height / tilesz;
1126+
struct blur_threshold_task* task = (struct blur_threshold_task*) p;
1127+
int tw = task->tw;
1128+
int th = task->th;
1129+
int s = task->im->stride;
11381130
uint8_t *im_max = task->im_max;
11391131
uint8_t *im_min = task->im_min;
1132+
image_u8_t *im = task->im;
1133+
image_u8_t *threshim = task->threshim;
1134+
int min_white_black_diff = task->td->qtp.min_white_black_diff;
11401135

1136+
for (int ty = task->ty0; ty < task->ty1; ty++)
11411137
for (int tx = 0; tx < tw; tx++) {
11421138
uint8_t max = 0, min = 255;
1143-
11441139
for (int dy = -1; dy <= 1; dy++) {
11451140
if (ty+dy < 0 || ty+dy >= th)
11461141
continue;
11471142
for (int dx = -1; dx <= 1; dx++) {
11481143
if (tx+dx < 0 || tx+dx >= tw)
11491144
continue;
1150-
11511145
uint8_t m = im_max[(ty+dy)*tw+tx+dx];
1152-
if (m > max)
1153-
max = m;
1146+
if (m > max) max = m;
11541147
m = im_min[(ty+dy)*tw+tx+dx];
1155-
if (m < min)
1156-
min = m;
1148+
if (m < min) min = m;
11571149
}
11581150
}
11591151

1160-
task->im_max_tmp[ty*tw + tx] = max;
1161-
task->im_min_tmp[ty*tw + tx] = min;
1162-
}
1163-
}
1164-
1165-
void do_threshold_task(void *p)
1166-
{
1167-
const int tilesz = 4;
1168-
struct threshold_task* task = (struct threshold_task*) p;
1169-
int ty = task->ty;
1170-
int tw = task->im->width / tilesz;
1171-
int s = task->im->stride;
1172-
uint8_t *im_max = task->im_max;
1173-
uint8_t *im_min = task->im_min;
1174-
image_u8_t *im = task->im;
1175-
image_u8_t *threshim = task->threshim;
1176-
int min_white_black_diff = task->td->qtp.min_white_black_diff;
1177-
1178-
for (int tx = 0; tx < tw; tx++) {
1179-
int min = im_min[ty*tw + tx];
1180-
int max = im_max[ty*tw + tx];
1181-
11821152
// low contrast region? (no edges)
11831153
if (max - min < min_white_black_diff) {
11841154
for (int dy = 0; dy < tilesz; dy++) {
11851155
int y = ty*tilesz + dy;
1186-
11871156
for (int dx = 0; dx < tilesz; dx++) {
11881157
int x = tx*tilesz + dx;
1189-
11901158
threshim->buf[y*s+x] = 127;
11911159
}
11921160
}
@@ -1198,23 +1166,17 @@ void do_threshold_task(void *p)
11981166
// argument for biasing towards dark; specular highlights
11991167
// can be substantially brighter than white tag parts
12001168
uint8_t thresh = min + (max - min) / 2;
1201-
12021169
for (int dy = 0; dy < tilesz; dy++) {
12031170
int y = ty*tilesz + dy;
1204-
12051171
for (int dx = 0; dx < tilesz; dx++) {
12061172
int x = tx*tilesz + dx;
1207-
12081173
uint8_t v = im->buf[y*s+x];
1209-
if (v > thresh)
1210-
threshim->buf[y*s+x] = 255;
1211-
else
1212-
threshim->buf[y*s+x] = 0;
1174+
threshim->buf[y*s+x] = (v > thresh) ? 255 : 0;
12131175
}
12141176
}
12151177
}
12161178
}
1217-
1179+
12181180
image_u8_t *threshold(apriltag_detector_t *td, image_u8_t *im)
12191181
{
12201182
int w = im->width, h = im->height, s = im->stride;
@@ -1257,58 +1219,46 @@ image_u8_t *threshold(apriltag_detector_t *td, image_u8_t *im)
12571219
uint8_t *im_max = calloc(tw*th, sizeof(uint8_t));
12581220
uint8_t *im_min = calloc(tw*th, sizeof(uint8_t));
12591221

1260-
struct minmax_task *minmax_tasks = malloc(sizeof(struct minmax_task)*th);
1261-
// first, collect min/max statistics for each tile
1262-
for (int ty = 0; ty < th; ty++) {
1263-
minmax_tasks[ty].im = im;
1264-
minmax_tasks[ty].im_max = im_max;
1265-
minmax_tasks[ty].im_min = im_min;
1266-
minmax_tasks[ty].ty = ty;
1222+
int ntasks_target = td->nthreads;
1223+
int tile_chunk = (th + ntasks_target - 1) / ntasks_target;
12671224

1268-
workerpool_add_task(td->wp, do_minmax_task, &minmax_tasks[ty]);
1225+
// first, collect min/max statistics for each tile
1226+
struct minmax_task *minmax_tasks = malloc(sizeof(struct minmax_task)*ntasks_target);
1227+
int mm_ntasks = 0;
1228+
for (int ty = 0; ty < th; ty += tile_chunk) {
1229+
minmax_tasks[mm_ntasks].im = im;
1230+
minmax_tasks[mm_ntasks].im_max = im_max;
1231+
minmax_tasks[mm_ntasks].im_min = im_min;
1232+
minmax_tasks[mm_ntasks].ty0 = ty;
1233+
minmax_tasks[mm_ntasks].ty1 = (ty + tile_chunk < th) ? ty + tile_chunk : th;
1234+
workerpool_add_task(td->wp, do_minmax_task, &minmax_tasks[mm_ntasks]);
1235+
mm_ntasks++;
12691236
}
12701237
workerpool_run(td->wp);
12711238
free(minmax_tasks);
12721239

12731240
// second, apply 3x3 max/min convolution to "blur" these values
12741241
// over larger areas. This reduces artifacts due to abrupt changes
12751242
// in the threshold value.
1276-
if (1) {
1277-
uint8_t *im_max_tmp = calloc(tw*th, sizeof(uint8_t));
1278-
uint8_t *im_min_tmp = calloc(tw*th, sizeof(uint8_t));
1279-
1280-
struct blur_task *blur_tasks = malloc(sizeof(struct blur_task)*th);
1281-
for (int ty = 0; ty < th; ty++) {
1282-
blur_tasks[ty].im = im;
1283-
blur_tasks[ty].im_max = im_max;
1284-
blur_tasks[ty].im_min = im_min;
1285-
blur_tasks[ty].im_max_tmp = im_max_tmp;
1286-
blur_tasks[ty].im_min_tmp = im_min_tmp;
1287-
blur_tasks[ty].ty = ty;
1288-
1289-
workerpool_add_task(td->wp, do_blur_task, &blur_tasks[ty]);
1243+
{
1244+
struct blur_threshold_task *bt_tasks = malloc(sizeof(struct blur_threshold_task)*ntasks_target);
1245+
int bt_ntasks = 0;
1246+
for (int ty = 0; ty < th; ty += tile_chunk) {
1247+
bt_tasks[bt_ntasks].im = im;
1248+
bt_tasks[bt_ntasks].threshim = threshim;
1249+
bt_tasks[bt_ntasks].im_max = im_max;
1250+
bt_tasks[bt_ntasks].im_min = im_min;
1251+
bt_tasks[bt_ntasks].ty0 = ty;
1252+
bt_tasks[bt_ntasks].ty1 = (ty + tile_chunk < th) ? ty + tile_chunk : th;
1253+
bt_tasks[bt_ntasks].td = td;
1254+
bt_tasks[bt_ntasks].tw = tw;
1255+
bt_tasks[bt_ntasks].th = th;
1256+
workerpool_add_task(td->wp, do_blur_threshold_task, &bt_tasks[bt_ntasks]);
1257+
bt_ntasks++;
12901258
}
12911259
workerpool_run(td->wp);
1292-
free(blur_tasks);
1293-
free(im_max);
1294-
free(im_min);
1295-
im_max = im_max_tmp;
1296-
im_min = im_min_tmp;
1297-
}
1298-
1299-
struct threshold_task *threshold_tasks = malloc(sizeof(struct threshold_task)*th);
1300-
for (int ty = 0; ty < th; ty++) {
1301-
threshold_tasks[ty].im = im;
1302-
threshold_tasks[ty].threshim = threshim;
1303-
threshold_tasks[ty].im_max = im_max;
1304-
threshold_tasks[ty].im_min = im_min;
1305-
threshold_tasks[ty].ty = ty;
1306-
threshold_tasks[ty].td = td;
1307-
1308-
workerpool_add_task(td->wp, do_threshold_task, &threshold_tasks[ty]);
1260+
free(bt_tasks);
13091261
}
1310-
workerpool_run(td->wp);
1311-
free(threshold_tasks);
13121262

13131263
// we skipped over the non-full-sized tiles above. Fix those now.
13141264
if (1) {
@@ -1522,7 +1472,7 @@ unionfind_t* connected_components(apriltag_detector_t *td, image_u8_t* threshim,
15221472
do_unionfind_first_line(uf, threshim, w, ts);
15231473

15241474
int sz = h;
1525-
int chunksize = 1 + sz / (APRILTAG_TASKS_PER_THREAD_TARGET * td->nthreads);
1475+
int chunksize = 1 + sz / (2 * td->nthreads);
15261476
struct unionfind_task *tasks = malloc(sizeof(struct unionfind_task)*(sz / chunksize + 1));
15271477

15281478
int ntasks = 0;
@@ -1754,7 +1704,7 @@ zarray_t* gradient_clusters(apriltag_detector_t *td, image_u8_t* threshim, int w
17541704
int nclustermap = 0.2*w*h;
17551705

17561706
int sz = h - 1;
1757-
int chunksize = 1 + sz / (APRILTAG_TASKS_PER_THREAD_TARGET * td->nthreads);
1707+
int chunksize = 1 + sz / (2 * td->nthreads);
17581708
struct cluster_task *tasks = malloc(sizeof(struct cluster_task)*(sz / chunksize + 1));
17591709

17601710
int ntasks = 0;
@@ -1833,7 +1783,7 @@ zarray_t* fit_quads(apriltag_detector_t *td, int w, int h, zarray_t* clusters, i
18331783
}
18341784

18351785
int sz = zarray_size(clusters);
1836-
int chunksize = 1 + sz / (APRILTAG_TASKS_PER_THREAD_TARGET * td->nthreads);
1786+
int chunksize = 1 + sz / (4 * td->nthreads);
18371787
struct quad_task *tasks = malloc(sizeof(struct quad_task)*(sz / chunksize + 1));
18381788

18391789
int ntasks = 0;

0 commit comments

Comments
 (0)