Skip to content

Commit 6a2348c

Browse files
rtjohnso and ajhconway authored
Robj/memtable race fix (#574)
* Memtable Generation Bugfix Fixes a bug where memtable_maybe_rotate_and_get_insert_lock would speculatively increment the memtable generation even when the next memtable was not yet ready. This would cause concurrent lookup threads to attempt to access that memtable, resulting in errors. This fix requires the insert threads to wait until the next memtable is ready before finalizing the current one. * abstract memtable and trunk root-addr locking apis --------- Co-authored-by: Alex Conway <aconway@vmware.com>
1 parent 8c639a0 commit 6a2348c

File tree

6 files changed

+180
-82
lines changed

6 files changed

+180
-82
lines changed

src/memtable.c

Lines changed: 75 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -43,20 +43,20 @@ memtable_process(memtable_context *ctxt, uint64 generation)
4343
ctxt->process(ctxt->process_ctxt, generation);
4444
}
4545

46-
void
47-
memtable_get_insert_lock(memtable_context *ctxt)
46+
static inline void
47+
memtable_begin_insert(memtable_context *ctxt)
4848
{
4949
platform_batch_rwlock_get(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
5050
}
5151

5252
void
53-
memtable_unget_insert_lock(memtable_context *ctxt)
53+
memtable_end_insert(memtable_context *ctxt)
5454
{
5555
platform_batch_rwlock_unget(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
5656
}
5757

58-
bool
59-
memtable_try_lock_insert_lock(memtable_context *ctxt)
58+
static inline bool
59+
memtable_try_begin_insert_rotation(memtable_context *ctxt)
6060
{
6161
if (!platform_batch_rwlock_try_claim(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX))
6262
{
@@ -66,87 +66,120 @@ memtable_try_lock_insert_lock(memtable_context *ctxt)
6666
return TRUE;
6767
}
6868

69-
void
70-
memtable_lock_insert_lock(memtable_context *ctxt)
69+
static inline void
70+
memtable_end_insert_rotation(memtable_context *ctxt)
71+
{
72+
platform_batch_rwlock_unlock(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
73+
platform_batch_rwlock_unclaim(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
74+
}
75+
76+
static inline void
77+
memtable_begin_raw_rotation(memtable_context *ctxt)
7178
{
72-
platform_batch_rwlock_claim(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
79+
platform_batch_rwlock_get(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
80+
platform_batch_rwlock_claim_loop(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
7381
platform_batch_rwlock_lock(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
7482
}
7583

76-
void
77-
memtable_unlock_insert_lock(memtable_context *ctxt)
84+
static inline void
85+
memtable_end_raw_rotation(memtable_context *ctxt)
7886
{
79-
platform_batch_rwlock_unclaim(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
80-
platform_batch_rwlock_unlock(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
87+
platform_batch_rwlock_full_unlock(ctxt->rwlock, MEMTABLE_INSERT_LOCK_IDX);
8188
}
8289

8390
void
84-
memtable_get_lookup_lock(memtable_context *ctxt)
91+
memtable_begin_lookup(memtable_context *ctxt)
8592
{
8693
platform_batch_rwlock_get(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
8794
}
8895

8996
void
90-
memtable_unget_lookup_lock(memtable_context *ctxt)
97+
memtable_end_lookup(memtable_context *ctxt)
9198
{
9299
platform_batch_rwlock_unget(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
93100
}
94101

95102
void
96-
memtable_lock_lookup_lock(memtable_context *ctxt)
103+
memtable_block_lookups(memtable_context *ctxt)
97104
{
98-
platform_batch_rwlock_claim(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
105+
platform_batch_rwlock_get(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
106+
platform_batch_rwlock_claim_loop(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
99107
platform_batch_rwlock_lock(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
100108
}
101109

102110
void
103-
memtable_unlock_lookup_lock(memtable_context *ctxt)
111+
memtable_unblock_lookups(memtable_context *ctxt)
104112
{
105-
platform_batch_rwlock_unclaim(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
106-
platform_batch_rwlock_unlock(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
113+
platform_batch_rwlock_full_unlock(ctxt->rwlock, MEMTABLE_LOOKUP_LOCK_IDX);
107114
}
108115

109116

110117
platform_status
111-
memtable_maybe_rotate_and_get_insert_lock(memtable_context *ctxt,
112-
uint64 *generation)
118+
memtable_maybe_rotate_and_begin_insert(memtable_context *ctxt,
119+
uint64 *generation)
113120
{
114121
uint64 wait = 100;
115122
while (TRUE) {
116-
memtable_get_insert_lock(ctxt);
117-
*generation = ctxt->generation;
118-
uint64 mt_no = *generation % ctxt->cfg.max_memtables;
119-
memtable *mt = &ctxt->mt[mt_no];
120-
if (mt->state != MEMTABLE_STATE_READY) {
123+
memtable_begin_insert(ctxt);
124+
uint64 current_generation = ctxt->generation;
125+
uint64 current_mt_no = current_generation % ctxt->cfg.max_memtables;
126+
memtable *current_mt = &ctxt->mt[current_mt_no];
127+
if (current_mt->state != MEMTABLE_STATE_READY) {
121128
// The next memtable is not ready yet, back off and wait.
122-
memtable_unget_insert_lock(ctxt);
129+
memtable_end_insert(ctxt);
123130
platform_sleep_ns(wait);
124131
wait = wait > 2048 ? wait : 2 * wait;
125132
continue;
126133
}
127134
wait = 100;
128135

129-
if (memtable_is_full(&ctxt->cfg, &ctxt->mt[mt_no])) {
130-
// If the current memtable is full, try to retire it.
131-
memtable_unget_insert_lock(ctxt);
132-
if (memtable_try_lock_insert_lock(ctxt)) {
136+
if (memtable_is_full(&ctxt->cfg, current_mt)) {
137+
// If the current memtable is full, try to retire it
138+
139+
uint64 next_generation = current_generation + 1;
140+
uint64 next_mt_no = next_generation % ctxt->cfg.max_memtables;
141+
memtable *next_mt = &ctxt->mt[next_mt_no];
142+
if (next_mt->state != MEMTABLE_STATE_READY) {
143+
memtable_end_insert(ctxt);
144+
return STATUS_BUSY;
145+
}
146+
147+
if (memtable_try_begin_insert_rotation(ctxt)) {
133148
// We successfully got the lock, so we do the finalization
134149
memtable_transition(
135-
mt, MEMTABLE_STATE_READY, MEMTABLE_STATE_FINALIZED);
150+
current_mt, MEMTABLE_STATE_READY, MEMTABLE_STATE_FINALIZED);
136151

137152
// Safe to increment non-atomically because we have a lock on
138153
// the insert lock
139-
uint64 process_generation = ctxt->generation++;
154+
ctxt->generation++;
155+
platform_assert(ctxt->generation - ctxt->generation_retired
156+
<= ctxt->cfg.max_memtables,
157+
"ctxt->generation: %lu, "
158+
"ctxt->generation_retired: %lu, "
159+
"current_generation: %lu\n",
160+
ctxt->generation,
161+
ctxt->generation_retired,
162+
current_generation);
163+
platform_assert(current_generation + 1 == ctxt->generation,
164+
"ctxt->generation: %lu, "
165+
"ctxt->generation_retired: %lu, "
166+
"current_generation: %lu\n",
167+
ctxt->generation,
168+
ctxt->generation_retired,
169+
current_generation);
170+
140171
memtable_mark_empty(ctxt);
141-
memtable_unlock_insert_lock(ctxt);
142-
memtable_process(ctxt, process_generation);
172+
memtable_end_insert_rotation(ctxt);
173+
memtable_end_insert(ctxt);
174+
memtable_process(ctxt, current_generation);
143175
} else {
176+
memtable_end_insert(ctxt);
144177
platform_sleep_ns(wait);
145178
wait = wait > 2048 ? wait : 2 * wait;
146179
}
147180
continue;
148181
}
149-
*generation = ctxt->generation;
182+
*generation = current_generation;
150183
return STATUS_OK;
151184
}
152185
}
@@ -232,17 +265,19 @@ memtable_dec_ref_maybe_recycle(memtable_context *ctxt, memtable *mt)
232265
uint64
233266
memtable_force_finalize(memtable_context *ctxt)
234267
{
235-
memtable_lock_insert_lock(ctxt);
268+
memtable_begin_raw_rotation(ctxt);
236269

237270
uint64 generation = ctxt->generation;
238271
uint64 mt_no = generation % ctxt->cfg.max_memtables;
239272
memtable *mt = &ctxt->mt[mt_no];
240273
memtable_transition(mt, MEMTABLE_STATE_READY, MEMTABLE_STATE_FINALIZED);
241-
uint64 process_generation = ctxt->generation++;
274+
uint64 current_generation = ctxt->generation++;
275+
platform_assert(ctxt->generation - ctxt->generation_retired
276+
<= ctxt->cfg.max_memtables);
242277
memtable_mark_empty(ctxt);
243278

244-
memtable_unlock_insert_lock(ctxt);
245-
return process_generation;
279+
memtable_end_raw_rotation(ctxt);
280+
return current_generation;
246281
}
247282

248283
void

src/memtable.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ typedef struct memtable_context {
127127
platform_mutex incorporation_mutex;
128128
volatile uint64 generation_to_incorporate;
129129

130-
// Protected by the MEMTABLE_INSERT_LOCK_IDX'th lock of rwlock. Must hold
130+
// Protected by the MEMTABLE_LOOKUP_LOCK_IDX'th lock of rwlock. Must hold
131131
// read lock to read and write lock to modify.
132132
volatile uint64 generation_retired;
133133

@@ -140,23 +140,23 @@ typedef struct memtable_context {
140140
} memtable_context;
141141

142142
platform_status
143-
memtable_maybe_rotate_and_get_insert_lock(memtable_context *ctxt,
144-
uint64 *generation);
143+
memtable_maybe_rotate_and_begin_insert(memtable_context *ctxt,
144+
uint64 *generation);
145145

146146
void
147-
memtable_unget_insert_lock(memtable_context *ctxt);
147+
memtable_end_insert(memtable_context *ctxt);
148148

149149
void
150-
memtable_get_lookup_lock(memtable_context *ctxt);
150+
memtable_begin_lookup(memtable_context *ctxt);
151151

152152
void
153-
memtable_unget_lookup_lock(memtable_context *ctxt);
153+
memtable_end_lookup(memtable_context *ctxt);
154154

155155
void
156-
memtable_lock_lookup_lock(memtable_context *ctxt);
156+
memtable_block_lookups(memtable_context *ctxt);
157157

158158
void
159-
memtable_unlock_lookup_lock(memtable_context *ctxt);
159+
memtable_unblock_lookups(memtable_context *ctxt);
160160

161161
platform_status
162162
memtable_insert(memtable_context *ctxt,

src/platform_linux/platform.c

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,41 +275,57 @@ platform_batch_rwlock_unlock(platform_batch_rwlock *lock, uint64 lock_idx)
275275
__sync_lock_release(&lock->write_lock[lock_idx].lock);
276276
}
277277

278+
void
279+
platform_batch_rwlock_full_unlock(platform_batch_rwlock *lock, uint64 lock_idx)
280+
{
281+
platform_batch_rwlock_unlock(lock, lock_idx);
282+
platform_batch_rwlock_unclaim(lock, lock_idx);
283+
platform_batch_rwlock_unget(lock, lock_idx);
284+
}
285+
278286
/*
279287
*-----------------------------------------------------------------------------
280288
* try_claim/claim/unlock
281289
*
282290
* A claim blocks all other claimants (and therefore all other writelocks,
283291
* because writelocks are required to hold a claim during the writelock).
284292
*
285-
* Cannot hold a get (read lock)
286-
*
293+
* Must hold a get (read lock)
287294
* try_claim returns whether the claim succeeded
288295
*-----------------------------------------------------------------------------
289296
*/
290297

291298
bool
292299
platform_batch_rwlock_try_claim(platform_batch_rwlock *lock, uint64 lock_idx)
293300
{
301+
threadid tid = platform_get_tid();
302+
debug_assert(lock->read_counter[tid][lock_idx]);
294303
if (__sync_lock_test_and_set(&lock->write_lock[lock_idx].claim, 1)) {
295304
return FALSE;
296305
}
306+
debug_only uint8 old_counter =
307+
__sync_fetch_and_sub(&lock->read_counter[tid][lock_idx], 1);
308+
debug_assert(0 < old_counter);
297309
return TRUE;
298310
}
299311

300312
void
301-
platform_batch_rwlock_claim(platform_batch_rwlock *lock, uint64 lock_idx)
313+
platform_batch_rwlock_claim_loop(platform_batch_rwlock *lock, uint64 lock_idx)
302314
{
303315
uint64 wait = 1;
304316
while (!platform_batch_rwlock_try_claim(lock, lock_idx)) {
317+
platform_batch_rwlock_unget(lock, lock_idx);
305318
platform_sleep_ns(wait);
306319
wait = wait > 2048 ? wait : 2 * wait;
320+
platform_batch_rwlock_get(lock, lock_idx);
307321
}
308322
}
309323

310324
void
311325
platform_batch_rwlock_unclaim(platform_batch_rwlock *lock, uint64 lock_idx)
312326
{
327+
threadid tid = platform_get_tid();
328+
__sync_fetch_and_add(&lock->read_counter[tid][lock_idx], 1);
313329
__sync_lock_release(&lock->write_lock[lock_idx].claim);
314330
}
315331

src/platform_linux/platform_types.h

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,29 +96,60 @@ _Static_assert(sizeof(platform_batch_rwlock)
9696
== PLATFORM_CACHELINE_SIZE * (MAX_THREADS / 2 + 1),
9797
"Missized platform_batch_rwlock\n");
9898

99+
100+
/*
101+
* The state machine for a thread interacting with a batch_rwlock is:
102+
*
103+
* get claim lock
104+
* unlocked <-------> read-locked <----------> claimed <--------> write-locked
105+
* unget unclaim unlock
106+
*
107+
* Note that try_claim() may fail, in which case the state of the lock
108+
* is unchanged, i.e. the caller still holds a read lock.
109+
*/
110+
111+
99112
void
100113
platform_batch_rwlock_init(platform_batch_rwlock *lock);
101114

115+
/* no lock -> shared lock */
102116
void
103-
platform_batch_rwlock_lock(platform_batch_rwlock *lock, uint64 lock_idx);
117+
platform_batch_rwlock_get(platform_batch_rwlock *lock, uint64 lock_idx);
104118

119+
/* shared lock -> no lock */
105120
void
106-
platform_batch_rwlock_unlock(platform_batch_rwlock *lock, uint64 lock_idx);
121+
platform_batch_rwlock_unget(platform_batch_rwlock *lock, uint64 lock_idx);
107122

123+
/*
124+
* shared-lock -> claim (may fail)
125+
*
126+
* Callers still hold a shared lock after a failed claim attempt.
127+
* Callers _must_ release their shared lock after a failed claim attempt.
128+
*/
108129
bool
109130
platform_batch_rwlock_try_claim(platform_batch_rwlock *lock, uint64 lock_idx);
110131

132+
/* shared-lock -> claim, BUT(!) may temporarily release the shared-lock in the
133+
* process. */
111134
void
112-
platform_batch_rwlock_claim(platform_batch_rwlock *lock, uint64 lock_idx);
135+
platform_batch_rwlock_claim_loop(platform_batch_rwlock *lock, uint64 lock_idx);
113136

137+
/* claim -> shared lock */
114138
void
115139
platform_batch_rwlock_unclaim(platform_batch_rwlock *lock, uint64 lock_idx);
116140

141+
/* claim -> exclusive lock */
117142
void
118-
platform_batch_rwlock_get(platform_batch_rwlock *lock, uint64 lock_idx);
143+
platform_batch_rwlock_lock(platform_batch_rwlock *lock, uint64 lock_idx);
119144

145+
/* exclusive lock -> claim */
120146
void
121-
platform_batch_rwlock_unget(platform_batch_rwlock *lock, uint64 lock_idx);
147+
platform_batch_rwlock_unlock(platform_batch_rwlock *lock, uint64 lock_idx);
148+
149+
/* exclusive-lock -> unlocked */
150+
void
151+
platform_batch_rwlock_full_unlock(platform_batch_rwlock *lock, uint64 lock_idx);
152+
122153

123154
// Buffer handle
124155
typedef struct {

0 commit comments

Comments
 (0)