/*
* plasma_spin - portable macros for processor-specific spin loops
*
* inline assembly and compiler intrinsics for building spin loops / spinlocks
*
*
* Copyright (c) 2012, Glue Logic LLC. All rights reserved. code()gluelogic.com
*
* This file is part of plasma.
*
* plasma is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* plasma is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with plasma. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef INCLUDED_PLASMA_SPIN_H
#define INCLUDED_PLASMA_SPIN_H
#include "plasma_feature.h"
#include "plasma_attr.h"
#include "plasma_membar.h"
#include "plasma_atomic.h"
#include "plasma_endian.h"
PLASMA_ATTR_Pragma_once
#if defined(__APPLE__) && defined(__MACH__)
#include <libkern/OSAtomic.h>
#include <sched.h>
#endif
#ifndef PLASMA_SPIN_C99INLINE
#define PLASMA_SPIN_C99INLINE C99INLINE
#endif
#ifndef NO_C99INLINE
#ifndef PLASMA_SPIN_C99INLINE_FUNCS
#define PLASMA_SPIN_C99INLINE_FUNCS
#endif
#endif
/* spinlocks / busy-wait loops that pause briefly (versus spinning on nop
* instructions) benefit modern processors by reducing memory hazards upon loop
* exit as well as potentially saving CPU power and reducing resource contention
 * with other CPU threads (SMT) (a.k.a. strands) on the same core. */
/* plasma_spin_pause() */
/* The asm memory constraint (:::"memory") is not required for pause, but
* if provided acts as a compiler fence to disable compiler optimization,
* e.g. in a spin loop where the spin variable is (incorrectly) not a load
* of _Atomic, or is not volatile. The __volatile__ on the asm (where present
* in the 'pause' implementations below) effects appropriate compiler fence.
* NB: volatile -does not- appear to result in a compiler fence for IBM xlC */
#if defined(_MSC_VER)
/* prefer using intrinsics directly instead of winnt.h macro */
/* http://software.intel.com/en-us/forums/topic/296168 */
/* https://learn.microsoft.com/en-us/cpp/intrinsics/arm-intrinsics */
#include <intrin.h>
#if defined(_M_AMD64) || defined(_M_IX86)
#pragma intrinsic(_mm_pause)
#define plasma_spin_pause() _mm_pause()
/* (if pause not supported by older x86 assembler, "rep nop" is equivalent)*/
/*#define plasma_spin_pause() __asm rep nop */
#elif defined(_M_IA64)
#pragma intrinsic(__yield)
#define plasma_spin_pause() __yield()
#elif defined (_M_ARM) || defined(_M_ARMT)
#pragma intrinsic(__yield)
#define plasma_spin_pause() __yield()
#elif defined(_M_ARM64) || defined(_M_ARM64EC)
/* https://web.archive.org/web/20231004132033/https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8258604 */
/* https://bugs.mysql.com/bug.php?id=100664 */
#pragma intrinsic(__isb)
#define plasma_spin_pause() __isb()
#else
#define plasma_spin_pause() YieldProcessor()
#endif
#if 0 /* do not set these in header; calling file should determine need */
/* http://msdn.microsoft.com/en-us/library/windows/desktop/ms687419%28v=vs.85%29.aspx */
/* NB: caller needs to include proper header for YieldProcessor()
* (e.g. #include <windows.h>)
* http://stackoverflow.com/questions/257134/weird-compile-error-dealing-with-winnt-h */
/* http://support.microsoft.com/kb/166474 */
#define VC_EXTRALEAN
#define WIN32_LEAN_AND_MEAN
#include <wtypes.h>
#include <windef.h>
#include <winnt.h>
#endif
#elif defined(__x86_64__) || defined(__i386__)
/* http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/intref_cls/common/intref_sse2_pause.htm
* http://software.intel.com/sites/default/files/m/2/c/d/3/9/25602-17689_w_spinlock.pdf
* http://software.intel.com/en-us/forums/topic/309231
* http://siyobik.info.gf/main/reference/instruction/PAUSE
* http://stackoverflow.com/questions/7086220/what-does-rep-nop-mean-in-x86-assembly
* http://stackoverflow.com/questions/7371869/minimum-time-a-thread-can-pause-in-linux
*/
/* http://gcc.gnu.org/onlinedocs/gcc/X86-Built_002din-Functions.html
* GCC provides __builtin_ia32_pause()
* Generates the pause machine instruction with a compiler memory barrier.
* (not sure if preferred vis-a-vis _mm_pause() or asm "pause" (below)) */
/* avoid pulling in the large header xmmintrin.h just for plasma_spin_pause()
* clang xmmintrin.h defines _mm_pause() as pause
* #if defined(__clang__) || defined(__INTEL_COMPILER)
* ||(__GNUC_PREREQ(4,7) && defined(__SSE__))
* #include <xmmintrin.h>
* #define plasma_spin_pause() _mm_pause()
* gcc 4.7 xmmintrin.h defines _mm_pause() as rep; nop, which is portable to
* chips pre-Pentium 4. Although opcode generated is same, we want pause
* (if pause not supported by older x86 assembler, "rep; nop" is equivalent)
* #define plasma_spin_pause() __asm__ __volatile__ ("rep; nop") */
#define plasma_spin_pause() __asm__ __volatile__ ("pause")
#elif defined(__ia64__)
/* http://www.intel.com/content/www/us/en/processors/itanium/itanium-architecture-vol-3-manual.html volume 3 page 145 (3:145) */
#if (defined(__HP_cc__) || defined(__HP_aCC__))
#define plasma_spin_pause() _Asm_hint(_HINT_PAUSE)
#else
#define plasma_spin_pause() __asm__ __volatile__ ("hint @pause")
#endif
#elif defined(__arm__) || defined(__thumb__) || defined(__aarch64__)
/* ARMv7 Architecture Reference Manual (for YIELD) */
/* ARM Compiler toolchain Compiler Reference (for __yield() intrinsic) */
#ifdef __CC_ARM
#define plasma_spin_pause() __yield()
#elif defined(__aarch64__)
/* https://web.archive.org/web/20231004132033/https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8258604 */
/* https://bugs.mysql.com/bug.php?id=100664 */
#define plasma_spin_pause() __asm__ __volatile__ ("isb")
#else
#define plasma_spin_pause() __asm__ __volatile__ ("yield")
#endif
#elif defined(__sparc) || defined(__sparc__)
/* http://www.oracle.com/technetwork/systems/opensparc/sparc-architecture-2011-1728132.pdf
* 5.5.12 Pause Count (PAUSE) Register (ASR 27)
* 7.100 Pause
* PAUSE instruction
* 7.141 Write Ancillary State Register
* WRPAUSE instruction
* wr regrs1, reg_or_imm,%pause
*
* See thorough DELAY_SPIN implementation in Solaris kernel:
* https://github.com/joyent/illumos-joyent/blob/master/usr/src/common/atomic/sparcv9/atomic.s
* https://hg.openindiana.org/upstream/illumos/illumos-gate/rev/7c80b70bb8de
*
* Simpler:
* https://github.com/joyent/illumos-joyent/blob/master/usr/src/lib/libc/capabilities/sun4v/common/smt_pause.s
*/
#if 0 /* barrier only prevents compiler from optimizing away a spin loop */
#if (defined(__SUNPRO_C) && __SUNPRO_C >= 0x5110) \
|| (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5110)
/* http://www.oracle.com/technetwork/server-storage/solarisstudio/documentation/oss-compiler-barriers-176055.pdf
* http://www.oracle.com/technetwork/server-storage/solarisstudio/documentation/oss-memory-barriers-fences-176056.pdf */
#include <mbarrier.h>
#define plasma_spin_pause() __compiler_barrier()
#else
#define plasma_spin_pause() __asm __volatile__("membar #LoadLoad":::"memory")
#endif
#endif /* barrier only prevents compiler from optimizing away a spin loop */
/* Note: SPARC-T4 and newer come with a 'pause' instruction.
 * The length of the pause can be modified by writing to ASR 27
 * e.g. wr %%g0, 128, %%asr27
 * However, for compatibility with all SPARC, do a generic pause by reading
 * the condition code register a few times. If this were implemented as a
 * routine in a shared library instead of inline assembly, then a function
 * optimized for the specific architecture and model could be used */
#if defined(__SUNPRO_C)
#define plasma_spin_pause() __asm __volatile__ ("rd %%ccr, %%g0 \n\t" \
"rd %%ccr, %%g0 \n\t" \
"rd %%ccr, %%g0")
#else /* defined(__SUNPRO_CC) || defined(__GNUC__) */
#define plasma_spin_pause() __asm __volatile__ ("rd %ccr, %g0 \n\t" \
"rd %ccr, %g0 \n\t" \
"rd %ccr, %g0")
#endif
#elif defined(__ppc__) || defined(_ARCH_PPC) || \
defined(_ARCH_PWR) || defined(_ARCH_PWR2) || defined(_POWER)
/* http://stackoverflow.com/questions/5425506/equivalent-of-x86-pause-instruction-for-ppc
* https://www.power.org/documentation/power-instruction-set-architecture-version-2-06/attachment/powerisa_v2-06b_v2_public/ (requires free account)
* 3.2 "or" Instruction
* "or" Shared Resource Hints
 * For compatibility use 'or 27,27,27' instead of the 'yield' extended mnemonic
 * (available with POWER 7), and use volatile instead of __volatile__
 * (since -qlanglvl=stdc99 rejects __volatile__, though -qlanglvl=extc99 supports it)
*/
#define plasma_spin_pause() __asm__ volatile ("or 27,27,27")
#else
/* help prevent compiler from optimizing loop away; not a pause */
#define plasma_spin_pause() plasma_membar_ccfence()
#endif
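/* Example (illustrative sketch kept as a comment, not compiled here): a minimal
 * busy-wait that polls a flag and pauses between polls.  The 'ready' flag and
 * surrounding code are hypothetical; a real spin variable should be read via an
 * atomic (or at least volatile) load so the compiler re-reads it each iteration
 * (see the note above plasma_spin_pause() about the compiler fence).
 *
 *   extern volatile int ready;              (hypothetical flag set elsewhere)
 *   while (!ready)
 *       plasma_spin_pause();                (brief pause between polls)
 *   plasma_membar_atomic_thread_fence_acq_rel();  (barrier before using data
 *                                                  published along with flag)
 */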
/* plasma_spin_yield()
* attempt switch to another thread on same CPU, else sleep 1ms */
#if defined(_MSC_VER)
/* prototype defined in different headers in different MS Windows versions */
VOID WINAPI Sleep(_In_ DWORD dwMilliseconds);
BOOL WINAPI SwitchToThread(void);
#define plasma_spin_yield() do {if (!SwitchToThread()) Sleep(1);} while (0)
#elif defined(PLASMA_FEATURE_POSIX)
#include <unistd.h> /* _POSIX_PRIORITY_SCHEDULING */
#include <poll.h> /* might also need _XOPEN_SOURCE=500 or better on _AIX */
#if defined(_POSIX_PRIORITY_SCHEDULING) && (_POSIX_PRIORITY_SCHEDULING-0)!= -1
#include <sched.h>
#define plasma_spin_yield() \
do {if (sched_yield()!=0) (void)poll(NULL, 0, 1); } while(0)
#else
#define plasma_spin_yield() (void)poll(NULL, 0, 1)
#endif
#endif
#if defined(__APPLE__) && defined(__MACH__)
/* use Mach kernel thread_switch() to reduce priority inversion spinning
* http://web.mit.edu/darwin/src/modules/xnu/osfmk/man/thread_switch.html
* http://www.gnu.org/software/hurd/gnumach-doc/Hand_002dOff-Scheduling.html
* http://stackoverflow.com/questions/12949028/spin-lock-implementations-osspinlock?rq=1
*/
#include <TargetConditionals.h>
#if TARGET_OS_MAC
#include <mach/thread_switch.h>
#include <sched.h>
#undef plasma_spin_yield
#define plasma_spin_yield() \
do {if (thread_switch(THREAD_NULL,SWITCH_OPTION_DEPRESS,1) != KERN_SUCCESS \
&& sched_yield() != 0) \
(void)poll(NULL, 0, 1); \
} while (0) /* (untested!) */
#endif
#endif
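/* Example (illustrative sketch, not an interface provided by this header): a
 * hybrid backoff loop in the spirit of the "Hybrid spinning" references at the
 * bottom of this file -- pause for a while, then start yielding the CPU.  The
 * try_lock() call and the threshold of 64 are hypothetical placeholders.
 *
 *   int spins = 0;
 *   while (!try_lock()) {                   (hypothetical lock attempt)
 *       if (++spins < 64)
 *           plasma_spin_pause();
 *       else
 *           plasma_spin_yield();
 *   }
 */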
#ifdef __cplusplus
extern "C" {
#endif
/* plasma_spin_lock_init()
* plasma_spin_lock_acquire()
* plasma_spin_lock_acquire_try()
* plasma_spin_lock_acquire_spinloop()
* plasma_spin_lock_acquire_spindecay()
* plasma_spin_lock_release()
*
* plasma_spin_lock_acquire() is intended for use protecting small critical
* sections that do not contain any (zero, zilch, nada) blocking calls,
* e.g. for use when the cost of obtaining a mutex or the cost of a context
* switch is greater than the cost of executing the instructions in the
* critical section.
*
* plasma_spin_lock_acquire_spindecay()
* spin loop hybrid with adjustable spin counts for levels of backoff/decay
* (pause count, (pause x 32) count, yield count)
* e.g. plasma_spin_lock_acquire_spindecay(spin, 32, 64, 8)
* returns true if lock obtained, false if backoff counts exhausted (decayed)
* - again, caller should avoid blocking operations inside critical section
* - for potential use with moderately contended locks (YMMV)
 * - for contended cases, after measuring that this does better than a basic spinloop
* - adjust spin count to your needs after testing your application
* Note that if spin often reaches yield level without obtaining lock, then
* performance suffers and a real mutex is probably a better choice. Though
* yielding the CPU aims to prevent continual spin for long periods of time
* when something else is wrong, spinning fruitlessly is still expensive and
* many threads spinning results in fewer resources available for real work.
*
* Performance notes:
* - locks should be spread out in memory and not share same cache line
* (to avoid false sharing); recommend 128-byte separation
* recommend adding padding on both sides of lock structure when allocating
* (again, to avoid false sharing), especially if lock struct is on the stack
* - not bothering to do non-atomic pre-check of spin->lck prior to atomic
* attempt to acquire the lock since branch prediction and pipelining will
* likely result in the atomic instruction being started anyway. When there
* is significant contention, it might make sense to directly call
* plasma_spin_lock_acquire_spinloop() or plasma_spin_lock_acquire_spindecay()
* instead of plasma_spin_lock_acquire()
* - plasma_spin_lock_t lck member is intentionally not volatile
* (avoid compiler volatile effects)
 * - prefer using a real mutex for highly contended locks; OS-provided mutexes are
 *   implemented as adaptive mutexes in most modern operating system standard libs
* (uncontended mutexes are locked in user-space, and contended mutexes spin
* in user-space for a short period before falling back to the kernel to
* arbitrate) (other options include semaphores, wait queues, event objects)
* - prefer Apple OSSpinLock* implementation where available
* On uniprocessor systems, spinlock implementation does not needlessly spin.
* (A sizable number of mobile devices are uniprocessor and single core.)
*/
#if (defined(__APPLE__) && defined(__MACH__)) \
&& defined(MAC_OS_X_VERSION_MIN_REQUIRED) \
&& MAC_OS_X_VERSION_MIN_REQUIRED-0 >= 1070 /* OSSpinLock in OSX 10.7 */
#define plasma_spin_lock_init(spin) \
((spin)->lck=OS_SPINLOCK_INIT, (spin)->udata32=0, (spin)->udata64=0)
#define PLASMA_SPIN_LOCK_INITIALIZER { OS_SPINLOCK_INIT, 0, 0 }
#define plasma_spin_lock_release(spin) OSSpinLockUnlock(&(spin)->lck)
#define plasma_spin_lock_acquire_try(spin) OSSpinLockTry(&(spin)->lck)
#define plasma_spin_lock_acquire(spin) OSSpinLockLock(&(spin)->lck)
#define plasma_spin_lock_acquire_spinloop(spin) OSSpinLockLock(&(spin)->lck)
/* (OSSpinLock currently defined by libkern/OSAtomic.h as int32_t) */
typedef OSSpinLock plasma_spin_lock_lock_t;
#else
typedef uint32_t plasma_spin_lock_lock_t;
#endif
typedef __attribute_aligned__(16)
struct plasma_spin_lock_t {
plasma_spin_lock_lock_t lck;
uint32_t udata32; /* user data 4-bytes */
uint64_t udata64; /* user data 8-bytes */
} plasma_spin_lock_t;
/* default spin lock macro implementation */
#ifndef PLASMA_SPIN_LOCK_INITIALIZER
#if (defined(__STDC_VERSION__) && __STDC_VERSION__-0 >= 199901L) /* C99 */
#define PLASMA_SPIN_LOCK_INITIALIZER { .lck = PLASMA_ATOMIC_LOCK_INITIALIZER, \
.udata32 = 0, \
.udata64 = 0 }
#else
#define PLASMA_SPIN_LOCK_INITIALIZER { PLASMA_ATOMIC_LOCK_INITIALIZER, 0, 0 }
#endif
#define plasma_spin_lock_init(spin) \
((spin)->lck = PLASMA_ATOMIC_LOCK_INITIALIZER, \
(spin)->udata32 = 0, \
(spin)->udata64 = 0)
#define plasma_spin_lock_release(spin) \
plasma_atomic_lock_release(&(spin)->lck)
#define plasma_spin_lock_acquire_try(spin) \
plasma_atomic_lock_acquire(&(spin)->lck)
#define plasma_spin_lock_acquire(spin) \
(plasma_spin_lock_acquire_try(spin) \
|| plasma_spin_lock_acquire_spinloop(spin))
#endif /* default spin lock macro implementation */
#ifndef plasma_spin_lock_acquire_spinloop
/*(plasma_spin_lock_acquire_spinloop() always returns true)*/
__attribute_nonnull__()
bool
plasma_spin_lock_acquire_spinloop (plasma_spin_lock_t * const spin);
#endif
__attribute_nonnull__()
bool
plasma_spin_lock_acquire_spindecay (plasma_spin_lock_t * const spin,
int pause1, int pause32, int yield);
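/* Example (illustrative sketch; identifiers other than the plasma_spin_lock_*
 * interfaces are hypothetical).  The lock is padded on both sides, per the
 * performance notes above, to keep it off cache lines shared with other data.
 *
 *   struct padded_lock {
 *       char pad0[128];
 *       plasma_spin_lock_t spin;
 *       char pad1[128];
 *   };
 *   static struct padded_lock lk = { {0}, PLASMA_SPIN_LOCK_INITIALIZER, {0} };
 *
 *   plasma_spin_lock_acquire(&lk.spin);
 *   ... short critical section; no blocking calls ...
 *   plasma_spin_lock_release(&lk.spin);
 *
 * or, for a moderately contended lock, bounded backoff with decay:
 *
 *   if (plasma_spin_lock_acquire_spindecay(&lk.spin, 32, 64, 8)) {
 *       ... critical section ...
 *       plasma_spin_lock_release(&lk.spin);
 *   }
 *   else {
 *       ... backoff exhausted; consider falling back to a real mutex ...
 *   }
 */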
/* plasma_spin_tktlock_*() ticket lock
* (see bottom of file for ticket lock references)
*
* plasma_spin_tktlock_init()
* plasma_spin_tktlock_is_free()
* plasma_spin_tktlock_acquire()
* plasma_spin_tktlock_acquire_spinloop()
* plasma_spin_tktlock_release()
*
* Note: no trylock interface for ticket lock; trylock can be approximated by
* checking plasma_spin_tktlock_is_free() prior to attempting to obtain lock,
* but doing so might result in two cache line misses to obtain lock, similar
* to CAS.
*/
typedef __attribute_aligned__(16)
struct plasma_spin_tktlock_t {
union { uint32_t u; struct { uint16_t le; uint16_t be; } t; } lck;
uint32_t udata32; /* user data 4-bytes */
uint64_t udata64; /* user data 8-bytes */
} plasma_spin_tktlock_t;
#define PLASMA_SPIN_TKTLOCK_TKTMAX 0xFFFFu
#define PLASMA_SPIN_TKTLOCK_TKTINC 0x10000u
#define PLASMA_SPIN_TKTLOCK_MASK(x) ((x) & PLASMA_SPIN_TKTLOCK_TKTMAX)
#define PLASMA_SPIN_TKTLOCK_SHIFT(x) ((x) >> 16)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__-0 >= 199901L) /* C99 */
#define PLASMA_SPIN_TKTLOCK_INITIALIZER {.lck.u = 0, .udata32 = 0, .udata64 = 0}
#else
#define PLASMA_SPIN_TKTLOCK_INITIALIZER { { 0 }, 0, 0 }
#endif
#define plasma_spin_tktlock_init(t) ((t)->lck.u=0,(t)->udata32=0,(t)->udata64=0)
__attribute_nonnull__()
PLASMA_SPIN_C99INLINE
bool
plasma_spin_tktlock_is_free (const plasma_spin_tktlock_t * const restrict spin);
#ifdef PLASMA_SPIN_C99INLINE_FUNCS
PLASMA_SPIN_C99INLINE
bool
plasma_spin_tktlock_is_free (const plasma_spin_tktlock_t * const restrict spin)
{
const uint32_t tkt = spin->lck.u;
return (PLASMA_SPIN_TKTLOCK_SHIFT(tkt) == PLASMA_SPIN_TKTLOCK_MASK(tkt));
}
#endif
/*(plasma_spin_tktlock_acquire_spinloop() always returns true)*/
__attribute_noinline__
__attribute_nonnull__()
bool
plasma_spin_tktlock_acquire_spinloop (plasma_spin_tktlock_t *
const restrict spin, uint32_t tkt);
/*(plasma_spin_tktlock_acquire() always returns true)*/
__attribute_nonnull__()
PLASMA_SPIN_C99INLINE
bool
plasma_spin_tktlock_acquire (plasma_spin_tktlock_t * const restrict spin);
#ifdef PLASMA_SPIN_C99INLINE_FUNCS
PLASMA_SPIN_C99INLINE
bool
plasma_spin_tktlock_acquire (plasma_spin_tktlock_t * const restrict spin)
{
const uint32_t tkt = /* increment ticket count (high 16-bits of lck) */
plasma_atomic_fetch_add_u32(&spin->lck.u, PLASMA_SPIN_TKTLOCK_TKTINC,
memory_order_relaxed);
if (__builtin_expect(
(PLASMA_SPIN_TKTLOCK_SHIFT(tkt)==PLASMA_SPIN_TKTLOCK_MASK(tkt)), 1)) {
/* (atomic rmw op provides acquire barrier on some platforms (e.g. x86);
* emit barrier only if platform requires barrier after rmw atomic)
* (not sure if atomic_thread_fence(memory_order_acq_rel) does same)*/
plasma_membar_atomic_thread_fence_acq_rel();
return true;
}
return plasma_spin_tktlock_acquire_spinloop(spin, tkt);
}
#endif
__attribute_nonnull__()
PLASMA_SPIN_C99INLINE
void
plasma_spin_tktlock_release (plasma_spin_tktlock_t * const restrict spin);
#ifdef PLASMA_SPIN_C99INLINE_FUNCS
PLASMA_SPIN_C99INLINE
void
plasma_spin_tktlock_release (plasma_spin_tktlock_t * const restrict spin)
{
/* increment ticket num ready to be served (low 16-bits of lck) */
/* must write only 16 bits (and avoid more expensive atomic exchange);
* use union to avoid type-punning pointer between uint32_t and uint16_t
* (writing then reading different members of union is unspecified in C std)
* In C99, Annex J (non-normative) "J.1 Unspecified behavior. The following
* are unspecified: The value of a union member other than the last one
* stored into (6.2.6.1)." However, the result appears predictable in
* practice, as long as the structure is naturally aligned and naturally
* packed without padding. Also, note that reading the 16-bit value,
* incrementing it, and storing it is not a race condition with
* plasma_spin_tktlock_acquire() -- which does 32-bit atomic add, including
* 16 bits modified here -- since acquire does not modify these 16 bits.
* NB: must test behavior with different compiler optimization levels and
* ensure that pointer aliasing in union is not optimized into incorrect
* code. Might need to disable optimization on this routine or set
* compiler optimization fences if atomic store does not provide barrier */
__attribute_may_alias__
#if defined(__LITTLE_ENDIAN__)
uint16_t * const ptr = &spin->lck.t.le;
#elif defined(__BIG_ENDIAN__)
uint16_t * const ptr = &spin->lck.t.be;
#endif
const uint16_t n = *ptr + (uint16_t)1u;
plasma_atomic_store_explicit(ptr, n, memory_order_release);
}
#endif
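/* Example (illustrative sketch): ticket lock usage.  Tickets are granted in
 * FIFO order; there is no trylock, though plasma_spin_tktlock_is_free() may be
 * checked first as an approximation (see note above).
 *
 *   static plasma_spin_tktlock_t tlock = PLASMA_SPIN_TKTLOCK_INITIALIZER;
 *
 *   plasma_spin_tktlock_acquire(&tlock);
 *   ... short critical section; no blocking calls ...
 *   plasma_spin_tktlock_release(&tlock);
 */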
/* plasma_spin_taglock_*() (fuzzy ticket lock)
* (alternative to strict ticket lock)
*
* Traditional ticket lock is fair, with each waiting thread acquiring the lock
* in strict progression. However, when a ticket lock is released and the count
* incremented, the thread waiting for that ticket number might be sleeping,
* which delays everyone. plasma_spin_taglock_*() provides an alternative.
* With a small number of threads spinning for the same batch of tag numbers,
* there is a higher probability that at least one will be actively spinning
* and able to grab the lock without further delay, while at the same time
* avoiding a thundering herd. Those with tags in the batch all get the lock
* before the next batch, so the number of waiters for the taglock converges
* to one for each batch, maintaining some fairness, though fuzzier than
* traditional, strictly fair ticket lock.
*
* Additional features:
* - plasma_spin_taglock allows for higher priority threads to compete for the
* lock with the current batch, instead of going to the end of the queue,
* (high priority threads can use plasma_spin_taglock_acquire_urgent())
* (high priority threads using this might starve low priority threads)
*/
typedef __attribute_aligned__(16)
struct plasma_spin_taglock_t {
uint32_t lck;
uint32_t tag;
uint64_t udata64; /* user data 8-bytes */
} plasma_spin_taglock_t;
#define PLASMA_SPIN_TAGLOCK_TAGMAX 0x7FFFFFFFu
#define PLASMA_SPIN_TAGLOCK_ORVAL 0x80000000u
#define PLASMA_SPIN_TAGLOCK_MASK(x) ((x) & PLASMA_SPIN_TAGLOCK_TAGMAX)
#define PLASMA_SPIN_TAGLOCK_IS_LOCKED(x) ((x) & PLASMA_SPIN_TAGLOCK_ORVAL)
#define PLASMA_SPIN_TAGLOCK_UNLOCKVAL(x) ((x) & ~PLASMA_SPIN_TAGLOCK_ORVAL)
#define PLASMA_SPIN_TAGLOCK_LOCKVAL(x) ((x) | PLASMA_SPIN_TAGLOCK_ORVAL)
#if (defined(__STDC_VERSION__) && __STDC_VERSION__-0 >= 199901L) /* C99 */
#define PLASMA_SPIN_TAGLOCK_INITIALIZER { .lck = 0, .tag = 0, .udata64 = 0 }
#else
#define PLASMA_SPIN_TAGLOCK_INITIALIZER { 0, 0, 0 }
#endif
#define plasma_spin_taglock_init(t) ((t)->lck=0, (t)->tag=0, (t)->udata64=0)
#define plasma_spin_taglock_is_free(taglock) \
(!PLASMA_SPIN_TAGLOCK_IS_LOCKED((taglock)->lck))
#define plasma_spin_taglock_is_contended(taglock) \
((taglock)->lck != PLASMA_SPIN_TAGLOCK_MASK((taglock)->tag))
#ifndef __ia64__
#define PLASMA_SPIN_TAGLOCK_VIA_FETCH_OR
#endif
/* NOTE: plasma_spin_taglock_acquire_try() is *unfair*; avoid under contention
* (e.g. caller might first check lock using plasma_spin_taglock_is_contended()
* or maintain a flag in separate cache line to call acquire vs acquire_try) */
__attribute_nonnull__()
PLASMA_SPIN_C99INLINE
bool
plasma_spin_taglock_acquire_try (plasma_spin_taglock_t *
const restrict taglock);
#ifdef PLASMA_SPIN_C99INLINE_FUNCS
PLASMA_SPIN_C99INLINE
bool
plasma_spin_taglock_acquire_try (plasma_spin_taglock_t * const restrict taglock)
{
#ifdef PLASMA_SPIN_TAGLOCK_VIA_FETCH_OR
uint32_t * const restrict lck = (uint32_t *)&taglock->lck;
const uint32_t cmp =
plasma_atomic_fetch_or_u32(lck, PLASMA_SPIN_TAGLOCK_ORVAL,
memory_order_relaxed);
if (!PLASMA_SPIN_TAGLOCK_IS_LOCKED(cmp)) {
/* jumped queue by locking without incrementing tag
* (so decrement lck since taglock_release will increment lck) */
*lck = PLASMA_SPIN_TAGLOCK_LOCKVAL(cmp-1);
plasma_membar_atomic_thread_fence_acq_rel();
return true;
}
return false;
#else /* Must read prior to CAS, so check if contended before CAS attempt */
uint32_t * const restrict lck = (uint32_t *)&taglock->lck;
const uint32_t cmp = *lck;/* bypass queue if not locked and not contended */
if (cmp == PLASMA_SPIN_TAGLOCK_MASK(taglock->tag)/*(jump queue:(cmp - 1))*/
&& plasma_atomic_CAS_32(lck,cmp,PLASMA_SPIN_TAGLOCK_LOCKVAL(cmp - 1))) {
plasma_membar_atomic_thread_fence_acq_rel();
return true;
}
return false;
#endif
}
#endif
__attribute_nonnull__()
bool
plasma_spin_taglock_acquire (plasma_spin_taglock_t * const restrict taglock);
/* NOTE: plasma_spin_taglock_acquire_urgent() is *unfair*
* (but that is the point); avoid many urgent contenders */
__attribute_nonnull__()
bool
plasma_spin_taglock_acquire_urgent (plasma_spin_taglock_t *
const restrict taglock);
__attribute_nonnull__()
PLASMA_SPIN_C99INLINE
void
plasma_spin_taglock_release (plasma_spin_taglock_t * const restrict taglock);
#ifdef PLASMA_SPIN_C99INLINE_FUNCS
PLASMA_SPIN_C99INLINE
void
plasma_spin_taglock_release (plasma_spin_taglock_t * const restrict taglock)
{
uint32_t * const restrict lck = &taglock->lck;
plasma_atomic_store_explicit(lck, PLASMA_SPIN_TAGLOCK_UNLOCKVAL(1u+*lck),
memory_order_release);
}
#endif
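/* Example (illustrative sketch): taglock usage.  Ordinary contenders use
 * plasma_spin_taglock_acquire(); a high-priority thread may compete with the
 * current batch via plasma_spin_taglock_acquire_urgent() (unfair by design),
 * and plasma_spin_taglock_acquire_try() is an unfair fast path best reserved
 * for lightly contended locks (see note above acquire_try).
 *
 *   static plasma_spin_taglock_t taglock = PLASMA_SPIN_TAGLOCK_INITIALIZER;
 *
 *   plasma_spin_taglock_acquire(&taglock);
 *   ... short critical section; no blocking calls ...
 *   plasma_spin_taglock_release(&taglock);
 *
 *   if (plasma_spin_taglock_acquire_try(&taglock)) {
 *       ... critical section ...
 *       plasma_spin_taglock_release(&taglock);
 *   }
 */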
#ifdef __cplusplus
}
#endif
#endif
/* NOTES and REFERENCES
*
* Discussions about spinning and performance:
*
* http://www.1024cores.net/home/lock-free-algorithms/tricks/spinning
* http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures/
* http://stackoverflow.com/questions/11923151/is-there-any-simple-way-to-improve-performance-of-this-spinlock-function
* http://stackoverflow.com/questions/11959374/fastest-inline-assembly-spinlock
* http://locklessinc.com/articles/priority_locks/
* http://locklessinc.com/articles/locks/
* http://www.embedded.com/electronics-blogs/beginner-s-corner/4023947/Introduction-to-Priority-Inversion
*
* http://www.1024cores.net/home/lock-free-algorithms/tricks/spinning
* "On Linux passive spinning is implemented with pthread_yield(), or nanosleep() for "deeper" spin." [Ed: pthread_yield() not portable; sched_yield() is]
* "On Windows situation is a bit more involved. There is SwitchToThread() which is limited to the current processor. There is Sleep(0) which is limited to threads of no-less priority (it's unclear as to whether it's intra-processor or inter-processor). Finally, there is Sleep(1) which should cover all cases (all priorities/all processors)."
* [See section Hybrid spinning for sample do_backoff() routine]
*
* http://cbloomrants.blogspot.com/2011/07/07-29-11-spinning.html
* "Obviously a good software engineer doesn't just randomly stick calls to STT or whatever in their spin loops, they make a function like MySpinYield() and call their function so that they can encapsulate the tricky platform-specific knowledge in one place. It just staggers me how rarely I see anyone do things like this.
* "But really you shouldn't EVER do a spin-wait on Windows. There's no reason for it. Any place you are tempted to use a spin-wait, use an "eventcount" instead and do a proper wait. (perhaps after a few spins, but only if you have very good reason to believe those spins actually help)
* "BTW I've written about the balanced set manager in detail here :
http://cbloomrants.blogspot.com/2010/09/09-12-10-defficiency-of-windows-multi.html "
*
* SwitchToThread()
* http://msdn.microsoft.com/en-us/library/windows/desktop/ms686352%28v=vs.85%29.aspx
*
* Sleep(1)
* http://msdn.microsoft.com/en-us/library/windows/desktop/ms686298%28v=vs.85%29.aspx
* http://www.bluebytesoftware.com/blog/2006/08/23/PriorityinducedStarvationWhySleep1IsBetterThanSleep0AndTheWindowsBalanceSetManager.aspx
*
* timeBeginPeriod(1) (set default system quantum to 1ms instead of 10-15ms)
* (generally not a good idea to do this for extended periods of time)
* http://msdn.microsoft.com/en-us/library/windows/desktop/dd757624%28v=vs.85%29.aspx
* http://cbloomrants.blogspot.com/2010/09/09-12-10-defficiency-of-windows-multi.html
* http://msdn.microsoft.com/en-us/library/ms713413%28v=vs.85%29.aspx
* http://stackoverflow.com/questions/7590475/why-does-increasing-timer-resolution-via-timebeginperiod-impact-power-consumptio
* http://stackoverflow.com/questions/7685762/windows-7-timing-functions-how-to-use-getsystemtimeadjustment-correctly/11743614#11743614
* Timers, Timer Resolution, and Development of Efficient Code
* http://msdn.microsoft.com/en-us/windows/hardware/gg463266.aspx
*
* Apple OSAtomic.h Reference
* https://developer.apple.com/library/mac/#documentation/System/Reference/OSAtomic_header_reference/Reference/reference.html#//apple_ref/doc/uid/TP40011482
*
* Ticket lock
* http://en.wikipedia.org/wiki/Ticket_lock
* http://en.wikipedia.org/wiki/Fetch-and-add
* http://lwn.net/Articles/267968/ (including comments)
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
*/