// build : gcc -O3 -std=gnu11 ParkEventLeak.c -lm -lpthread -o ParkEventLeak // run : ./ParkEventLeak NThreads=2 #define _GNU_SOURCE 1 #define __USE_GNU 1 // PRIXPTR; int64_t is PRId64 uint64_t is PRIu64 // __STDC_FORMAT_MACROS enables PRIXPTR etc from inttypes.h #define __STDC_FORMAT_MACROS 1 #include #include #include #include // #include // #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __sun #include #include #endif #include #include #include #include #include #include #include #include #if defined(__sparc) // The following CAS32 works for both 32-bit V8PlusA and 64-bit V9 execution modes #define CAS32(ptr,cmp,set) \ ({ typeof(set) tt = (set); \ __asm__ __volatile__ ( \ "cas [%2],%3,%0" \ : "=&r" (tt) \ : "0" (tt), "r" (ptr), "r" (cmp) \ : "memory" ); \ tt ; \ }) #if __SIZEOF_POINTER__ == 4 // ILP32 32-bit mode on SPARC V8PlusA #define CASN CAS32 #define CASP CAS32 #endif #endif #define QUOTE_(x) #x // stringize #define QUOTE(x) QUOTE_(x) #define ASSERT(x) ((x) || printf ("ASSERT %s:%d %s\n", __FILE__,__LINE__,#x)) #define CLAIM(x) ((x) || _afail (__FILE__, __LINE__, #x)) static int _afail (char * File, int Line, char * Expression) { printf ("CLAIM failed %s:%d [%s]\n", File, Line, Expression) ; return 0 ; } #define PHAT(x) ((int64_t) (x)) #define ADR(x) ((uintptr_t)(x)) #define PT(x) (__builtin_expect((x),1)) // predict taken (true) #define PN(x) (__builtin_expect((x),0)) // predict not-taken #define DOUBLE(x) ((double)(x)) #define INT(x) ((intptr_t) (x)) #define UNS(x) ((uintptr_t)(x)) static int Adjust (volatile int * a, int dx) { #if defined(__i386) || defined(__x86_64) return __XADDL(a,dx) ; #else int k = *a ; for (;;) { const int vfy = CAS32 (a, k, k+dx) ; if (vfy == k) return k ; k = vfy ; } #endif } static int64_t SelfUserTime() ; static double DMin (double a, double b) { return (a <= b) ? a : b ; } static int IMin (int a, int b) { return (a <= b) ? a : b ; } static int IMax (int a, int b) { return (a >= b) ? a : b ; } static int IClamp (int v, int a, int b) { // clamp v in [a,b] // Claim : a <= b return IMin (IMax (v, a), b) ; } static int NormalizeInt (int v) { // Equiv : !!v // Equiv : v ? 1 : 0 // Equiv : (UNS(v) | UNS(-v)) >> 31 // Alternate forms : phi; cmov; addc trick // Branch-free idiom : convert control dependency to data dependency // Selection -- Transform : // D = A ? B : C // Into : // D = B ^ ((Normalize(A)-1) & (B ^ C)) // // Annulled store -- Transform : // if (A) B = V // into // int dummy ; // tgt = &B ; // dx = (Normalize(A)-1) & (UNS(&dummy)-UNS(tgt)) // *(tgt+dx) = V return ((v|-v) >> 31) & 1 ; } static intptr_t NormalizeIntPtr (intptr_t v) { // Equiv : return !!v // Equiv : return x ? 1 : 0 // Alternate forms : phi; cmov; addc trick // Branch-free idiom return ((v|-v) >> ((sizeof(v) * 8)-1)) & 1 ; } static intptr_t NormalizeIntPtrM1 (intptr_t v) { // Equiv : return x ? -1 : 0 // Alternate forms : phi; cmov; addc trick // Branch-free idiom return (v|-v) >> ((sizeof(v) * 8)-1) ; } // Applies a supplemental hash function to a given hashCode, which // defends against poor quality hash functions. This is critical // because we use power-of-two length hash tables, that otherwise // encounter collisions for hashCodes that do not differ in lower // or upper bits. // See also Exchanger.java and the FNV-1a supplemental hash. static int spreadHash(int h) { // Spread bits to regularize both segment and index locations, // using variant of single-word Wang/Jenkins hash. h += (h << 15) ^ 0xffffcd7d; h ^= (UNS(h) >> 10); h += (h << 3); h ^= (UNS(h) >> 6); h += (h << 2) + (h << 14); return h ^ (UNS(h) >> 16); } // Supplemental hash to further mix the bits // Defends against poor quality hashcodes // Good avalanche/cascade properties - each input but affects each output bit static int MixHash (int h) { // Apply base state of murmurHash // See http://code.google.com/p/smhasher // Google cityhash // Recall that multiplication can be expressed as a sequence of shift-add pairs h ^= UNS(h) >> 16 ; h *= 0x85EBCA6B ; h ^= UNS(h) >> 13 ; h *= 0xC2B2AE35 ; return (UNS(h) >> 16) ^ (h & 0x7FFFFFFF) ; } static int FNV1AHash (int v) { // Based on FNV-1a hash code (http://www.isthe.com/chongo/tech/comp/fnv/) return (v ^ 0x811c9dc5) * 0x01000193; } // Note that multiply can be thought of as a parallel shift-add operation which // results in a good avalanche effect but tends to move bits "upward" so the // low-order input bits are not as "smeared" or convolved in the output. // Also, the highest order input bits simply overflow and are discarded. // To address this issue we could use the following, where M(x) is a // hash based on multiplication: // hash = M(x) ^ M(ReverseBits(x)) static int RandomSeed () { // Depend in part on ASR // who + what + where + when // Consider reading from /dev/random or /dev/urandom // Consider RDRND on x86 const intptr_t tos = (intptr_t) &tos ; return gethrtime() ^ tos ^ getpid() ; } // Simplistic low-quality Marsaglia shift-xor PRNG. // Bijective except for the final masking operation. // Cycle length for non-zero values is 4G-1. // 0 is absorbing and should be avoided -- fixed point. // // Callers will typically mask the return value with 7FFFFFFF. // Beware that the low-order bits are somewhat less random. // To improve the quality of the PRNG discard the LSBits. // Note that the long cycle _effectively yields selection without replacement. // // Ticket is infrequently accessed. // We intentionally use a non-atomic racy increment. // The race is rare and benign and the operations on Ticket are closed. // We could seed/reseed with either gethrtime() or perhaps // volatile const int tos = (int) &tos; // Note that the use of a static volatile makes it difficult if not // impossible for a compiler to elide the PRNG steps. // Useful for spin loops as the compiler can't optimize it away. // // The larger 4-state Marsaglia shift-XOR PRNGs or the // Ziff 4-tap additive PRNGs are good choices if higher quality is needed. // // Bernoulli trials until 1st success yields a geometric distribution. static int MarsagliaNext (int v) { if (v == 0) { static volatile int Ticket = 0x1BADD1CE ; // Apply mixhash so trajectories of different threads have better separation v = MixHash(++Ticket) ; } v ^= v << 6; v ^= ((unsigned)v) >> 21; v ^= v << 7 ; return v ; } static intptr_t Determinism = 0 ; // Deterministic seeding of PRNGs static int NextRandomAlt () { static __thread int32_t rv = 0 ; // rv state if (rv == 0) { // reseed the PRNG // Consider adding RandomSeed() static volatile int32_t Ticket = 0xD1CEABCD ; const intptr_t tos = ((intptr_t) &tos) & Determinism ; // Apply mixhash so trajectories of different threads have better separation rv = MixHash(++Ticket + tos) ; } return spreadHash (rv++) & 0x7FFFFFFF ; } static int NextRandom () { static __thread int32_t RvPRNG = 0 ; // rv state int rv = RvPRNG ; if (rv == 0) { // reseed the PRNG // Ticket is accessed infrequently and does not constitute a coherence hot-spot. // Note that we use a non-atomic racy increment -- the race is rare and benign. // If the race is a concern switch from ++Ticket to AITicket.getAndIncrement(). // In addition accesses to the RW volatile global "Ticket" variable are not // (readily) predictable at compile-time so the JIT/gcc will not be able to elide // NextRandom() invocations. static volatile int32_t Ticket = 0xD1CEABCD ; const intptr_t tos = ((intptr_t) &tos) & Determinism ; // Apply mixhash so trajectories of different threads have better separation rv = MixHash(++Ticket) ; } rv ^= rv << 6 ; rv ^= ((uint32_t)(rv)) >> 21 ; rv ^= rv << 7 ; RvPRNG = rv ; // Beware that the low-order bits of the stream of return values are // correlated and less random than the stream of values. // post-condition the return value as follows : // rv >> 4 // MixHash(rv >> 4)) & 0x7FFFFFFF // MixHash(rv) & 0x7FFFFFFF // spreadHash(rv) & 0x7FFFFFFF return MixHash(rv >> 4) & 0x7FFFFFFF ; } // Ziff 4-tap PRNG // Better quality than the Marsaglia shift-xor, but more stateful and requires // more memory references. typedef enum { A=171, B=207, C=455, M=512-1 } ZiffConstants ; typedef struct { int ix ; int ra [1] ; } RNGState ; static RNGState * NewRNG () { RNGState * n = malloc (sizeof(RNGState) + (sizeof(n->ra[0]) * (M+1))) ; n->ix = 0 ; for (int k = 0 ; k < M+1 ; k++) { n->ra[k] = NextRandom() ; } return n ; } static int Next (RNGState * s) { int ix = ++s->ix ; int v = s->ra[ix & M] = s->ra[(ix-A) & M] ^ s->ra[(ix-B) & M] ^ s->ra[(ix-C) & M] ; return v ; } // Marsalgia MWC256 and CMWC4096 // http://en.wikipedia.org/wiki/Multiply-with-carry // AWC = Add-With-Carry // SWB = Subtract-With-Borrow // MWC = supplemented with Multiply-With-Carry // CMWC = Complimentary-Multiply-With-Carry // // MWC256 : period = 2^8222 // Choosing random values for the static array Q[256] and c // puts you at a random starting place in the base b=2^32-1 // expansion of 1/p, where p is the prime 1540315826*b^256-1. // The expansion of 1/p has cycles of more than 10^2475 base-b // 'digits', and they are returned in reverse order, from a // random starting point in the cycle. // // CMWC4096 : period = 2^131086 -- every larger than MT! // This RNG has period > 2^131086, some 10^33459 times as // long as that of the Mersenne Twister and rates highly in // all categories. It provides the more than 10^39460 base-b digits // in the expansion of (p-1)/p, where p is the prime // p=18782*b^4096+1. , b=2^32-1. Those base-b 'digits' are // returned in reverse order from a random starting point // determined by the random choice of the initial values in Q[4096] and c. typedef enum { MWC256_Length = 256, CMC4096_Length = 4096, } MWCConstants ; typedef struct { int ix ; uint32_t c ; // carry uint32_t Q [256] ; } MWC256State ; typedef struct { int ix ; uint32_t c ; // carry uint32_t Q [4096] ; } CMWC4096State ; static int MWC256Next (MWC256State * m) { int ix = (++m->ix) & (256-1) ; uint64_t t = (809430660ULL * m->Q[ix]) + m->c ; m->c = t >> 32 ; m->Q[ix] = t ; return (int) t ; } static MWC256State * NewMCW256 (MWC256State * m) { if (m == NULL) { m = malloc (sizeof(*m)) ; } memset (m, 0, sizeof(*m)) ; m->c = 362436 ; m->ix = 256-1 ; for (int i = 0 ; i < 256 ; i++) { m->Q[i] = MixHash(i ^ UNS(m)) ; } for (int j = 0 ; j < 1000 ; j++) { MWC256Next(m) ; } return m ; } static int CMWC4096Next (CMWC4096State * m) { int ix = (++m->ix) & (4096-1) ; uint64_t t = (187882ULL * m->Q[ix]) + m->c ; m->c = t >> 32 ; uint32_t x = t + m->c ; // CONSIDER: make the following branch-free -- optimization if (x < m->c) { x++; m->c++; } m->Q[ix] = 0xFFFFFFFEU - x ; return (int) m->Q[ix] ; } static CMWC4096State * NewCMCW4096 (CMWC4096State * m) { if (m == NULL) { m = malloc (sizeof(*m)) ; } memset (m, 0, sizeof(*m)) ; m->c = 362436 ; m->ix = 4096-1 ; for (int i = 0 ; i < 4096 ; i++) { m->Q[i] = MixHash(i ^ UNS(m)) ; } for (int j = 0 ; j < 1000 ; j++) { CMWC4096Next(m) ; } return m ; } static CMWC4096State * NewCMCW4096Alt (CMWC4096State * m, int x) { if (m == NULL) { m = malloc (sizeof(*m)) ; } memset (m, 0, sizeof(*m)) ; m->c = 362436 ; m->ix = 4096-1 ; const uint32_t Phi = 0x9E3779B9 ; m->Q[0] = x ; m->Q[1] = x + Phi ; m->Q[2] = x + Phi + Phi ; for (int i = 3 ; i < 4096 ; i++) { m->Q[i] = m->Q[i-3] ^ m->Q[i-2] ^ Phi ^ i ; } for (int j = 0 ; j < 1000 ; j++) { CMWC4096Next(m) ; } return m ; } // Here is a MWC example with period 3056868392^33216-1, a mere // 10^4005 times as long as that of the Mersenne twister, yet faster, // far simpler, and seemingly at least as well-performing in tests // of randomness: typedef struct { int ix ; uint32_t c ; uint32_t Q[1038] ; } MWC1038State ; static int MWC1038Next (MWC1038State * m) { const uint64_t a = 611373678LL; uint64_t t = a * m->Q[m->ix] + m->c; m->c = t >> 32; if (--m->ix != 0) { m->Q[m->ix] = t ; return t ; } m->ix = 1038-1 ; m->Q[0] = t ; return t ; } static MWC1038State * NewMWC1038 (MWC1038State * m) { if (m == NULL) { m = malloc (sizeof(*m)) ; } memset (m, 0, sizeof(*m)) ; m->c = 362436 ; m->ix = 1038-1; for (int i = 0 ; i < 1038 ; i++) { m->Q[i] = MixHash(i ^ UNS(m)) ; } for (int j = 0 ; j < 1000 ; j++) { MWC1038Next(m) ; } return m ; } static int toNextPowerOfTwo(int v) { v = v - 1; v = v | (v >> 1); v = v | (v >> 2); v = v | (v >> 4); v = v | (v >> 8); v = v | (v >> 16); return v + 1; } // TODO: use LOCK:XADDL on ia32/x64 // Atomic fetch-and-increment // On Niagara we'd like to use a cache-sparing load as the // subsequent CAS will invalidate the line anyway. // See improved more scalable FetchIncrement() in rw/impl-*/*.c static inline int FetchIncrement (volatile int * v) { #if defined(__i386) return __XADDL(v,1) ; #else int k = *v ; for (;;) { const int vfy = CAS32 (v, k, k+1) ; if (vfy == k) return k ; k = vfy ; } #endif } typedef struct { void * (*func)() ; void * arg ; int Color ; volatile int Rendezvous ; // Ready intptr_t ReturnValue ; // Future-like volatile int Status ; } Trampoline ; // Use raw system call _getcpuid() instead of "improved" S11 getcpuid() // implementation which uses schedctl sc_cpu but does not handle nascent // threads where sc_cpu is not properly initialized. extern int _getcpuid () ; static void * Thunk (void * tmp) { static volatile int Color = 0 ; // Implement stack color either via alloca() or recursion // We consume the return value from alloca() as gcc is sufficiently // clever as to elide the alloca() call if the buffer is never accesssed. // The increment of Color is racy but races are benign. void * a = alloca (64 * (Color++) & (128-1)) ; if (a == (void *) 0xDD) printf ("!") ; void * (*func)() = ((Trampoline *)tmp)->func ; void * arg = ((Trampoline *)tmp)->arg ; free (tmp) ; return (*func)(arg) ; } static pthread_t LaunchThreadC (void * (*func)(void *), void * arg) { pthread_attr_t hx [1]; pthread_attr_init (hx); pthread_attr_setscope(hx, PTHREAD_SCOPE_SYSTEM) ; // Beware: DETACHED -> not join()able // Beware: SCHED_RR and EUID=0 yields RT scheduling class pthread_attr_setdetachstate (hx, PTHREAD_CREATE_DETACHED) ; Trampoline * const tmp = (Trampoline *) malloc (sizeof(Trampoline)) ; tmp->Rendezvous = 0 ; tmp->func = func ; tmp->arg = arg ; pthread_t newid [1] ; newid[0] = 0 ; const int rc = pthread_create(newid, hx, Thunk, tmp); if (rc != 0) { free (tmp) ; printf("error creating thread, error: %s\n", strerror(rc)); return 0 ; } return newid[0] ; } static double RaiseTo (double x, double To) { return exp(log(x) * To) ; // TODO: use pow(x, To) } static void ShuffleFYC (int * A, int m) { // Fisher-Yates shuffle : classic descending form // Content-preserving -- simply changes the order of elements in A. // We have conservation of contents // ASSERT (m >= 0) ; while (m != 0) { // j uniformly distributed in [0,m) [0,m-1] const int j = NextRandom() % m ; --m ; const int tmp = A[j] ; A[j] = A[m] ; A[m] = tmp ; } } static int * ShuffleFYB (int * A, int NSlots) { if (A == NULL) { A = (int *) malloc (sizeof(A[0]) * (NSlots+1)) ; } // Create a permutation on [0,NSlots) // We can create a permutation on a general set of N values // by using Source[i] instead of i, below. A[0] = 0 ; // Source[0] for (int i = 1; i < NSlots ; i++) { // j uniformly distributed in [0,i] int j = NextRandom() % (i+1) ; // printf ("i=%d j=%d A[i]=%d A[j]=%d\n", i, j, a[i], a[j]) ; A[i] = A[j] ; A[j] = i ; // Source[i] } // Claim : A[*] is populated with values in [0,NSlots) with no duplicates // That is, each element in A[*] is unique return A ; } static double RNext () { int rv = NextRandom() ; const int Range = 0x1000000 ; return DOUBLE (rv & (Range-1)) / DOUBLE(Range) ; } typedef struct _ParkEvent { volatile intptr_t AssociatedWith ; struct _ParkEvent * volatile Next ; int Padding [100] ; } ParkEvent ; static volatile int Circulating = 0 ; static ParkEvent * volatile FreeList = NULL ; static ParkEvent * Allocate_Fixed () { static volatile int PopLock = 0 ; // Use PopLock to reduce from MPMC to MPSC in order to avoid ABA. // We can't use a normal lock because Allocate() is called at startup // before any of the subsystems used by Mutex:: etc are properly initialized. // By virtue of PopLock there can be at most one thread trying to pop at // any given time. // The list is grow-only while PopLock is held. // The head is mutable but the interior elements and intrusive linkage remain // stable. // Classic "OTY" Oyama-Taura-Yonezawa might be a better choice but is // requires a larger delta. for (;;) { if (CAS32 (&PopLock, 0, 1) == 0) break ; while (PopLock != 0) poll (NULL,0,1) ; // NakedSleep // tight spin loops or spin-yield are not sufficient and will // result in progress failure scenarios on Solaris. // We must explicity deschedule the caller. // Ideally we'd use SWAP instead of CAS } ParkEvent * ev = FreeList ; if (ev != NULL) { for (;;) { ParkEvent * NextAfter = ev->Next ; ParkEvent * vfy = CASP (&FreeList, ev, NextAfter) ; if (vfy == ev) break ; // The only source of concurrent inteference is the arrival // of new elements via Release(). // Just retry ev = vfy ; // ASSERT vfy != NULL } ev->Next = (ParkEvent *) 1 ; // diagnostic hygiene } // Need Release fence here ! // ASSERT PopLock != 0 PopLock = 0 ; // FreeList was empty so we resort to materializing a new ParkEvent if (ev == NULL) { ev = (ParkEvent *) malloc (sizeof(*ev)) ; memset (ev, 0, sizeof(*ev)) ; int n = Adjust (&Circulating, 1) ; printf ("!%d ", n) ; } if (ev->AssociatedWith != 0) { printf ("Error: %s %s:%d\n", __func__, __FILE__, __LINE__) ; } ev->AssociatedWith = 1 ; return ev ; } // Threads can run in phase (entrainment) and the detached free // list sloshes from thread to thread but spends most of its // time privatized and generally inaccessible, so we effectively // have leakage. static ParkEvent * Allocate_Original () { ParkEvent * ev = NULL ; for (;;) { ev = FreeList ; if (ev == NULL) break ; if (CASP (&FreeList, ev, (ParkEvent *) NULL) != ev) continue ; ParkEvent * List = ev->Next ; if (List == NULL) break ; for (;;) { ParkEvent * arv = CASP (&FreeList, NULL, List) ; if (arv == NULL) break ; #if 1 ParkEvent * Tail = List ; while (Tail->Next != NULL) Tail = Tail->Next ; Tail->Next = arv ; #else ParkEvent * Tail = arv ; while (Tail->Next != NULL) Tail = Tail->Next ; Tail->Next = List ; List = arv ; #endif } } if (ev == NULL) { ev = (ParkEvent *) malloc (sizeof(*ev)) ; memset (ev, 0, sizeof(*ev)) ; int n = Adjust (&Circulating, 1) ; printf ("!%d ", n) ; } ev->AssociatedWith = 1 ; return ev ; } static ParkEvent * Allocate_ThrowAwayExperimentlaForm () { ParkEvent * ev = NULL ; for (;;) { ev = FreeList ; if (ev == NULL) break ; if (CASP (&FreeList, ev, (ParkEvent *) NULL) != ev) continue ; ParkEvent * List = ev->Next ; if (List == NULL) break ; ParkEvent * Tail = List ; for (;;) { ParkEvent * arv = CASP (&FreeList, NULL, List) ; if (arv == NULL) break ; while (Tail->Next != NULL) Tail = Tail->Next ; Tail->Next = arv ; Tail = arv ; } } if (ev == NULL) { ev = (ParkEvent *) malloc (sizeof(*ev)) ; memset (ev, 0, sizeof(*ev)) ; int n = Adjust (&Circulating, 1) ; printf ("!%d ", n) ; } return ev ; } static void Release (ParkEvent * e) { if (e->AssociatedWith == 0) { printf ("Error: %s %s:%d\n", __func__, __FILE__, __LINE__) ; } e->AssociatedWith = 0 ; for (;;) { ParkEvent * List = FreeList ; e->Next = List ; if (CASP (&FreeList, List, e) == List) break ; } } static void * TBodyA (void * arg) { const volatile intptr_t tos = (intptr_t) &tos ; // effectively SP const int _id = (int) arg ; // Consider nested behavior // Code below is completely flat // Insert some short randomized delays int Count = 0 ; for (;;) { // Call either Allocate_Fixed or Allocate_Original ParkEvent * e = Allocate_Original() ; Release (e) ; // Simple progress indicator ++Count ; if (Count > 1000 && (Count & (Count-1)) == 0) printf ("T%d+%d ", _id, Count) ; } return NULL ; } // TODO: Handle Kibi vs Kilo // Consider : allow KK instead of M static int ParseInt (char * str) { if (*str == '=') ++str ; char * p = strdup (str) ; char * const m = p ; int mulf = 1 ; // Units multiplier while (isxdigit(*p) || *p == 'x' || *p == 'X') ++p ; if (*p == 'k' || *p == 'K') { mulf = 1024 ; *p = 0 ; } else if (*p == 'm' || *p == 'M') { mulf = 1024*1024 ; *p = 0 ; } const int v = strtol (m, NULL, 0) * mulf ; free (m) ; return v ; } // Return aggregate user CPU time in secs. // Consider: enable MSA on Solaris // Reconsile Solaris and Linux #if !defined(RUSAGE_THREAD) #define RUSAGE_THREAD RUSAGE_LWP #endif static int64_t AggregateUserTime () { struct rusage ru ; getrusage (RUSAGE_SELF, &ru) ; // process-wide, accumulate user-time to date. // TODO: return (secs*1000L) + (usecs/1000L) ; return ((ru.ru_utime.tv_sec * 1000000LL) + ru.ru_utime.tv_usec)/1000LL ; } static int64_t SelfUserTime () { struct rusage ru ; getrusage (RUSAGE_THREAD, &ru) ; // TODO: return (secs*1000L) + (usecs/1000L) ; return ((ru.ru_utime.tv_sec * 1000000LL) + ru.ru_utime.tv_usec)/1000LL ; } #define ASSIGN(VarName) { if (strcmp(Key,#VarName) == 0) { ++Assigned; VarName = Value ; }} #define ASSIGNP(VarName) { if (strcmp(Key,#VarName) == 0) { ++Assigned; VarName = ValueP ; }} #define ASSIGNS(VarName) { if (strcmp(Key,#VarName) == 0) { ++Assigned; VarName = (p+1) ; }} #define ASSIGND(VarName) { if (strcmp(Key,#VarName) == 0) { ++Assigned; VarName = ValueD ; }} static char * Comment = "" ; static int NThreads = 10 ; static int Verbose = 0 ; static int NCPUS = 0 ; int main ( int argc, char * argv []) { setbuf (stdout, NULL) ; setbuf (stderr, NULL) ; char * const ExecutableName = argv[0] ; for (int ix = 0 ; ix < argc; ix++) { printf ("%s ", argv[ix]) ; } printf ("\n") ; char * Comment = "" ; // Consider : use "" or "." or "_" // We simply parse Key=Value pairs directly from the command line. // We might use a simple symbol table that contains the name and address // of the configurable variables, in which case we could either incorporate // the value directly into in the symbol table entry or keep a pointer to // the "external" value. // Tuple : Name,Next,Value,Address,SetterFunc // The downside to a symbol table is that taking the address of a variable // can often inhibit very useful optimizations as the compiler may not // be able to prove that the address does not escape, in which case // it may need to be conservative with respect to potential aliasing and // and concurrent access. We can mitigate these issues by hoisting // the global values into const local values in TBody(). // The current implementation avoids taking the address of the variable. // An X-macro implementation might be be better. --argc ; ++argv ; while (argc > 0) { --argc ; char * p = *(argv++) ; // Parse Key=Value tokens char * Key = p ; while (*p != '=') ++p ; *p = 0 ; int Value = strtol (p+1, NULL, 0) ; // ParseInt vs strtol int ValueP = ParseInt (p+1) ; double ValueD = atof (p+1) ; int Assigned = 0 ; ASSIGNP(Verbose) ; ASSIGNS(Comment) ; ASSIGNP(NThreads) ; if (Assigned == 0) { printf ("Unknown key : %s\n", Key) ; exit (1) ; } if (Verbose) printf ("Assign %s=%d/%s\n", Key, Value, p+1) ; } if (NThreads == 0) { printf ("No threads specified\n") ; exit (1) ; return 1 ; } const int ncpus = sysconf(_SC_NPROCESSORS_ONLN) ; NCPUS = ncpus ; // Launch all worker threads ... // By convention we always have #NThreads threads that are alive. // We reduce and vary concurrency by setting some subset inactive. // Inactive threads remain alive but do not participate. // When we retire a thread we immediately respawn a replacement. for (int t = 0 ; t < NThreads ; t++) { LaunchThreadC (TBodyA, (void *) t) ; } for (;;) { poll (NULL, 0, 1000) ; } if (0) pthread_exit (NULL) ; return 0 ; }