__group__	ticket	summary	component	milestone	type	created	_description_	_changetime	_reporter
dsteffen@apple.com	35	invalid code generated by GCC 4.5.1 for _dispatch_queue_push_list()			defect	2011-09-08T03:41:32-07:00	"When building libdispatch r197 using GCC 4.5.1 on Solaris (X86_64) we found that sometimes one of the worker threads starts spinning in tight loop in {{{_dispatch_queue_concurrent_drain_one()}}} function.

'''src/queue.c'''
{{{
struct dispatch_object_s *
_dispatch_queue_concurrent_drain_one(dispatch_queue_t dq)
{
    struct dispatch_object_s *head, *next, *const mediator = (void *)~0ul;

    // The mediator value acts both as a ""lock"" and a signal
    head = dispatch_atomic_xchg(&dq->dq_items_head, mediator);

    if (slowpath(head == NULL)) {
        // The first xchg on the tail will tell the enqueueing thread that it
        // is safe to blindly write out to the head pointer. A cmpxchg honors
        // the algorithm.
        dispatch_atomic_cmpxchg(&dq->dq_items_head, mediator, NULL);
        _dispatch_debug(""no work on global work queue"");
        return NULL;
    }

    if (slowpath(head == mediator)) {
        // This thread lost the race for ownership of the queue.
        //
        // The ratio of work to libdispatch overhead must be bad. This
        // scenario implies that there are too many threads in the pool.
        // Create a new pending thread and then exit this thread.
        // The kernel will grant a new thread when the load subsides.
        _dispatch_debug(""Contention on queue: %p"", dq);
        _dispatch_queue_wakeup_global(dq);
#if DISPATCH_PERF_MON
        dispatch_atomic_inc(&_dispatch_bad_ratio);
#endif
        return NULL;
    }

    // Restore the head pointer to a sane value before returning.
    // If 'next' is NULL, then this item _might_ be the last item.
    next = fastpath(head->do_next);

    if (slowpath(!next)) {
        dq->dq_items_head = NULL;

        if (dispatch_atomic_cmpxchg(&dq->dq_items_tail, head, NULL)) {
            // both head and tail are NULL now
            goto out;
        }

        // There must be a next item now. This thread won't wait long.
        while (!(next = head->do_next)) {                // <-------------------------- SBN: spins here forever
            _dispatch_hardware_pause();
        }
    }

    dq->dq_items_head = next;
    _dispatch_queue_wakeup_global(dq);
out:
    return head;
}
}}}

This happens only in optimized build and under high load: 2K-16K events dispatched using {{{dispatch_async_f()}}} function.

As it turned out the problem was in too aggressive reordering performed by GCC optimizer in {{{_dispatch_queue_push_list()}}} function which puts new event into lock-free queue.
This function is inlined in {{{dispatch_async_f()}}}

C source:

'''src/queue_internal.h'''

{{{
__attribute__((always_inline))
static inline void
_dispatch_queue_push_list(dispatch_queue_t dq, dispatch_object_t _head, dispatch_object_t _tail)
{
    struct dispatch_object_s *prev, *head = _head._do, *tail = _tail._do;

    tail->do_next = NULL;                                             // <-------------------------- SBN: (1)
    prev = fastpath(dispatch_atomic_xchg(&dq->dq_items_tail, tail));  // <-------------------------- SBN: (2)
    if (prev) {
        // if we crash here with a value less than 0x1000, then we are at a known bug in client code
        // for example, see _dispatch_queue_dispose or _dispatch_atfork_child
        prev->do_next = head;
    } else {
        _dispatch_queue_push_list_slow(dq, head);
    }
}
}}}

and 

'''src/queue.c'''

{{{
DISPATCH_NOINLINE
void
dispatch_async_f(dispatch_queue_t dq, void *ctxt, dispatch_function_t func)
{
    dispatch_continuation_t dc = fastpath(_dispatch_continuation_alloc_cacheonly());

    // unlike dispatch_sync_f(), we do NOT need to check the queue width,
    // the ""drain"" function will do this test

    if (!dc) {
        return _dispatch_async_f_slow(dq, ctxt, func);
    }

    dc->do_vtable = (void *)DISPATCH_OBJ_ASYNC_BIT;
    dc->dc_func = func;
    dc->dc_ctxt = ctxt;

    _dispatch_queue_push(dq, dc);
}
}}}

----

Disasm of {{{dispatch_async_f()}}} function generated by GCC 4.5.1:

{{{
0000000000007300 <dispatch_async_f>:
    7300:       55                      push   %rbp
    7301:       48 89 e5                mov    %rsp,%rbp
    7304:       48 89 5d e8             mov    %rbx,0xffffffffffffffe8(%rbp)
    7308:       4c 89 65 f0             mov    %r12,0xfffffffffffffff0(%rbp)
    730c:       48 89 fb                mov    %rdi,%rbx
    730f:       4c 89 6d f8             mov    %r13,0xfffffffffffffff8(%rbp)
    7313:       48 83 ec 20             sub    $0x20,%rsp
    7317:       49 89 f4                mov    %rsi,%r12
    731a:       49 89 d5                mov    %rdx,%r13
    731d:       e8 1e f2 ff ff          callq  6540 <_dispatch_continuation_alloc_cacheonly>
    7322:       48 85 c0                test   %rax,%rax
    7325:       74 35                   je     735c <dispatch_async_f+0x5c>
    7327:       48 89 c2                mov    %rax,%rdx
    732a:       48 c7 00 01 00 00 00    movq   $0x1,(%rax)
    7331:       4c 89 68 10             mov    %r13,0x10(%rax)
    7335:       48 87 53 40             xchg   %rdx,0x40(%rbx)                     // <-------------------------- SBN: (2)
    7339:       48 85 d2                test   %rdx,%rdx
    733c:       4c 89 60 18             mov    %r12,0x18(%rax)
    7340:       48 c7 40 08 00 00 00    movq   $0x0,0x8(%rax)                      // <-------------------------- SBN: (1)
    7347:       00 
    7348:       74 2d                   je     7377 <dispatch_async_f+0x77>
    734a:       48 89 42 08             mov    %rax,0x8(%rdx)
    734e:       48 8b 5d e8             mov    0xffffffffffffffe8(%rbp),%rbx
    7352:       4c 8b 65 f0             mov    0xfffffffffffffff0(%rbp),%r12
    7356:       4c 8b 6d f8             mov    0xfffffffffffffff8(%rbp),%r13
    735a:       c9                      leaveq 
    735b:       c3                      retq   
    735c:       4c 89 ea                mov    %r13,%rdx
    735f:       4c 89 e6                mov    %r12,%rsi
    7362:       48 89 df                mov    %rbx,%rdi
    7365:       4c 8b 65 f0             mov    0xfffffffffffffff0(%rbp),%r12
    7369:       48 8b 5d e8             mov    0xffffffffffffffe8(%rbp),%rbx
    736d:       4c 8b 6d f8             mov    0xfffffffffffffff8(%rbp),%r13
    7371:       c9                      leaveq 
    7372:       e9 19 ff ff ff          jmpq   7290 <_dispatch_async_f_slow>
    7377:       48 89 df                mov    %rbx,%rdi
    737a:       4c 8b 65 f0             mov    0xfffffffffffffff0(%rbp),%r12
    737e:       48 8b 5d e8             mov    0xffffffffffffffe8(%rbp),%rbx
    7382:       4c 8b 6d f8             mov    0xfffffffffffffff8(%rbp),%r13
    7386:       c9                      leaveq 
    7387:       48 89 c6                mov    %rax,%rsi
    738a:       e9 51 e2 ff ff          jmpq   55e0 <_dispatch_queue_push_list_slow@plt>
}}}

----

Looks like 2 stores (marked as '''SBN: (1)/(2)''' in listings above) were reordered by optimizer.
(Also note that initialization of {{{dispatch_continuation_t}}} fields ({{{dc_func}}} and {{{dc_ctxt}}}) were also reordered wrt inserting element to queue).

To workaround I added [http://en.wikipedia.org/wiki/Memory_ordering#Compiler_memory_barrier Compiler memory barrier] between initialization of {{{do_next}}} and inserting to queue:

{{{
__attribute__((always_inline))
static inline void
_dispatch_queue_push_list(dispatch_queue_t dq, dispatch_object_t _head, dispatch_object_t _tail)
{
    struct dispatch_object_s *prev, *head = _head._do, *tail = _tail._do;

    tail->do_next = NULL;
    __asm__ __volatile__ ("""" ::: ""memory"");                                       // <-------------------------- SBN: compiler memory barrier
    prev = fastpath(dispatch_atomic_xchg(&dq->dq_items_tail, tail));
    if (prev) {
        // if we crash here with a value less than 0x1000, then we are at a known bug in client code
        // for example, see _dispatch_queue_dispose or _dispatch_atfork_child
        prev->do_next = head;
    } else {
        _dispatch_queue_push_list_slow(dq, head);
    }
}
}}}

After that GCC performs all stores in expected order and I was unable to reproduce the problem anymore."	2011-09-20T00:58:55-07:00	sbn@…
