godot/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp


#include "LinearMath/btMinMax.h"
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btThreads.h"
#include "LinearMath/btQuickprof.h"
#include <stdio.h>
#include <algorithm>


#if BT_THREADSAFE

#include "btThreadSupportInterface.h"

#if defined( _WIN32 )

#define WIN32_LEAN_AND_MEAN

#include <windows.h>

#endif


typedef unsigned long long btU64;
static const int kCacheLineSize = 64;

void btSpinPause()
{
#if defined( _WIN32 )
    YieldProcessor();
#endif
}


struct WorkerThreadStatus
{
    enum Type
    {
        kInvalid,
        kWaitingForWork,
        kWorking,
        kSleeping,
    };
};


ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives
{
    static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
    // directives for all worker threads packed into a single cacheline
    char m_threadDirs[kMaxThreadCount];

public:
    enum Type
    {
        kInvalid,
        kGoToSleep,         // go to sleep
        kStayAwakeButIdle,  // wait for not checking job queue
        kScanForJobs,       // actively scan job queue for jobs
    };
    WorkerThreadDirectives()
    {
        for ( int i = 0; i < kMaxThreadCount; ++i )
        {
            m_threadDirs[ i ] = 0;
        }
    }

    Type getDirective(int threadId)
    {
        btAssert(threadId < kMaxThreadCount);
        return static_cast<Type>(m_threadDirs[threadId]);
    }

    void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
    {
        btAssert( threadBegin < threadEnd );
        btAssert( threadEnd <= kMaxThreadCount );
        char dirChar = static_cast<char>(dir);
        for ( int i = threadBegin; i < threadEnd; ++i )
        {
            m_threadDirs[ i ] = dirChar;
        }
    }
};

class JobQueue;

ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage
{
    int m_threadId;
    WorkerThreadStatus::Type m_status;
    int m_numJobsFinished;
    btSpinMutex m_mutex;
    btScalar m_sumResult;
    WorkerThreadDirectives * m_directive;
    JobQueue* m_queue;
    btClock* m_clock;
    unsigned int m_cooldownTime;
};


struct IJob
{
    virtual void executeJob(int threadId) = 0;
};

class ParallelForJob : public IJob
{
    const btIParallelForBody* m_body;
    int m_begin;
    int m_end;

public:
    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
    {
        m_body = &body;
        m_begin = iBegin;
        m_end = iEnd;
    }
    virtual void executeJob(int threadId) BT_OVERRIDE
    {
        BT_PROFILE( "executeJob" );

        // call the functor body to do the work
        m_body->forLoop( m_begin, m_end );
    }
};


class ParallelSumJob : public IJob
{
    const btIParallelSumBody* m_body;
    ThreadLocalStorage* m_threadLocalStoreArray;
    int m_begin;
    int m_end;

public:
    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls )
    {
        m_body = &body;
        m_threadLocalStoreArray = tls;
        m_begin = iBegin;
        m_end = iEnd;
    }
    virtual void executeJob( int threadId ) BT_OVERRIDE
    {
        BT_PROFILE( "executeJob" );

        // call the functor body to do the work
        btScalar val = m_body->sumLoop( m_begin, m_end );
#if BT_PARALLEL_SUM_DETERMINISTISM
        // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
        const float TRUNC_SCALE = float(1<<19);
        val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE;  // truncate some bits
#endif
        m_threadLocalStoreArray[threadId].m_sumResult += val;
    }
};


ATTRIBUTE_ALIGNED64(class) JobQueue
{
    btThreadSupportInterface* m_threadSupport;
    btCriticalSection* m_queueLock;
    btSpinMutex m_mutex;

    btAlignedObjectArray<IJob*> m_jobQueue;
    char* m_jobMem;
    int m_jobMemSize;
    bool m_queueIsEmpty;
    int m_tailIndex;
    int m_headIndex;
    int m_allocSize;
    bool m_useSpinMutex;
    btAlignedObjectArray<JobQueue*> m_neighborContexts;
    char m_cachePadding[kCacheLineSize];  // prevent false sharing

    void freeJobMem()
    {
        if ( m_jobMem )
        {
            // free old
            btAlignedFree(m_jobMem);
            m_jobMem = NULL;
        }
    }
    void resizeJobMem(int newSize)
    {
        if (newSize > m_jobMemSize)
        {
            freeJobMem();
            m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
            m_jobMemSize = newSize;
        }
    }

public:

    JobQueue()
    {
        m_jobMem = NULL;
        m_jobMemSize = 0;
        m_threadSupport = NULL;
        m_queueLock = NULL;
        m_headIndex = 0;
        m_tailIndex = 0;
        m_useSpinMutex = false;
    }
    ~JobQueue()
    {
		exit();
    }
	void exit()
    {
		freeJobMem();
        if (m_queueLock && m_threadSupport)
        {
            m_threadSupport->deleteCriticalSection(m_queueLock);
            m_queueLock = NULL;
			m_threadSupport = 0;
        }
	}

    void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
    {
        m_threadSupport = threadSup;
        if (threadSup)
        {
            m_queueLock = m_threadSupport->createCriticalSection();
        }
        setupJobStealing(contextArray, contextArray->size());
    }
    void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)
    {
        btAlignedObjectArray<JobQueue>& contexts = *contextArray;
        int selfIndex = 0;
        for (int i = 0; i < contexts.size(); ++i)
        {
            if ( this == &contexts[ i ] )
            {
                selfIndex = i;
                break;
            }
        }
        int numNeighbors = btMin(2, contexts.size() - 1);
        int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3};
        int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]);
        m_neighborContexts.reserve( numNeighbors );
        m_neighborContexts.resizeNoInitialize(0);
        for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
        {
            int neighborIndex = selfIndex + neighborOffsets[i];
            if ( neighborIndex >= 0 && neighborIndex < numActiveContexts)
            {
                m_neighborContexts.push_back( &contexts[ neighborIndex ] );
            }
        }
    }

    bool isQueueEmpty() const {return m_queueIsEmpty;}
    void lockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.lock();
        }
        else
        {
            m_queueLock->lock();
        }
    }
    void unlockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.unlock();
        }
        else
        {
            m_queueLock->unlock();
        }
    }
    void clearQueue(int jobCount, int jobSize)
    {
        lockQueue();
        m_headIndex = 0;
        m_tailIndex = 0;
        m_allocSize = 0;
        m_queueIsEmpty = true;
        int jobBufSize = jobSize * jobCount;
        // make sure we have enough memory allocated to store jobs
        if ( jobBufSize > m_jobMemSize )
        {
            resizeJobMem( jobBufSize );
        }
        // make sure job queue is big enough
        if ( jobCount > m_jobQueue.capacity() )
        {
            m_jobQueue.reserve( jobCount );
        }
        unlockQueue();
        m_jobQueue.resizeNoInitialize( 0 );
    }
    void* allocJobMem(int jobSize)
    {
        btAssert(m_jobMemSize >= (m_allocSize + jobSize));
        void* jobMem = &m_jobMem[m_allocSize];
        m_allocSize += jobSize;
        return jobMem;
    }
    void submitJob( IJob* job )
    {
        btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
        m_jobQueue.push_back( job );
        lockQueue();
        m_tailIndex++;
        m_queueIsEmpty = false;
        unlockQueue();
    }
    IJob* consumeJobFromOwnQueue()
    {
        if ( m_queueIsEmpty )
        {
            // lock free path. even if this is taken erroneously it isn't harmful
            return NULL;
        }
        IJob* job = NULL;
        lockQueue();
        if ( !m_queueIsEmpty )
        {
            job = m_jobQueue[ m_headIndex++ ];
            btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
            if ( m_headIndex == m_tailIndex )
            {
                m_queueIsEmpty = true;
            }
        }
        unlockQueue();
        return job;
    }
    IJob* consumeJob()
    {
        if (IJob* job = consumeJobFromOwnQueue())
        {
            return job;
        }
        // own queue is empty, try to steal from neighbor
        for (int i = 0; i < m_neighborContexts.size(); ++i)
        {
            JobQueue* otherContext = m_neighborContexts[ i ];
            if ( IJob* job = otherContext->consumeJobFromOwnQueue() )
            {
                return job;
            }
        }
        return NULL;
    }
};


static void WorkerThreadFunc( void* userPtr )
{
    BT_PROFILE( "WorkerThreadFunc" );
    ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr;
    JobQueue* jobQueue = localStorage->m_queue;

    bool shouldSleep = false;
    int threadId = localStorage->m_threadId;
    while (! shouldSleep)
    {
        // do work
        localStorage->m_mutex.lock();
        while ( IJob* job = jobQueue->consumeJob() )
        {
            localStorage->m_status = WorkerThreadStatus::kWorking;
            job->executeJob( threadId );
            localStorage->m_numJobsFinished++;
        }
        localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
        localStorage->m_mutex.unlock();
        btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
        // while queue is empty,
        while (jobQueue->isQueueEmpty())
        {
            // todo: spin wait a bit to avoid hammering the empty queue
            btSpinPause();
            if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep )
            {
                shouldSleep = true;
                break;
            }
            // if jobs are incoming,
            if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs )
            {
                clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock
            }
            else
            {
                for ( int i = 0; i < 50; ++i )
                {
                    btSpinPause();
                    btSpinPause();
                    btSpinPause();
                    btSpinPause();
                    if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
                    {
                        break;
                    }
                }
                // if no jobs incoming and queue has been empty for the cooldown time, sleep
                btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
                if (timeElapsed > localStorage->m_cooldownTime)
                {
                    shouldSleep = true;
                    break;
                }
            }
        }
    }
	{
		BT_PROFILE("sleep");
		// go sleep
		localStorage->m_mutex.lock();
		localStorage->m_status = WorkerThreadStatus::kSleeping;
		localStorage->m_mutex.unlock();
	}
}


class btTaskSchedulerDefault : public btITaskScheduler
{
    btThreadSupportInterface* m_threadSupport;
    WorkerThreadDirectives* m_workerDirective;
    btAlignedObjectArray<JobQueue> m_jobQueues;
    btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
    btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
    btClock m_clock;
    int m_numThreads;
    int m_numWorkerThreads;
    int m_numActiveJobQueues;
    int m_maxNumThreads;
    int m_numJobs;
    static const int kFirstWorkerThreadId = 1;
public:

    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
    {
        m_threadSupport = NULL;
        m_workerDirective = NULL;
    }

    virtual ~btTaskSchedulerDefault()
    {
        waitForWorkersToSleep();

		for ( int i = 0; i < m_jobQueues.size(); ++i )
        {
            m_jobQueues[i].exit();
        }

        if (m_threadSupport)
        {
            delete m_threadSupport;
            m_threadSupport = NULL;
        }
        if (m_workerDirective)
        {
            btAlignedFree(m_workerDirective);
            m_workerDirective = NULL;
        }
    }

    void init()
    {
        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc );
        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
        m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));

        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
        m_numThreads = m_maxNumThreads;
        // ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
        int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
        int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads-1) : (m_maxNumThreads / numThreadsPerQueue);
        m_jobQueues.resize(numJobQueues);
        m_numActiveJobQueues = numJobQueues;
        for ( int i = 0; i < m_jobQueues.size(); ++i )
        {
            m_jobQueues[i].init( m_threadSupport, &m_jobQueues );
        }
        m_perThreadJobQueues.resize(m_numThreads);
        for ( int i = 0; i < m_numThreads; i++ )
        {
            JobQueue* jq = NULL;
            // only worker threads get a job queue
            if (i > 0)
            {
                if (numThreadsPerQueue == 1)
                {
                    // one queue per worker thread
                    jq = &m_jobQueues[ i - kFirstWorkerThreadId ];
                }
                else
                {
                    // 2 threads share each queue
                    jq = &m_jobQueues[ i / numThreadsPerQueue ];
                }
            }
            m_perThreadJobQueues[i] = jq;
        }
        m_threadLocalStorage.resize(m_numThreads);
        for ( int i = 0; i < m_numThreads; i++ )
        {
            ThreadLocalStorage& storage = m_threadLocalStorage[i];
            storage.m_threadId = i;
            storage.m_directive = m_workerDirective;
            storage.m_status = WorkerThreadStatus::kSleeping;
            storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do
            storage.m_clock = &m_clock;
            storage.m_queue = m_perThreadJobQueues[i];
        }
        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet
        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
    }

    void setWorkerDirectives(WorkerThreadDirectives::Type dir)
    {
        m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
    }

    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return m_maxNumThreads;
    }

    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }

    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
    {
        m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );
        m_numWorkerThreads = m_numThreads - 1;
        m_numActiveJobQueues = 0;
        // if there is at least 1 worker,
        if ( m_numWorkerThreads > 0 )
        {
            // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
            JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ];
            int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
            m_numActiveJobQueues = iLastActiveContext + 1;
            for ( int i = 0; i < m_jobQueues.size(); ++i )
            {
                m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues );
            }
        }
        m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
    }

    void waitJobs()
    {
        BT_PROFILE( "waitJobs" );
        // have the main thread work until the job queues are empty
        int numMainThreadJobsFinished = 0;
        for ( int i = 0; i < m_numActiveJobQueues; ++i )
        {
            while ( IJob* job = m_jobQueues[i].consumeJob() )
            {
                job->executeJob( 0 );
                numMainThreadJobsFinished++;
            }
        }

        // done with jobs for now, tell workers to rest (but not sleep)
        setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle );

        btU64 clockStart = m_clock.getTimeMicroseconds();
        // wait for workers to finish any jobs in progress
        while ( true )
        {
            int numWorkerJobsFinished = 0;
            for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread )
            {
                ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
                storage->m_mutex.lock();
                numWorkerJobsFinished += storage->m_numJobsFinished;
                storage->m_mutex.unlock();
            }
            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
            {
                break;
            }
            btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
            btAssert(timeElapsed < 1000);
            if (timeElapsed > 100000)
            {
                break;
            }
            btSpinPause();
        }
    }

    void wakeWorkers(int numWorkersToWake)
    {
        BT_PROFILE( "wakeWorkers" );
        btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs );
        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
        int numActiveWorkers = 0;
        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
        {
            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
            if (storage.m_status != WorkerThreadStatus::kSleeping)
            {
                numActiveWorkers++;
            }
        }
        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
        {
            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
            if (storage.m_status == WorkerThreadStatus::kSleeping)
            {
                m_threadSupport->runTask( iWorker, &storage );
                numActiveWorkers++;
            }
        }
    }

    void waitForWorkersToSleep()
    {
        BT_PROFILE( "waitForWorkersToSleep" );
        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
        m_threadSupport->waitForAllTasks();
        for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ )
        {
            ThreadLocalStorage& storage = m_threadLocalStorage[i];
            btAssert( storage.m_status == WorkerThreadStatus::kSleeping );
        }
    }

    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
    {
        BT_PROFILE( "sleepWorkerThreadsHint" );
        // hint the task scheduler that we may not be using these threads for a little while
        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
    }

    void prepareWorkerThreads()
    {
        for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i )
        {
            ThreadLocalStorage& storage = m_threadLocalStorage[i];
            storage.m_mutex.lock();
            storage.m_numJobsFinished = 0;
            storage.m_mutex.unlock();
        }
        setWorkerDirectives( WorkerThreadDirectives::kScanForJobs );
    }

    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_ThreadSupport" );
        btAssert( iEnd >= iBegin );
        btAssert( grainSize >= 1 );
        int iterationCount = iEnd - iBegin;
        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
        {
            typedef ParallelForJob JobType;
            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
            m_numJobs = jobCount;
            btAssert( jobCount >= 2 );  // need more than one job for multithreading
            int jobSize = sizeof( JobType );

            for (int i = 0; i < m_numActiveJobQueues; ++i)
            {
                m_jobQueues[i].clearQueue( jobCount, jobSize );
            }
            // prepare worker threads for incoming work
            prepareWorkerThreads();
            // submit all of the jobs
            int iJob = 0;
            int iThread = kFirstWorkerThreadId;  // first worker thread
            for ( int i = iBegin; i < iEnd; i += grainSize )
            {
                btAssert( iJob < jobCount );
                int iE = btMin( i + grainSize, iEnd );
                JobQueue* jq = m_perThreadJobQueues[ iThread ];
                btAssert(jq);
                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
                void* jobMem = jq->allocJobMem(jobSize);
                JobType* job = new ( jobMem ) ParallelForJob( i, iE, body );  // placement new
                jq->submitJob( job );
                iJob++;
                iThread++;
                if ( iThread >= m_numThreads )
                {
                    iThread = kFirstWorkerThreadId;  // first worker thread
                }
            }
            wakeWorkers( jobCount - 1 );

            // put the main thread to work on emptying the job queue and then wait for all workers to finish
            waitJobs();
            m_antiNestingLock.unlock();
        }
        else
        {
            BT_PROFILE( "parallelFor_mainThread" );
            // just run on main thread
            body.forLoop( iBegin, iEnd );
        }
    }
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelSum_ThreadSupport" );
        btAssert( iEnd >= iBegin );
        btAssert( grainSize >= 1 );
        int iterationCount = iEnd - iBegin;
        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
        {
            typedef ParallelSumJob JobType;
            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
            m_numJobs = jobCount;
            btAssert( jobCount >= 2 );  // need more than one job for multithreading
            int jobSize = sizeof( JobType );
            for (int i = 0; i < m_numActiveJobQueues; ++i)
            {
                m_jobQueues[i].clearQueue( jobCount, jobSize );
            }

            // initialize summation
            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
            {
                m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
            }

            // prepare worker threads for incoming work
            prepareWorkerThreads();
            // submit all of the jobs
            int iJob = 0;
            int iThread = kFirstWorkerThreadId;  // first worker thread
            for ( int i = iBegin; i < iEnd; i += grainSize )
            {
                btAssert( iJob < jobCount );
                int iE = btMin( i + grainSize, iEnd );
                JobQueue* jq = m_perThreadJobQueues[ iThread ];
                btAssert(jq);
                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
                void* jobMem = jq->allocJobMem(jobSize);
                JobType* job = new ( jobMem ) ParallelSumJob( i, iE, body, &m_threadLocalStorage[0] );  // placement new
                jq->submitJob( job );
                iJob++;
                iThread++;
                if ( iThread >= m_numThreads )
                {
                    iThread = kFirstWorkerThreadId;  // first worker thread
                }
            }
            wakeWorkers( jobCount - 1 );

            // put the main thread to work on emptying the job queue and then wait for all workers to finish
            waitJobs();

            // add up all the thread sums
            btScalar sum = btScalar(0);
            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
            {
                sum += m_threadLocalStorage[ iThread ].m_sumResult;
            }
            m_antiNestingLock.unlock();
            return sum;
        }
        else
        {
            BT_PROFILE( "parallelSum_mainThread" );
            // just run on main thread
            return body.sumLoop( iBegin, iEnd );
        }
    }
};


btITaskScheduler* btCreateDefaultTaskScheduler()
{
    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
    ts->init();
    return ts;
}

#else // #if BT_THREADSAFE

btITaskScheduler* btCreateDefaultTaskScheduler()
{
    return NULL;
}

#endif // #else // #if BT_THREADSAFE