From c7255388e185e9f6d4363fc6d6c5cce17e944ba1 Mon Sep 17 00:00:00 2001
From: Juan Linietsky <reduzio@gmail.com>
Date: Sat, 23 Jul 2022 19:12:41 +0200
Subject: [PATCH] Remove ThreadWorkPool, replace by WorkerThreadPool

The former needs to be allocated once per usage. The later is shared for all threads, which is more efficient.
It can also be better debugged.
---
 core/object/worker_thread_pool.cpp            | 167 ++++++++++--------
 core/object/worker_thread_pool.h              |  52 ++++++
 core/templates/thread_work_pool.cpp           |  81 ---------
 core/templates/thread_work_pool.h             | 157 ----------------
 doc/classes/WorkerThreadPool.xml              |   6 +
 editor/editor_file_system.cpp                 |   9 +-
 editor/editor_file_system.h                   |   3 -
 modules/navigation/nav_map.cpp                |  13 +-
 modules/navigation/nav_map.h                  |   5 +-
 modules/raycast/raycast_occlusion_cull.cpp    |  43 +++--
 modules/raycast/raycast_occlusion_cull.h      |  11 +-
 modules/text_server_adv/text_server_adv.cpp   |   7 +-
 modules/text_server_adv/text_server_adv.h     |   7 +-
 modules/text_server_fb/text_server_fb.cpp     |   6 +-
 modules/text_server_fb/text_server_fb.h       |   5 +-
 scene/3d/gpu_particles_collision_3d.cpp       |  12 +-
 servers/physics_2d/godot_step_2d.cpp          |   9 +-
 servers/physics_2d/godot_step_2d.h            |   4 +-
 servers/physics_3d/godot_step_3d.cpp          |   9 +-
 servers/physics_3d/godot_step_3d.h            |   4 +-
 .../render_forward_clustered.cpp              |   7 +-
 .../forward_mobile/render_forward_mobile.cpp  |  27 ++-
 .../renderer_rd/renderer_compositor_rd.h      |   1 -
 servers/rendering/renderer_rd/shader_rd.cpp   |   4 +-
 servers/rendering/renderer_scene_cull.cpp     |  16 +-
 .../rendering/renderer_scene_occlusion_cull.h |   3 +-
 servers/rendering/renderer_thread_pool.cpp    |  42 -----
 servers/rendering/renderer_thread_pool.h      |  45 -----
 servers/rendering/renderer_viewport.cpp       |   2 +-
 servers/rendering_server.cpp                  |   2 -
 servers/rendering_server.h                    |   4 +-
 31 files changed, 248 insertions(+), 515 deletions(-)
 delete mode 100644 core/templates/thread_work_pool.cpp
 delete mode 100644 core/templates/thread_work_pool.h
 delete mode 100644 servers/rendering/renderer_thread_pool.cpp
 delete mode 100644 servers/rendering/renderer_thread_pool.h

diff --git a/core/object/worker_thread_pool.cpp b/core/object/worker_thread_pool.cpp
index c276802f99f..54738a673ec 100644
--- a/core/object/worker_thread_pool.cpp
+++ b/core/object/worker_thread_pool.cpp
@@ -32,6 +32,13 @@
 
 #include "core/os/os.h"
 
+void WorkerThreadPool::Task::free_template_userdata() {
+	ERR_FAIL_COND(!template_userdata);
+	ERR_FAIL_COND(native_func_userdata == nullptr);
+	BaseTemplateUserdata *btu = (BaseTemplateUserdata *)native_func_userdata;
+	memdelete(btu);
+}
+
 WorkerThreadPool *WorkerThreadPool::singleton = nullptr;
 
 void WorkerThreadPool::_process_task_queue() {
@@ -48,30 +55,36 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	if (p_task->group) {
 		// Handling a group
 		bool do_post = false;
-		if (p_task->native_group_func) {
-			while (true) {
-				uint32_t work_index = p_task->group->index.postincrement();
-				if (work_index >= p_task->group->max) {
-					do_post = work_index == p_task->group->max; // First one reaching max handles semaphore and clean-up.
-					break;
-				}
-				p_task->native_group_func(p_task->native_func_userdata, work_index);
-			}
+		Callable::CallError ce;
+		Variant ret;
+		Variant arg;
+		Variant *argptr = &arg;
 
-		} else {
-			Callable::CallError ce;
-			Variant ret;
-			Variant arg;
-			Variant *argptr = &arg;
-			while (true) {
-				uint32_t work_index = p_task->group->index.postincrement();
-				if (work_index >= p_task->group->max) {
-					do_post = work_index == p_task->group->max; // First one reaching max handles semaphore and clean-up.
-					break;
-				}
+		while (true) {
+			uint32_t work_index = p_task->group->index.postincrement();
+
+			if (work_index >= p_task->group->max) {
+				break;
+			}
+			if (p_task->native_group_func) {
+				p_task->native_group_func(p_task->native_func_userdata, work_index);
+			} else if (p_task->template_userdata) {
+				p_task->template_userdata->callback_indexed(work_index);
+			} else {
 				arg = work_index;
 				p_task->callable.call((const Variant **)&argptr, 1, ret, ce);
 			}
+
+			// This is the only way to ensure posting is done when all tasks are really complete.
+			uint32_t completed_amount = p_task->group->completed_index.increment();
+
+			if (completed_amount == p_task->group->max) {
+				do_post = true;
+			}
+		}
+
+		if (do_post && p_task->template_userdata) {
+			memdelete(p_task->template_userdata); // This is no longer needed at this point, so get rid of it.
 		}
 
 		if (low_priority && use_native_low_priority_threads) {
@@ -104,6 +117,9 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	} else {
 		if (p_task->native_func) {
 			p_task->native_func(p_task->native_func_userdata);
+		} else if (p_task->template_userdata) {
+			p_task->template_userdata->callback();
+			memdelete(p_task->template_userdata);
 		} else {
 			Callable::CallError ce;
 			Variant ret;
@@ -171,13 +187,19 @@ void WorkerThreadPool::_post_task(Task *p_task, bool p_high_priority) {
 }
 
 WorkerThreadPool::TaskID WorkerThreadPool::add_native_task(void (*p_func)(void *), void *p_userdata, bool p_high_priority, const String &p_description) {
+	return _add_task(Callable(), p_func, p_userdata, nullptr, p_high_priority, p_description);
+}
+
+WorkerThreadPool::TaskID WorkerThreadPool::_add_task(const Callable &p_callable, void (*p_func)(void *), void *p_userdata, BaseTemplateUserdata *p_template_userdata, bool p_high_priority, const String &p_description) {
 	task_mutex.lock();
 	// Get a free task
 	Task *task = task_allocator.alloc();
 	TaskID id = last_task++;
+	task->callable = p_callable;
 	task->native_func = p_func;
 	task->native_func_userdata = p_userdata;
 	task->description = p_description;
+	task->template_userdata = p_template_userdata;
 	tasks.insert(id, task);
 	task_mutex.unlock();
 
@@ -187,18 +209,7 @@ WorkerThreadPool::TaskID WorkerThreadPool::add_native_task(void (*p_func)(void *
 }
 
 WorkerThreadPool::TaskID WorkerThreadPool::add_task(const Callable &p_action, bool p_high_priority, const String &p_description) {
-	task_mutex.lock();
-	// Get a free task
-	Task *task = task_allocator.alloc();
-	TaskID id = last_task++;
-	task->callable = p_action;
-	task->description = p_description;
-	tasks.insert(id, task);
-	task_mutex.unlock();
-
-	_post_task(task, p_high_priority);
-
-	return id;
+	return _add_task(p_action, nullptr, nullptr, nullptr, p_high_priority, p_description);
 }
 
 bool WorkerThreadPool::is_task_completed(TaskID p_task_id) const {
@@ -269,8 +280,8 @@ void WorkerThreadPool::wait_for_task_completion(TaskID p_task_id) {
 	task_mutex.unlock();
 }
 
-WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
-	ERR_FAIL_COND_V(p_elements <= 0, INVALID_TASK_ID);
+WorkerThreadPool::GroupID WorkerThreadPool::_add_group_task(const Callable &p_callable, void (*p_func)(void *, uint32_t), void *p_userdata, BaseTemplateUserdata *p_template_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
+	ERR_FAIL_COND_V(p_elements < 0, INVALID_TASK_ID);
 	if (p_tasks < 0) {
 		p_tasks = threads.size();
 	}
@@ -280,17 +291,34 @@ WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)
 	GroupID id = last_task++;
 	group->max = p_elements;
 	group->self = id;
-	group->tasks_used = p_tasks;
-	Task **tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
-	for (int i = 0; i < p_tasks; i++) {
-		Task *task = task_allocator.alloc();
-		task->native_group_func = p_func;
-		task->native_func_userdata = p_userdata;
-		task->description = p_description;
-		task->group = group;
-		tasks_posted[i] = task;
-		// No task ID is used.
+
+	Task **tasks_posted = nullptr;
+	if (p_elements == 0) {
+		// Should really not call it with zero Elements, but at least it should work.
+		group->completed.set_to(true);
+		group->done_semaphore.post();
+		group->tasks_used = 0;
+		p_tasks = 0;
+		if (p_template_userdata) {
+			memdelete(p_template_userdata);
+		}
+
+	} else {
+		group->tasks_used = p_tasks;
+		tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
+		for (int i = 0; i < p_tasks; i++) {
+			Task *task = task_allocator.alloc();
+			task->native_group_func = p_func;
+			task->native_func_userdata = p_userdata;
+			task->description = p_description;
+			task->group = group;
+			task->callable = p_callable;
+			task->template_userdata = p_template_userdata;
+			tasks_posted[i] = task;
+			// No task ID is used.
+		}
 	}
+
 	groups[id] = group;
 	task_mutex.unlock();
 
@@ -308,43 +336,25 @@ WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)
 	return id;
 }
 
+WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
+	return _add_group_task(Callable(), p_func, p_userdata, nullptr, p_elements, p_tasks, p_high_priority, p_description);
+}
+
 WorkerThreadPool::GroupID WorkerThreadPool::add_group_task(const Callable &p_action, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
-	ERR_FAIL_COND_V(p_elements <= 0, INVALID_TASK_ID);
-	if (p_tasks < 0) {
-		p_tasks = threads.size();
-	}
-
-	task_mutex.lock();
-	Group *group = group_allocator.alloc();
-	GroupID id = last_task++;
-	group->max = p_elements;
-	group->self = id;
-	group->tasks_used = p_tasks;
-	Task **tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
-	for (int i = 0; i < p_tasks; i++) {
-		Task *task = task_allocator.alloc();
-		task->callable = p_action;
-		task->description = p_description;
-		task->group = group;
-		tasks_posted[i] = task;
-		// No task ID is used.
-	}
-	groups[id] = group;
-	task_mutex.unlock();
-
-	if (!p_high_priority && use_native_low_priority_threads) {
-		group->low_priority_native_tasks.resize(p_tasks);
-	}
-
-	for (int i = 0; i < p_tasks; i++) {
-		_post_task(tasks_posted[i], p_high_priority);
-		if (!p_high_priority && use_native_low_priority_threads) {
-			group->low_priority_native_tasks[i] = tasks_posted[i];
-		}
-	}
-	return id;
+	return _add_group_task(p_action, nullptr, nullptr, nullptr, p_elements, p_tasks, p_high_priority, p_description);
 }
 
+uint32_t WorkerThreadPool::get_group_processed_element_count(GroupID p_group) const {
+	task_mutex.lock();
+	const Group *const *groupp = groups.getptr(p_group);
+	if (!groupp) {
+		task_mutex.unlock();
+		ERR_FAIL_V_MSG(0, "Invalid Group ID");
+	}
+	uint32_t elements = (*groupp)->completed_index.get();
+	task_mutex.unlock();
+	return elements;
+}
 bool WorkerThreadPool::is_group_task_completed(GroupID p_group) const {
 	task_mutex.lock();
 	const Group *const *groupp = groups.getptr(p_group);
@@ -451,6 +461,7 @@ void WorkerThreadPool::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("add_group_task", "action", "elements", "tasks_needed", "high_priority", "description"), &WorkerThreadPool::add_group_task, DEFVAL(-1), DEFVAL(false), DEFVAL(String()));
 	ClassDB::bind_method(D_METHOD("is_group_task_completed", "group_id"), &WorkerThreadPool::is_group_task_completed);
+	ClassDB::bind_method(D_METHOD("get_group_processed_element_count", "group_id"), &WorkerThreadPool::get_group_processed_element_count);
 	ClassDB::bind_method(D_METHOD("wait_for_group_task_completion", "group_id"), &WorkerThreadPool::wait_for_group_task_completion);
 }
 
diff --git a/core/object/worker_thread_pool.h b/core/object/worker_thread_pool.h
index dfb00506055..1debd9ca37e 100644
--- a/core/object/worker_thread_pool.h
+++ b/core/object/worker_thread_pool.h
@@ -53,9 +53,16 @@ public:
 private:
 	struct Task;
 
+	struct BaseTemplateUserdata {
+		virtual void callback() {}
+		virtual void callback_indexed(uint32_t p_index) {}
+		virtual ~BaseTemplateUserdata() {}
+	};
+
 	struct Group {
 		GroupID self;
 		SafeNumeric<uint32_t> index;
+		SafeNumeric<uint32_t> completed_index;
 		uint32_t max = 0;
 		Semaphore done_semaphore;
 		SafeFlag completed;
@@ -76,7 +83,10 @@ private:
 		SelfList<Task> task_elem;
 		bool waiting = false; // Waiting for completion
 		bool low_priority = false;
+		BaseTemplateUserdata *template_userdata = nullptr;
 		Thread *low_priority_thread = nullptr;
+
+		void free_template_userdata();
 		Task() :
 				task_elem(this) {}
 	};
@@ -119,18 +129,60 @@ private:
 
 	static WorkerThreadPool *singleton;
 
+	TaskID _add_task(const Callable &p_callable, void (*p_func)(void *), void *p_userdata, BaseTemplateUserdata *p_template_userdata, bool p_high_priority, const String &p_description);
+	GroupID _add_group_task(const Callable &p_callable, void (*p_func)(void *, uint32_t), void *p_userdata, BaseTemplateUserdata *p_template_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description);
+
+	template <class C, class M, class U>
+	struct TaskUserData : public BaseTemplateUserdata {
+		C *instance;
+		M method;
+		U userdata;
+		virtual void callback() override {
+			(instance->*method)(userdata);
+		}
+	};
+
+	template <class C, class M, class U>
+	struct GroupUserData : public BaseTemplateUserdata {
+		C *instance;
+		M method;
+		U userdata;
+		virtual void callback_indexed(uint32_t p_index) override {
+			(instance->*method)(p_index, userdata);
+		}
+	};
+
 protected:
 	static void _bind_methods();
 
 public:
+	template <class C, class M, class U>
+	TaskID add_template_task(C *p_instance, M p_method, U p_userdata, bool p_high_priority = false, const String &p_description = String()) {
+		typedef TaskUserData<C, M, U> TUD;
+		TUD *ud = memnew(TUD);
+		ud->instance = p_instance;
+		ud->method = p_method;
+		ud->userdata = p_userdata;
+		return _add_task(Callable(), nullptr, nullptr, ud, p_high_priority, p_description);
+	}
 	TaskID add_native_task(void (*p_func)(void *), void *p_userdata, bool p_high_priority = false, const String &p_description = String());
 	TaskID add_task(const Callable &p_action, bool p_high_priority = false, const String &p_description = String());
 
 	bool is_task_completed(TaskID p_task_id) const;
 	void wait_for_task_completion(TaskID p_task_id);
 
+	template <class C, class M, class U>
+	GroupID add_template_group_task(C *p_instance, M p_method, U p_userdata, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String()) {
+		typedef GroupUserData<C, M, U> GUD;
+		GUD *ud = memnew(GUD);
+		ud->instance = p_instance;
+		ud->method = p_method;
+		ud->userdata = p_userdata;
+		return _add_group_task(Callable(), nullptr, nullptr, ud, p_elements, p_tasks, p_high_priority, p_description);
+	}
 	GroupID add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String());
 	GroupID add_group_task(const Callable &p_action, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String());
+	uint32_t get_group_processed_element_count(GroupID p_group) const;
 	bool is_group_task_completed(GroupID p_group) const;
 	void wait_for_group_task_completion(GroupID p_group);
 
diff --git a/core/templates/thread_work_pool.cpp b/core/templates/thread_work_pool.cpp
deleted file mode 100644
index a75fd06b9bf..00000000000
--- a/core/templates/thread_work_pool.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*************************************************************************/
-/*  thread_work_pool.cpp                                                 */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#include "thread_work_pool.h"
-
-#include "core/os/os.h"
-
-void ThreadWorkPool::_thread_function(void *p_user) {
-	ThreadData *thread = static_cast<ThreadData *>(p_user);
-	while (true) {
-		thread->start.wait();
-		if (thread->exit.load()) {
-			break;
-		}
-		thread->work->work();
-		thread->completed.post();
-	}
-}
-
-void ThreadWorkPool::init(int p_thread_count) {
-	ERR_FAIL_COND(threads != nullptr);
-	if (p_thread_count < 0) {
-		p_thread_count = OS::get_singleton()->get_default_thread_pool_size();
-	}
-
-	thread_count = p_thread_count;
-	threads = memnew_arr(ThreadData, thread_count);
-
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].exit.store(false);
-		threads[i].thread.start(&ThreadWorkPool::_thread_function, &threads[i]);
-	}
-}
-
-void ThreadWorkPool::finish() {
-	if (threads == nullptr) {
-		return;
-	}
-
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].exit.store(true);
-		threads[i].start.post();
-	}
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].thread.wait_to_finish();
-	}
-
-	memdelete_arr(threads);
-	threads = nullptr;
-}
-
-ThreadWorkPool::~ThreadWorkPool() {
-	finish();
-}
diff --git a/core/templates/thread_work_pool.h b/core/templates/thread_work_pool.h
deleted file mode 100644
index cdb43d6d891..00000000000
--- a/core/templates/thread_work_pool.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*************************************************************************/
-/*  thread_work_pool.h                                                   */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#ifndef THREAD_WORK_POOL_H
-#define THREAD_WORK_POOL_H
-
-#include "core/os/memory.h"
-#include "core/os/semaphore.h"
-#include "core/os/thread.h"
-
-#include <atomic>
-
-class ThreadWorkPool {
-	std::atomic<uint32_t> index;
-
-	struct BaseWork {
-		std::atomic<uint32_t> *index = nullptr;
-		uint32_t max_elements = 0;
-		virtual void work() = 0;
-		virtual ~BaseWork() = default;
-	};
-
-	template <class C, class M, class U>
-	struct Work : public BaseWork {
-		C *instance;
-		M method;
-		U userdata;
-		virtual void work() override {
-			while (true) {
-				uint32_t work_index = index->fetch_add(1, std::memory_order_relaxed);
-				if (work_index >= max_elements) {
-					break;
-				}
-				(instance->*method)(work_index, userdata);
-			}
-		}
-	};
-
-	struct ThreadData {
-		Thread thread;
-		Semaphore start;
-		Semaphore completed;
-		std::atomic<bool> exit;
-		BaseWork *work = nullptr;
-	};
-
-	ThreadData *threads = nullptr;
-	uint32_t thread_count = 0;
-	uint32_t threads_working = 0;
-	BaseWork *current_work = nullptr;
-
-	static void _thread_function(void *p_user);
-
-public:
-	template <class C, class M, class U>
-	void begin_work(uint32_t p_elements, C *p_instance, M p_method, U p_userdata) {
-		ERR_FAIL_COND(!threads); //never initialized
-		ERR_FAIL_COND(current_work != nullptr);
-
-		index.store(0, std::memory_order_release);
-
-		Work<C, M, U> *w = memnew((Work<C, M, U>));
-		w->instance = p_instance;
-		w->userdata = p_userdata;
-		w->method = p_method;
-		w->index = &index;
-		w->max_elements = p_elements;
-
-		current_work = w;
-
-		threads_working = MIN(p_elements, thread_count);
-
-		for (uint32_t i = 0; i < threads_working; i++) {
-			threads[i].work = w;
-			threads[i].start.post();
-		}
-	}
-
-	bool is_working() const {
-		return current_work != nullptr;
-	}
-
-	bool is_done_dispatching() const {
-		ERR_FAIL_COND_V(current_work == nullptr, true);
-		return index.load(std::memory_order_acquire) >= current_work->max_elements;
-	}
-
-	uint32_t get_work_index() const {
-		ERR_FAIL_COND_V(current_work == nullptr, 0);
-		uint32_t idx = index.load(std::memory_order_acquire);
-		return MIN(idx, current_work->max_elements);
-	}
-
-	void end_work() {
-		ERR_FAIL_COND(current_work == nullptr);
-		for (uint32_t i = 0; i < threads_working; i++) {
-			threads[i].completed.wait();
-			threads[i].work = nullptr;
-		}
-
-		threads_working = 0;
-		memdelete(current_work);
-		current_work = nullptr;
-	}
-
-	template <class C, class M, class U>
-	void do_work(uint32_t p_elements, C *p_instance, M p_method, U p_userdata) {
-		switch (p_elements) {
-			case 0:
-				// Nothing to do, so do nothing.
-				break;
-			case 1:
-				// No value in pushing the work to another thread if it's a single job
-				// and we're going to wait for it to finish. Just run it right here.
-				(p_instance->*p_method)(0, p_userdata);
-				break;
-			default:
-				// Multiple jobs to do; commence threaded business.
-				begin_work(p_elements, p_instance, p_method, p_userdata);
-				end_work();
-		}
-	}
-
-	_FORCE_INLINE_ int get_thread_count() const { return thread_count; }
-	void init(int p_thread_count = -1);
-	void finish();
-	~ThreadWorkPool();
-};
-
-#endif // THREAD_WORK_POOL_H
diff --git a/doc/classes/WorkerThreadPool.xml b/doc/classes/WorkerThreadPool.xml
index 22eabf90122..4f614bdadb7 100644
--- a/doc/classes/WorkerThreadPool.xml
+++ b/doc/classes/WorkerThreadPool.xml
@@ -25,6 +25,12 @@
 			<description>
 			</description>
 		</method>
+		<method name="get_group_processed_element_count" qualifiers="const">
+			<return type="int" />
+			<argument index="0" name="group_id" type="int" />
+			<description>
+			</description>
+		</method>
 		<method name="is_group_task_completed" qualifiers="const">
 			<return type="bool" />
 			<argument index="0" name="group_id" type="int" />
diff --git a/editor/editor_file_system.cpp b/editor/editor_file_system.cpp
index 2f106739a4a..56046a07d7c 100644
--- a/editor/editor_file_system.cpp
+++ b/editor/editor_file_system.cpp
@@ -36,6 +36,7 @@
 #include "core/io/resource_importer.h"
 #include "core/io/resource_loader.h"
 #include "core/io/resource_saver.h"
+#include "core/object/worker_thread_pool.h"
 #include "core/os/os.h"
 #include "core/variant/variant_parser.h"
 #include "editor/editor_node.h"
@@ -2137,7 +2138,7 @@ void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 					data.reimport_from = from;
 					data.reimport_files = reimport_files.ptr();
 
-					import_threads.begin_work(i - from + 1, this, &EditorFileSystem::_reimport_thread, &data);
+					WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &EditorFileSystem::_reimport_thread, &data, i - from + 1, -1, false, vformat(TTR("Import resources of type: %s"), reimport_files[from].importer));
 					int current_index = from - 1;
 					do {
 						if (current_index < data.max_index) {
@@ -2145,9 +2146,9 @@ void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 							pr.step(reimport_files[current_index].path.get_file(), current_index);
 						}
 						OS::get_singleton()->delay_usec(1);
-					} while (!import_threads.is_done_dispatching());
+					} while (!WorkerThreadPool::get_singleton()->is_group_task_completed(group_task));
 
-					import_threads.end_work();
+					WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 					importer->import_threaded_end();
 				}
@@ -2430,12 +2431,10 @@ EditorFileSystem::EditorFileSystem() {
 
 	scan_total = 0;
 	update_script_classes_queued.clear();
-	import_threads.init();
 	ResourceUID::get_singleton()->clear(); //will be updated on scan
 	ResourceSaver::set_get_resource_id_for_path(_resource_saver_get_resource_id_for_path);
 }
 
 EditorFileSystem::~EditorFileSystem() {
-	import_threads.finish();
 	ResourceSaver::set_get_resource_id_for_path(nullptr);
 }
diff --git a/editor/editor_file_system.h b/editor/editor_file_system.h
index 07b11320462..f4e69b95e72 100644
--- a/editor/editor_file_system.h
+++ b/editor/editor_file_system.h
@@ -36,7 +36,6 @@
 #include "core/os/thread_safe.h"
 #include "core/templates/hash_set.h"
 #include "core/templates/safe_refcount.h"
-#include "core/templates/thread_work_pool.h"
 #include "scene/main/node.h"
 
 class FileAccess;
@@ -275,8 +274,6 @@ class EditorFileSystem : public Node {
 
 	HashSet<String> group_file_cache;
 
-	ThreadWorkPool import_threads;
-
 	struct ImportThreadData {
 		const ImportFile *reimport_files;
 		int reimport_from;
diff --git a/modules/navigation/nav_map.cpp b/modules/navigation/nav_map.cpp
index 17d6e0a0a1d..46daa542391 100644
--- a/modules/navigation/nav_map.cpp
+++ b/modules/navigation/nav_map.cpp
@@ -30,9 +30,9 @@
 
 #include "nav_map.h"
 
+#include "core/object/worker_thread_pool.h"
 #include "nav_region.h"
 #include "rvo_agent.h"
-
 #include <algorithm>
 
 #define THREE_POINTS_CROSS_PRODUCT(m_a, m_b, m_c) (((m_c) - (m_a)).cross((m_b) - (m_a)))
@@ -683,14 +683,8 @@ void NavMap::compute_single_step(uint32_t index, RvoAgent **agent) {
 void NavMap::step(real_t p_deltatime) {
 	deltatime = p_deltatime;
 	if (controlled_agents.size() > 0) {
-		if (step_work_pool.get_thread_count() == 0) {
-			step_work_pool.init();
-		}
-		step_work_pool.do_work(
-				controlled_agents.size(),
-				this,
-				&NavMap::compute_single_step,
-				controlled_agents.data());
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &NavMap::compute_single_step, controlled_agents.data(), controlled_agents.size(), -1, true, SNAME("NavigationMapAgents"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 	}
 }
 
@@ -736,5 +730,4 @@ NavMap::NavMap() {
 }
 
 NavMap::~NavMap() {
-	step_work_pool.finish();
 }
diff --git a/modules/navigation/nav_map.h b/modules/navigation/nav_map.h
index 2036dbecd70..98a5c24b3ee 100644
--- a/modules/navigation/nav_map.h
+++ b/modules/navigation/nav_map.h
@@ -34,8 +34,8 @@
 #include "nav_rid.h"
 
 #include "core/math/math_defs.h"
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/rb_map.h"
-#include "core/templates/thread_work_pool.h"
 #include "nav_utils.h"
 
 #include <KdTree.h>
@@ -81,9 +81,6 @@ class NavMap : public NavRid {
 	/// Change the id each time the map is updated.
 	uint32_t map_update_id = 0;
 
-	/// Pooled threads for computing steps
-	ThreadWorkPool step_work_pool;
-
 public:
 	NavMap();
 	~NavMap();
diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp
index 55883f9a788..13824c38301 100644
--- a/modules/raycast/raycast_occlusion_cull.cpp
+++ b/modules/raycast/raycast_occlusion_cull.cpp
@@ -30,6 +30,7 @@
 
 #include "raycast_occlusion_cull.h"
 #include "core/config/project_settings.h"
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/local_vector.h"
 
 #ifdef __SSE2__
@@ -78,9 +79,9 @@ void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
 	memset(camera_ray_masks.ptr(), ~0, camera_rays_tile_count * TILE_RAYS * sizeof(uint32_t));
 }
 
-void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
+void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal) {
 	CameraRayThreadData td;
-	td.thread_count = p_thread_work_pool.get_thread_count();
+	td.thread_count = WorkerThreadPool::get_singleton()->get_thread_count();
 
 	td.z_near = p_cam_projection.get_z_near();
 	td.z_far = p_cam_projection.get_z_far() * 1.05f;
@@ -106,7 +107,8 @@ void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D
 
 	debug_tex_range = td.z_far;
 
-	p_thread_work_pool.do_work(td.thread_count, this, &RaycastHZBuffer::_camera_rays_threaded, &td);
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RaycastHZBuffer::_camera_rays_threaded, &td, td.thread_count, -1, true, SNAME("RaycastOcclusionCullUpdateCamera"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 }
 
 void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data) {
@@ -331,10 +333,10 @@ void RaycastOcclusionCull::scenario_remove_instance(RID p_scenario, RID p_instan
 }
 
 void RaycastOcclusionCull::Scenario::_update_dirty_instance_thread(int p_idx, RID *p_instances) {
-	_update_dirty_instance(p_idx, p_instances, nullptr);
+	_update_dirty_instance(p_idx, p_instances);
 }
 
-void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool) {
+void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances) {
 	OccluderInstance *occ_inst = instances.getptr(p_instances[p_idx]);
 
 	if (!occ_inst) {
@@ -355,14 +357,16 @@ void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_in
 	const Vector3 *read_ptr = occ->vertices.ptr();
 	Vector3 *write_ptr = occ_inst->xformed_vertices.ptr();
 
-	if (p_thread_pool && vertices_size > 1024) {
+	if (vertices_size > 1024) {
 		TransformThreadData td;
 		td.xform = occ_inst->xform;
 		td.read = read_ptr;
 		td.write = write_ptr;
 		td.vertex_count = vertices_size;
-		td.thread_count = p_thread_pool->get_thread_count();
-		p_thread_pool->do_work(td.thread_count, this, &Scenario::_transform_vertices_thread, &td);
+		td.thread_count = WorkerThreadPool::get_singleton()->get_thread_count();
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &Scenario::_transform_vertices_thread, &td, td.thread_count, -1, true, SNAME("RaycastOcclusionCull"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 	} else {
 		_transform_vertices_range(read_ptr, write_ptr, occ_inst->xform, 0, vertices_size);
 	}
@@ -392,7 +396,7 @@ void RaycastOcclusionCull::Scenario::_commit_scene(void *p_ud) {
 	scenario->commit_done = true;
 }
 
-bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) {
+bool RaycastOcclusionCull::Scenario::update() {
 	ERR_FAIL_COND_V(singleton == nullptr, false);
 
 	if (commit_thread == nullptr) {
@@ -426,13 +430,15 @@ bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) {
 		instances.erase(removed_instances[i]);
 	}
 
-	if (dirty_instances_array.size() / p_thread_pool.get_thread_count() > 128) {
+	if (dirty_instances_array.size() / WorkerThreadPool::get_singleton()->get_thread_count() > 128) {
 		// Lots of instances, use per-instance threading
-		p_thread_pool.do_work(dirty_instances_array.size(), this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr());
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr(), dirty_instances_array.size(), -1, true, SNAME("RaycastOcclusionCullUpdate"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 	} else {
 		// Few instances, use threading on the vertex transforms
 		for (unsigned int i = 0; i < dirty_instances_array.size(); i++) {
-			_update_dirty_instance(i, dirty_instances_array.ptr(), &p_thread_pool);
+			_update_dirty_instance(i, dirty_instances_array.ptr());
 		}
 	}
 
@@ -484,7 +490,7 @@ void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThrea
 	rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
 }
 
-void RaycastOcclusionCull::Scenario::raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count, ThreadWorkPool &p_thread_pool) const {
+void RaycastOcclusionCull::Scenario::raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count) const {
 	ERR_FAIL_COND(singleton == nullptr);
 	if (raycast_singleton->ebr_device == nullptr) {
 		return; // Embree is initialized on demand when there is some scenario with occluders in it.
@@ -498,7 +504,8 @@ void RaycastOcclusionCull::Scenario::raycast(CameraRayTile *r_rays, const uint32
 	td.rays = r_rays;
 	td.masks = p_valid_masks;
 
-	p_thread_pool.do_work(p_tile_count, this, &Scenario::_raycast, &td);
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &Scenario::_raycast, &td, p_tile_count, -1, true, SNAME("RaycastOcclusionCullRaycast"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 }
 
 ////////////////////////////////////////////////////////
@@ -524,7 +531,7 @@ void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size)
 	buffers[p_buffer].resize(p_size);
 }
 
-void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {
+void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal) {
 	if (!buffers.has(p_buffer)) {
 		return;
 	}
@@ -537,16 +544,16 @@ void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_
 
 	Scenario &scenario = scenarios[buffer.scenario_rid];
 
-	bool removed = scenario.update(p_thread_pool);
+	bool removed = scenario.update();
 
 	if (removed) {
 		scenarios.erase(buffer.scenario_rid);
 		return;
 	}
 
-	buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
+	buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal);
 
-	scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks.ptr(), buffer.camera_rays_tile_count, p_thread_pool);
+	scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks.ptr(), buffer.camera_rays_tile_count);
 	buffer.sort_rays(-p_cam_transform.basis.get_column(2), p_cam_orthogonal);
 	buffer.update_mips();
 }
diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h
index 8c8b4443097..056b8086407 100644
--- a/modules/raycast/raycast_occlusion_cull.h
+++ b/modules/raycast/raycast_occlusion_cull.h
@@ -76,7 +76,7 @@ public:
 		virtual void clear() override;
 		virtual void resize(const Size2i &p_size) override;
 		void sort_rays(const Vector3 &p_camera_dir, bool p_orthogonal);
-		void update_camera_rays(const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
+		void update_camera_rays(const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal);
 
 		~RaycastHZBuffer();
 	};
@@ -143,14 +143,14 @@ private:
 		LocalVector<RID> removed_instances;
 
 		void _update_dirty_instance_thread(int p_idx, RID *p_instances);
-		void _update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool);
+		void _update_dirty_instance(int p_idx, RID *p_instances);
 		void _transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data);
 		void _transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform3D &p_xform, int p_from, int p_to);
 		static void _commit_scene(void *p_ud);
-		bool update(ThreadWorkPool &p_thread_pool);
+		bool update();
 
 		void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const;
-		void raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count, ThreadWorkPool &p_thread_pool) const;
+		void raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count) const;
 	};
 
 	static RaycastOcclusionCull *raycast_singleton;
@@ -183,7 +183,8 @@ public:
 	virtual HZBuffer *buffer_get_ptr(RID p_buffer) override;
 	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) override;
 	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) override;
-	virtual void buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) override;
+	virtual void buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal) override;
+
 	virtual RID buffer_get_debug_texture(RID p_buffer) override;
 
 	virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) override;
diff --git a/modules/text_server_adv/text_server_adv.cpp b/modules/text_server_adv/text_server_adv.cpp
index fe2279df69f..fa234081f0b 100644
--- a/modules/text_server_adv/text_server_adv.cpp
+++ b/modules/text_server_adv/text_server_adv.cpp
@@ -29,6 +29,7 @@
 /*************************************************************************/
 
 #include "text_server_adv.h"
+#include "core/object/worker_thread_pool.h"
 
 #ifdef GDEXTENSION
 // Headers for building as GDExtension plug-in.
@@ -1039,10 +1040,8 @@ _FORCE_INLINE_ TextServerAdvanced::FontGlyph TextServerAdvanced::rasterize_msdf(
 		td.projection = &projection;
 		td.distancePixelConversion = &distancePixelConversion;
 
-		if (p_font_data->work_pool.get_thread_count() == 0) {
-			p_font_data->work_pool.init();
-		}
-		p_font_data->work_pool.do_work(h, this, &TextServerAdvanced::_generateMTSDF_threaded, &td);
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &TextServerAdvanced::_generateMTSDF_threaded, &td, h, -1, true, SNAME("FontServerRasterizeMSDF"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 		msdfgen::msdfErrorCorrection(image, shape, projection, p_pixel_range, config);
 
diff --git a/modules/text_server_adv/text_server_adv.h b/modules/text_server_adv/text_server_adv.h
index a772955d90a..8cd0e753ba8 100644
--- a/modules/text_server_adv/text_server_adv.h
+++ b/modules/text_server_adv/text_server_adv.h
@@ -65,11 +65,12 @@
 #include <godot_cpp/classes/image.hpp>
 #include <godot_cpp/classes/image_texture.hpp>
 #include <godot_cpp/classes/ref.hpp>
+#include <godot_cpp/classes/worker_thread_pool.hpp>
 
 #include <godot_cpp/templates/hash_map.hpp>
 #include <godot_cpp/templates/hash_set.hpp>
 #include <godot_cpp/templates/rid_owner.hpp>
-#include <godot_cpp/templates/thread_work_pool.hpp>
+
 #include <godot_cpp/templates/vector.hpp>
 
 using namespace godot;
@@ -77,9 +78,9 @@ using namespace godot;
 #else
 // Headers for building as built-in module.
 
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/hash_map.h"
 #include "core/templates/rid_owner.h"
-#include "core/templates/thread_work_pool.h"
 #include "scene/resources/texture.h"
 #include "servers/text/text_server_extension.h"
 
@@ -252,10 +253,8 @@ class TextServerAdvanced : public TextServerExtension {
 		const uint8_t *data_ptr;
 		size_t data_size;
 		int face_index = 0;
-		mutable ThreadWorkPool work_pool;
 
 		~FontAdvanced() {
-			work_pool.finish();
 			for (const KeyValue<Vector2i, FontForSizeAdvanced *> &E : cache) {
 				memdelete(E.value);
 			}
diff --git a/modules/text_server_fb/text_server_fb.cpp b/modules/text_server_fb/text_server_fb.cpp
index b845beb1588..50ea4677b1f 100644
--- a/modules/text_server_fb/text_server_fb.cpp
+++ b/modules/text_server_fb/text_server_fb.cpp
@@ -461,10 +461,8 @@ _FORCE_INLINE_ TextServerFallback::FontGlyph TextServerFallback::rasterize_msdf(
 		td.projection = &projection;
 		td.distancePixelConversion = &distancePixelConversion;
 
-		if (p_font_data->work_pool.get_thread_count() == 0) {
-			p_font_data->work_pool.init();
-		}
-		p_font_data->work_pool.do_work(h, this, &TextServerFallback::_generateMTSDF_threaded, &td);
+		WorkerThreadPool::GroupID group_id = WorkerThreadPool::get_singleton()->add_template_group_task(this, &TextServerFallback::_generateMTSDF_threaded, &td, h, -1, true, SNAME("TextServerFBRenderMSDF"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_id);
 
 		msdfgen::msdfErrorCorrection(image, shape, projection, p_pixel_range, config);
 
diff --git a/modules/text_server_fb/text_server_fb.h b/modules/text_server_fb/text_server_fb.h
index b3ec4ffb38d..adb5cbb8176 100644
--- a/modules/text_server_fb/text_server_fb.h
+++ b/modules/text_server_fb/text_server_fb.h
@@ -79,9 +79,9 @@ using namespace godot;
 
 #include "servers/text/text_server_extension.h"
 
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/hash_map.h"
 #include "core/templates/rid_owner.h"
-#include "core/templates/thread_work_pool.h"
 #include "scene/resources/texture.h"
 
 #include "modules/modules_enabled.gen.h" // For freetype, msdfgen.
@@ -208,10 +208,7 @@ class TextServerFallback : public TextServerExtension {
 		size_t data_size;
 		int face_index = 0;
 
-		mutable ThreadWorkPool work_pool;
-
 		~FontFallback() {
-			work_pool.finish();
 			for (const KeyValue<Vector2i, FontForSizeFallback *> &E : cache) {
 				memdelete(E.value);
 			}
diff --git a/scene/3d/gpu_particles_collision_3d.cpp b/scene/3d/gpu_particles_collision_3d.cpp
index da0789ccd55..1fcd5160f6a 100644
--- a/scene/3d/gpu_particles_collision_3d.cpp
+++ b/scene/3d/gpu_particles_collision_3d.cpp
@@ -30,6 +30,7 @@
 
 #include "gpu_particles_collision_3d.h"
 
+#include "core/object/worker_thread_pool.h"
 #include "mesh_instance_3d.h"
 #include "scene/3d/camera_3d.h"
 #include "scene/main/viewport.h"
@@ -339,15 +340,12 @@ void GPUParticlesCollisionSDF3D::_compute_sdf_z(uint32_t p_z, ComputeSDFParams *
 }
 
 void GPUParticlesCollisionSDF3D::_compute_sdf(ComputeSDFParams *params) {
-	ThreadWorkPool work_pool;
-	work_pool.init();
-	work_pool.begin_work(params->size.z, this, &GPUParticlesCollisionSDF3D::_compute_sdf_z, params);
-	while (!work_pool.is_done_dispatching()) {
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &GPUParticlesCollisionSDF3D::_compute_sdf_z, params, params->size.z);
+	while (!WorkerThreadPool::get_singleton()->is_group_task_completed(group_task)) {
 		OS::get_singleton()->delay_usec(10000);
-		bake_step_function(work_pool.get_work_index() * 100 / params->size.z, "Baking SDF");
+		bake_step_function(WorkerThreadPool::get_singleton()->get_group_processed_element_count(group_task) * 100 / params->size.z, "Baking SDF");
 	}
-	work_pool.end_work();
-	work_pool.finish();
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 }
 
 Vector3i GPUParticlesCollisionSDF3D::get_estimated_cell_size() const {
diff --git a/servers/physics_2d/godot_step_2d.cpp b/servers/physics_2d/godot_step_2d.cpp
index 551fd9329f5..0603458acdd 100644
--- a/servers/physics_2d/godot_step_2d.cpp
+++ b/servers/physics_2d/godot_step_2d.cpp
@@ -239,7 +239,8 @@ void GodotStep2D::step(GodotSpace2D *p_space, real_t p_delta) {
 	/* SETUP CONSTRAINTS / PROCESS COLLISIONS */
 
 	uint32_t total_contraint_count = all_constraints.size();
-	work_pool.do_work(total_contraint_count, this, &GodotStep2D::_setup_contraint, nullptr);
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &GodotStep2D::_setup_contraint, nullptr, total_contraint_count, -1, true, SNAME("Physics2DConstraintSetup"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 	{ //profile
 		profile_endtime = OS::get_singleton()->get_ticks_usec();
@@ -258,7 +259,8 @@ void GodotStep2D::step(GodotSpace2D *p_space, real_t p_delta) {
 
 	// Warning: _solve_island modifies the constraint islands for optimization purpose,
 	// their content is not reliable after these calls and shouldn't be used anymore.
-	work_pool.do_work(island_count, this, &GodotStep2D::_solve_island, nullptr);
+	group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &GodotStep2D::_solve_island, nullptr, island_count, -1, true, SNAME("Physics2DConstraintSolveIslands"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 	{ //profile
 		profile_endtime = OS::get_singleton()->get_ticks_usec();
@@ -297,10 +299,7 @@ GodotStep2D::GodotStep2D() {
 	body_islands.reserve(BODY_ISLAND_COUNT_RESERVE);
 	constraint_islands.reserve(ISLAND_COUNT_RESERVE);
 	all_constraints.reserve(CONSTRAINT_COUNT_RESERVE);
-
-	work_pool.init();
 }
 
 GodotStep2D::~GodotStep2D() {
-	work_pool.finish();
 }
diff --git a/servers/physics_2d/godot_step_2d.h b/servers/physics_2d/godot_step_2d.h
index 9a6d8caf9bd..9f8fdd6ce3a 100644
--- a/servers/physics_2d/godot_step_2d.h
+++ b/servers/physics_2d/godot_step_2d.h
@@ -33,8 +33,8 @@
 
 #include "godot_space_2d.h"
 
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/local_vector.h"
-#include "core/templates/thread_work_pool.h"
 
 class GodotStep2D {
 	uint64_t _step = 1;
@@ -42,8 +42,6 @@ class GodotStep2D {
 	int iterations = 0;
 	real_t delta = 0.0;
 
-	ThreadWorkPool work_pool;
-
 	LocalVector<LocalVector<GodotBody2D *>> body_islands;
 	LocalVector<LocalVector<GodotConstraint2D *>> constraint_islands;
 	LocalVector<GodotConstraint2D *> all_constraints;
diff --git a/servers/physics_3d/godot_step_3d.cpp b/servers/physics_3d/godot_step_3d.cpp
index 99656d01a01..f384c829a4a 100644
--- a/servers/physics_3d/godot_step_3d.cpp
+++ b/servers/physics_3d/godot_step_3d.cpp
@@ -343,7 +343,8 @@ void GodotStep3D::step(GodotSpace3D *p_space, real_t p_delta) {
 	/* SETUP CONSTRAINTS / PROCESS COLLISIONS */
 
 	uint32_t total_contraint_count = all_constraints.size();
-	work_pool.do_work(total_contraint_count, this, &GodotStep3D::_setup_contraint, nullptr);
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &GodotStep3D::_setup_contraint, nullptr, total_contraint_count, -1, true, SNAME("Physics3DConstraintSetup"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 	{ //profile
 		profile_endtime = OS::get_singleton()->get_ticks_usec();
@@ -362,7 +363,8 @@ void GodotStep3D::step(GodotSpace3D *p_space, real_t p_delta) {
 
 	// Warning: _solve_island modifies the constraint islands for optimization purpose,
 	// their content is not reliable after these calls and shouldn't be used anymore.
-	work_pool.do_work(island_count, this, &GodotStep3D::_solve_island, nullptr);
+	group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &GodotStep3D::_solve_island, nullptr, island_count, -1, true, SNAME("Physics3DConstraintSolveIslands"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 	{ //profile
 		profile_endtime = OS::get_singleton()->get_ticks_usec();
@@ -409,10 +411,7 @@ GodotStep3D::GodotStep3D() {
 	body_islands.reserve(BODY_ISLAND_COUNT_RESERVE);
 	constraint_islands.reserve(ISLAND_COUNT_RESERVE);
 	all_constraints.reserve(CONSTRAINT_COUNT_RESERVE);
-
-	work_pool.init();
 }
 
 GodotStep3D::~GodotStep3D() {
-	work_pool.finish();
 }
diff --git a/servers/physics_3d/godot_step_3d.h b/servers/physics_3d/godot_step_3d.h
index 6d975b0dd31..189487757f6 100644
--- a/servers/physics_3d/godot_step_3d.h
+++ b/servers/physics_3d/godot_step_3d.h
@@ -33,8 +33,8 @@
 
 #include "godot_space_3d.h"
 
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/local_vector.h"
-#include "core/templates/thread_work_pool.h"
 
 class GodotStep3D {
 	uint64_t _step = 1;
@@ -42,8 +42,6 @@ class GodotStep3D {
 	int iterations = 0;
 	real_t delta = 0.0;
 
-	ThreadWorkPool work_pool;
-
 	LocalVector<LocalVector<GodotBody3D *>> body_islands;
 	LocalVector<LocalVector<GodotConstraint3D *>> constraint_islands;
 	LocalVector<GodotConstraint3D *> all_constraints;
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
index d92f37e21e3..4a55a04cd46 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
@@ -775,7 +775,7 @@ void RenderForwardClustered::_render_list(RenderingDevice::DrawListID p_draw_lis
 
 void RenderForwardClustered::_render_list_thread_function(uint32_t p_thread, RenderListParameters *p_params) {
 	uint32_t render_total = p_params->element_count;
-	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+	uint32_t total_threads = WorkerThreadPool::get_singleton()->get_thread_count();
 	uint32_t render_from = p_thread * render_total / total_threads;
 	uint32_t render_to = (p_thread + 1 == total_threads) ? render_total : ((p_thread + 1) * render_total / total_threads);
 	_render_list(thread_draw_lists[p_thread], p_params->framebuffer_format, p_params, render_from, render_to);
@@ -787,9 +787,10 @@ void RenderForwardClustered::_render_list_with_threads(RenderListParameters *p_p
 
 	if ((uint32_t)p_params->element_count > render_list_thread_threshold && false) { // secondary command buffers need more testing at this time
 		//multi threaded
-		thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+		thread_draw_lists.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 		RD::get_singleton()->draw_list_begin_split(p_framebuffer, thread_draw_lists.size(), thread_draw_lists.ptr(), p_initial_color_action, p_final_color_action, p_initial_depth_action, p_final_depth_action, p_clear_color_values, p_clear_depth, p_clear_stencil, p_region, p_storage_textures);
-		RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardClustered::_render_list_thread_function, p_params);
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RenderForwardClustered::_render_list_thread_function, p_params, thread_draw_lists.size(), -1, true, SNAME("ForwardClusteredRenderList"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 		RD::get_singleton()->draw_list_end(p_params->barrier);
 	} else {
 		//single threaded
diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
index a07955d2d18..0ca71080fca 100644
--- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
+++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
@@ -758,9 +758,12 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color
 			if ((uint32_t)render_list_params.element_count > render_list_thread_threshold && false) {
 				// secondary command buffers need more testing at this time
 				//multi threaded
-				thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+				thread_draw_lists.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 				RD::get_singleton()->draw_list_begin_split(framebuffer, thread_draw_lists.size(), thread_draw_lists.ptr(), keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, can_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, can_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, c, 1.0, 0);
-				RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardMobile::_render_list_thread_function, &render_list_params);
+
+				WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RenderForwardMobile::_render_list_thread_function, &render_list_params, thread_draw_lists.size(), -1, true, SNAME("ForwardMobileRenderList"));
+				WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 			} else {
 				//single threaded
 				RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(framebuffer, keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, can_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, can_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, c, 1.0, 0);
@@ -822,10 +825,12 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color
 			if ((uint32_t)render_list_params.element_count > render_list_thread_threshold && false) {
 				// secondary command buffers need more testing at this time
 				//multi threaded
-				thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+				thread_draw_lists.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 				RD::get_singleton()->draw_list_switch_to_next_pass_split(thread_draw_lists.size(), thread_draw_lists.ptr());
 				render_list_params.subpass = RD::get_singleton()->draw_list_get_current_pass();
-				RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardMobile::_render_list_thread_function, &render_list_params);
+				WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RenderForwardMobile::_render_list_thread_function, &render_list_params, thread_draw_lists.size(), -1, true, SNAME("ForwardMobileRenderSubpass"));
+				WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 			} else {
 				//single threaded
 				RD::DrawListID draw_list = RD::get_singleton()->draw_list_switch_to_next_pass();
@@ -859,9 +864,11 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color
 			if ((uint32_t)render_list_params.element_count > render_list_thread_threshold && false) {
 				// secondary command buffers need more testing at this time
 				//multi threaded
-				thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+				thread_draw_lists.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 				RD::get_singleton()->draw_list_begin_split(framebuffer, thread_draw_lists.size(), thread_draw_lists.ptr(), can_continue_color ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ, can_continue_depth ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ);
-				RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardMobile::_render_list_thread_function, &render_list_params);
+				WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RenderForwardMobile::_render_list_thread_function, &render_list_params, thread_draw_lists.size(), -1, true, SNAME("ForwardMobileRenderSubpass"));
+				WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 				RD::get_singleton()->draw_list_end(RD::BARRIER_MASK_ALL);
 			} else {
 				//single threaded
@@ -1771,7 +1778,7 @@ void RenderForwardMobile::_render_list(RenderingDevice::DrawListID p_draw_list,
 
 void RenderForwardMobile::_render_list_thread_function(uint32_t p_thread, RenderListParameters *p_params) {
 	uint32_t render_total = p_params->element_count;
-	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+	uint32_t total_threads = WorkerThreadPool::get_singleton()->get_thread_count();
 	uint32_t render_from = p_thread * render_total / total_threads;
 	uint32_t render_to = (p_thread + 1 == total_threads) ? render_total : ((p_thread + 1) * render_total / total_threads);
 	_render_list(thread_draw_lists[p_thread], p_params->framebuffer_format, p_params, render_from, render_to);
@@ -1783,9 +1790,11 @@ void RenderForwardMobile::_render_list_with_threads(RenderListParameters *p_para
 
 	if ((uint32_t)p_params->element_count > render_list_thread_threshold && false) { // secondary command buffers need more testing at this time
 		//multi threaded
-		thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+		thread_draw_lists.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 		RD::get_singleton()->draw_list_begin_split(p_framebuffer, thread_draw_lists.size(), thread_draw_lists.ptr(), p_initial_color_action, p_final_color_action, p_initial_depth_action, p_final_depth_action, p_clear_color_values, p_clear_depth, p_clear_stencil, p_region, p_storage_textures);
-		RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardMobile::_render_list_thread_function, p_params);
+		WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RenderForwardMobile::_render_list_thread_function, p_params, thread_draw_lists.size(), -1, true, SNAME("ForwardMobileRenderSubpass"));
+		WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 		RD::get_singleton()->draw_list_end(p_params->barrier);
 	} else {
 		//single threaded
diff --git a/servers/rendering/renderer_rd/renderer_compositor_rd.h b/servers/rendering/renderer_rd/renderer_compositor_rd.h
index 1d3a5585e1b..564c26bfe4f 100644
--- a/servers/rendering/renderer_rd/renderer_compositor_rd.h
+++ b/servers/rendering/renderer_rd/renderer_compositor_rd.h
@@ -32,7 +32,6 @@
 #define RENDERER_COMPOSITOR_RD_H
 
 #include "core/os/os.h"
-#include "core/templates/thread_work_pool.h"
 #include "servers/rendering/renderer_compositor.h"
 #include "servers/rendering/renderer_rd/effects_rd.h"
 #include "servers/rendering/renderer_rd/environment/fog.h"
diff --git a/servers/rendering/renderer_rd/shader_rd.cpp b/servers/rendering/renderer_rd/shader_rd.cpp
index 176465234e8..c9b6d09d4c1 100644
--- a/servers/rendering/renderer_rd/shader_rd.cpp
+++ b/servers/rendering/renderer_rd/shader_rd.cpp
@@ -476,7 +476,9 @@ void ShaderRD::_compile_version(Version *p_version) {
 
 #if 1
 
-	RendererThreadPool::singleton->thread_work_pool.do_work(variant_defines.size(), this, &ShaderRD::_compile_variant, p_version);
+	WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &ShaderRD::_compile_variant, p_version, variant_defines.size(), -1, true, SNAME("ShaderCompilation"));
+	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
+
 #else
 	for (int i = 0; i < variant_defines.size(); i++) {
 		_compile_variant(i, p_version);
diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp
index ce08c27ed53..7631be3669d 100644
--- a/servers/rendering/renderer_scene_cull.cpp
+++ b/servers/rendering/renderer_scene_cull.cpp
@@ -2518,14 +2518,14 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_
 
 	RENDER_TIMESTAMP("Update Occlusion Buffer")
 	// For now just cull on the first camera
-	RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera_data.main_transform, camera_data.main_projection, camera_data.is_orthogonal, RendererThreadPool::singleton->thread_work_pool);
+	RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera_data.main_transform, camera_data.main_projection, camera_data.is_orthogonal);
 
 	_render_scene(&camera_data, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_viewport, p_shadow_atlas, RID(), -1, p_screen_mesh_lod_threshold, true, r_render_info);
 #endif
 }
 
 void RendererSceneCull::_visibility_cull_threaded(uint32_t p_thread, VisibilityCullData *cull_data) {
-	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+	uint32_t total_threads = WorkerThreadPool::get_singleton()->get_thread_count();
 	uint32_t bin_from = p_thread * cull_data->cull_count / total_threads;
 	uint32_t bin_to = (p_thread + 1 == total_threads) ? cull_data->cull_count : ((p_thread + 1) * cull_data->cull_count / total_threads);
 
@@ -2622,7 +2622,7 @@ bool RendererSceneCull::_visibility_parent_check(const CullData &p_cull_data, co
 
 void RendererSceneCull::_scene_cull_threaded(uint32_t p_thread, CullData *cull_data) {
 	uint32_t cull_total = cull_data->scenario->instance_data.size();
-	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+	uint32_t total_threads = WorkerThreadPool::get_singleton()->get_thread_count();
 	uint32_t cull_from = p_thread * cull_total / total_threads;
 	uint32_t cull_to = (p_thread + 1 == total_threads) ? cull_total : ((p_thread + 1) * cull_total / total_threads);
 
@@ -2919,7 +2919,8 @@ void RendererSceneCull::_render_scene(const RendererSceneRender::CameraData *p_c
 			}
 
 			if (visibility_cull_data.cull_count > thread_cull_threshold) {
-				RendererThreadPool::singleton->thread_work_pool.do_work(RendererThreadPool::singleton->thread_work_pool.get_thread_count(), this, &RendererSceneCull::_visibility_cull_threaded, &visibility_cull_data);
+				WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RendererSceneCull::_visibility_cull_threaded, &visibility_cull_data, WorkerThreadPool::get_singleton()->get_thread_count(), -1, true, SNAME("VisibilityCullInstances"));
+				WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 			} else {
 				_visibility_cull(visibility_cull_data, visibility_cull_data.cull_offset, visibility_cull_data.cull_offset + visibility_cull_data.cull_count);
 			}
@@ -3024,7 +3025,8 @@ void RendererSceneCull::_render_scene(const RendererSceneRender::CameraData *p_c
 				scene_cull_result_threads[i].clear();
 			}
 
-			RendererThreadPool::singleton->thread_work_pool.do_work(scene_cull_result_threads.size(), this, &RendererSceneCull::_scene_cull_threaded, &cull_data);
+			WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &RendererSceneCull::_scene_cull_threaded, &cull_data, scene_cull_result_threads.size(), -1, true, SNAME("RenderCullInstances"));
+			WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
 
 			for (uint32_t i = 0; i < scene_cull_result_threads.size(); i++) {
 				scene_cull_result.append_from(scene_cull_result_threads[i]);
@@ -4030,14 +4032,14 @@ RendererSceneCull::RendererSceneCull() {
 	}
 
 	scene_cull_result.init(&rid_cull_page_pool, &geometry_instance_cull_page_pool, &instance_cull_page_pool);
-	scene_cull_result_threads.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+	scene_cull_result_threads.resize(WorkerThreadPool::get_singleton()->get_thread_count());
 	for (uint32_t i = 0; i < scene_cull_result_threads.size(); i++) {
 		scene_cull_result_threads[i].init(&rid_cull_page_pool, &geometry_instance_cull_page_pool, &instance_cull_page_pool);
 	}
 
 	indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame");
 	thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances");
-	thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)RendererThreadPool::singleton->thread_work_pool.get_thread_count()); //make sure there is at least one thread per CPU
+	thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)WorkerThreadPool::get_singleton()->get_thread_count()); //make sure there is at least one thread per CPU
 
 	taa_jitter_array.resize(TAA_JITTER_COUNT);
 	for (int i = 0; i < TAA_JITTER_COUNT; i++) {
diff --git a/servers/rendering/renderer_scene_occlusion_cull.h b/servers/rendering/renderer_scene_occlusion_cull.h
index e25aa11e6bf..0d466e8a326 100644
--- a/servers/rendering/renderer_scene_occlusion_cull.h
+++ b/servers/rendering/renderer_scene_occlusion_cull.h
@@ -183,7 +183,8 @@ public:
 	}
 	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) { _print_warning(); }
 	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) { _print_warning(); }
-	virtual void buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {}
+	virtual void buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal) {}
+
 	virtual RID buffer_get_debug_texture(RID p_buffer) {
 		_print_warning();
 		return RID();
diff --git a/servers/rendering/renderer_thread_pool.cpp b/servers/rendering/renderer_thread_pool.cpp
deleted file mode 100644
index ddf1d1bd000..00000000000
--- a/servers/rendering/renderer_thread_pool.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*************************************************************************/
-/*  renderer_thread_pool.cpp                                             */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#include "renderer_thread_pool.h"
-
-RendererThreadPool *RendererThreadPool::singleton = nullptr;
-
-RendererThreadPool::RendererThreadPool() {
-	singleton = this;
-	thread_work_pool.init();
-}
-
-RendererThreadPool::~RendererThreadPool() {
-	thread_work_pool.finish();
-}
diff --git a/servers/rendering/renderer_thread_pool.h b/servers/rendering/renderer_thread_pool.h
deleted file mode 100644
index 0155c17e05b..00000000000
--- a/servers/rendering/renderer_thread_pool.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*************************************************************************/
-/*  renderer_thread_pool.h                                               */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#ifndef RENDERER_THREAD_POOL_H
-#define RENDERER_THREAD_POOL_H
-
-#include "core/templates/thread_work_pool.h"
-
-class RendererThreadPool {
-public:
-	ThreadWorkPool thread_work_pool;
-
-	static RendererThreadPool *singleton;
-	RendererThreadPool();
-	~RendererThreadPool();
-};
-
-#endif // RENDERER_THREAD_POOL_H
diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp
index 7c9b2567d69..0e26de8bcbb 100644
--- a/servers/rendering/renderer_viewport.cpp
+++ b/servers/rendering/renderer_viewport.cpp
@@ -154,7 +154,7 @@ void RendererViewport::_draw_3d(Viewport *p_viewport) {
 	if (p_viewport->use_occlusion_culling) {
 		if (p_viewport->occlusion_buffer_dirty) {
 			float aspect = p_viewport->size.aspect();
-			int max_size = occlusion_rays_per_thread * RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+			int max_size = occlusion_rays_per_thread * WorkerThreadPool::get_singleton()->get_thread_count();
 
 			int viewport_size = p_viewport->size.width * p_viewport->size.height;
 			max_size = CLAMP(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2)); // At least one depth pixel for every 16x16 region. At most one depth pixel for every 2x2 region.
diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp
index da02bdc66a9..73405dcbed0 100644
--- a/servers/rendering_server.cpp
+++ b/servers/rendering_server.cpp
@@ -2831,7 +2831,6 @@ void RenderingServer::set_render_loop_enabled(bool p_enabled) {
 RenderingServer::RenderingServer() {
 	//ERR_FAIL_COND(singleton);
 
-	thread_pool = memnew(RendererThreadPool);
 	singleton = this;
 }
 
@@ -3032,6 +3031,5 @@ void RenderingServer::init() {
 }
 
 RenderingServer::~RenderingServer() {
-	memdelete(thread_pool);
 	singleton = nullptr;
 }
diff --git a/servers/rendering_server.h b/servers/rendering_server.h
index 39490a03466..dcb2683adde 100644
--- a/servers/rendering_server.h
+++ b/servers/rendering_server.h
@@ -35,11 +35,11 @@
 #include "core/math/geometry_3d.h"
 #include "core/math/transform_2d.h"
 #include "core/object/class_db.h"
+#include "core/object/worker_thread_pool.h"
 #include "core/templates/rid.h"
 #include "core/variant/typed_array.h"
 #include "core/variant/variant.h"
 #include "servers/display_server.h"
-#include "servers/rendering/renderer_thread_pool.h"
 #include "servers/rendering/rendering_device.h"
 
 class RenderingServer : public Object {
@@ -52,8 +52,6 @@ class RenderingServer : public Object {
 
 	Array _get_array_from_surface(uint32_t p_format, Vector<uint8_t> p_vertex_data, Vector<uint8_t> p_attrib_data, Vector<uint8_t> p_skin_data, int p_vertex_len, Vector<uint8_t> p_index_data, int p_index_len) const;
 
-	RendererThreadPool *thread_pool = nullptr;
-
 	const Vector2 SMALL_VEC2 = Vector2(CMP_EPSILON, CMP_EPSILON);
 	const Vector3 SMALL_VEC3 = Vector3(CMP_EPSILON, CMP_EPSILON, CMP_EPSILON);