This commit is contained in:
2026-03-04 00:50:15 -08:00
parent 9126175569
commit 4211317c03
569 changed files with 122194 additions and 0 deletions

View File

@ -0,0 +1,60 @@
package leenkx.trait.physics.jolt;
import iron.Scene;
import iron.object.Object;
#if lnx_jolt
/**
 * A helper trait to add physics constraints when exporting via Blender.
 * This trait will be automatically removed once the constraint is added. Note that this trait
 * uses object names instead of object references.
 **/
class PhysicsConstraintExportHelper extends iron.Trait {
	var body1: String; // Name of the first constrained object
	var body2: String; // Name of the second constrained object
	var type: Int; // Constraint type identifier, forwarded to PhysicsConstraint
	var disableCollisions: Bool; // Whether collisions between the two bodies are disabled
	var breakingThreshold: Float; // Breaking threshold, forwarded to PhysicsConstraint
	var limits: Array<Float>; // Optional constraint limits, forwarded to PhysicsConstraint
	var constraintAdded: Bool = false; // True once the PhysicsConstraint trait has been attached
	var relativeConstraint: Bool = false; // Resolve body names among the parent's children instead of the scene

	/**
	 * @param body1 Name of the first object to constrain.
	 * @param body2 Name of the second object to constrain.
	 * @param type Constraint type identifier.
	 * @param disableCollisions Disable collisions between the constrained bodies.
	 * @param breakingThreshold Threshold at which the constraint breaks.
	 * @param relativeConstraint When true, look the bodies up among the children of this
	 *                           object's parent instead of the active scene.
	 * @param limits Optional constraint limits.
	 **/
	// Fixed misspelled parameter name `relatieConstraint` -> `relativeConstraint`
	// (calls are positional in Haxe, so this is backward compatible).
	public function new(body1: String, body2: String, type: Int, disableCollisions: Bool, breakingThreshold: Float, relativeConstraint: Bool = false, limits: Array<Float> = null) {
		super();
		this.body1 = body1;
		this.body2 = body2;
		this.type = type;
		this.disableCollisions = disableCollisions;
		this.breakingThreshold = breakingThreshold;
		this.relativeConstraint = relativeConstraint;
		this.limits = limits;
		notifyOnInit(init);
		notifyOnUpdate(update);
	}

	function init() {
		var target1;
		var target2;
		if (relativeConstraint) {
			// Look the bodies up relative to this object's parent
			target1 = object.parent.getChild(body1);
			target2 = object.parent.getChild(body2);
		}
		else {
			// Look the bodies up in the active scene
			target1 = Scene.active.getChild(body1);
			target2 = Scene.active.getChild(body2);
		}
		// NOTE(review): getChild may return null when a name is not found; PhysicsConstraint
		// then receives a null target — confirm this is handled upstream.
		object.addTrait(new PhysicsConstraint(target1, target2, type, disableCollisions, breakingThreshold, limits));
		constraintAdded = true;
	}

	// Remove this helper trait on the first update after the constraint was added
	function update() {
		if (constraintAdded) this.remove();
	}
}
#end

View File

@ -0,0 +1,242 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/AABBTree/AABBTreeBuilder.h>
JPH_NAMESPACE_BEGIN
uint AABBTreeBuilder::Node::GetMinDepth(const Array<Node> &inNodes) const
{
	// A leaf terminates the recursion at depth 1
	if (!HasChildren())
		return 1;

	// One level for this node plus the shallower of the two subtrees
	uint depth_left = inNodes[mChild[0]].GetMinDepth(inNodes);
	uint depth_right = inNodes[mChild[1]].GetMinDepth(inNodes);
	return 1 + min(depth_left, depth_right);
}
uint AABBTreeBuilder::Node::GetMaxDepth(const Array<Node> &inNodes) const
{
	// A leaf terminates the recursion at depth 1
	if (!HasChildren())
		return 1;

	// One level for this node plus the deeper of the two subtrees
	uint depth_left = inNodes[mChild[0]].GetMaxDepth(inNodes);
	uint depth_right = inNodes[mChild[1]].GetMaxDepth(inNodes);
	return 1 + max(depth_left, depth_right);
}
uint AABBTreeBuilder::Node::GetNodeCount(const Array<Node> &inNodes) const
{
	// A leaf counts as a single node
	if (!HasChildren())
		return 1;

	// This node plus all nodes in both subtrees
	return 1 + inNodes[mChild[0]].GetNodeCount(inNodes) + inNodes[mChild[1]].GetNodeCount(inNodes);
}
uint AABBTreeBuilder::Node::GetLeafNodeCount(const Array<Node> &inNodes) const
{
	// This node itself is a leaf
	if (!HasChildren())
		return 1;

	// Internal nodes contribute nothing; sum the leaves of both subtrees
	return inNodes[mChild[0]].GetLeafNodeCount(inNodes) + inNodes[mChild[1]].GetLeafNodeCount(inNodes);
}
uint AABBTreeBuilder::Node::GetTriangleCountInTree(const Array<Node> &inNodes) const
{
	// Triangles are only stored on leaf nodes
	if (!HasChildren())
		return GetTriangleCount();

	// Sum the triangles held by both subtrees
	return inNodes[mChild[0]].GetTriangleCountInTree(inNodes) + inNodes[mChild[1]].GetTriangleCountInTree(inNodes);
}
// Compute the min / max / average triangle count over all leaf nodes in this subtree
void AABBTreeBuilder::Node::GetTriangleCountPerNode(const Array<Node> &inNodes, float &outAverage, uint &outMin, uint &outMax) const
{
	// Seed the accumulators before the recursive walk
	outMin = INT_MAX;
	outMax = 0;
	outAverage = 0;
	uint avg_divisor = 0; // Number of leaf nodes visited
	GetTriangleCountPerNodeInternal(inNodes, outAverage, avg_divisor, outMin, outMax);
	if (avg_divisor > 0)
		outAverage /= avg_divisor;
}
float AABBTreeBuilder::Node::CalculateSAHCost(const Array<Node> &inNodes, float inCostTraversal, float inCostLeaf) const
{
	// Costs are normalized by the root surface area; a degenerate root yields zero cost
	float root_area = mBounds.GetSurfaceArea();
	if (!(root_area > 0.0f))
		return 0.0f;
	return CalculateSAHCostInternal(inNodes, inCostTraversal / root_area, inCostLeaf / root_area);
}
// Breadth-first expansion: repeatedly replaces internal nodes in outChildren by their two
// children until inN entries are collected or only leaf (triangle) nodes remain.
void AABBTreeBuilder::Node::GetNChildren(const Array<Node> &inNodes, uint inN, Array<const Node*> &outChildren) const
{
	JPH_ASSERT(outChildren.empty());

	// Check if there is anything to expand
	if (!HasChildren())
		return;

	// Start with the children of this node
	outChildren.push_back(&inNodes[mChild[0]]);
	outChildren.push_back(&inNodes[mChild[1]]);

	size_t next = 0; // Index of the next candidate in outChildren to try to expand
	bool all_triangles = true; // Tracks whether a full pass over outChildren expanded nothing
	while (outChildren.size() < inN)
	{
		// If we have looped over all nodes, start over with the first node again
		if (next >= outChildren.size())
		{
			// If there are only triangle nodes left, we have to terminate
			if (all_triangles)
				return;
			next = 0;
			all_triangles = true;
		}

		// Try to expand this node into its two children
		const Node *to_expand = outChildren[next];
		if (to_expand->HasChildren())
		{
			outChildren.erase(outChildren.begin() + next);
			outChildren.push_back(&inNodes[to_expand->mChild[0]]);
			outChildren.push_back(&inNodes[to_expand->mChild[1]]);
			all_triangles = false;
		}
		else
		{
			++next;
		}
	}
}
float AABBTreeBuilder::Node::CalculateSAHCostInternal(const Array<Node> &inNodes, float inCostTraversalDivSurfaceArea, float inCostLeafDivSurfaceArea) const
{
	// Leaf: cost is its surface area times the number of triangles that must be tested
	if (!HasChildren())
		return inCostLeafDivSurfaceArea * mBounds.GetSurfaceArea() * GetTriangleCount();

	// Internal node: pay a traversal cost here plus the cost of both subtrees
	// (accumulated in the same left-to-right order as the original expression)
	float cost = inCostTraversalDivSurfaceArea * mBounds.GetSurfaceArea();
	cost += inNodes[mChild[0]].CalculateSAHCostInternal(inNodes, inCostTraversalDivSurfaceArea, inCostLeafDivSurfaceArea);
	cost += inNodes[mChild[1]].CalculateSAHCostInternal(inNodes, inCostTraversalDivSurfaceArea, inCostLeafDivSurfaceArea);
	return cost;
}
void AABBTreeBuilder::Node::GetTriangleCountPerNodeInternal(const Array<Node> &inNodes, float &outAverage, uint &outAverageDivisor, uint &outMin, uint &outMax) const
{
if (HasChildren())
{
inNodes[mChild[0]].GetTriangleCountPerNodeInternal(inNodes, outAverage, outAverageDivisor, outMin, outMax);
inNodes[mChild[1]].GetTriangleCountPerNodeInternal(inNodes, outAverage, outAverageDivisor, outMin, outMax);
}
else
{
outAverage += GetTriangleCount();
outAverageDivisor++;
outMin = min(outMin, GetTriangleCount());
outMax = max(outMax, GetTriangleCount());
}
}
// Construct a builder that consumes triangles from inSplitter and produces leaves
// holding at most inMaxTrianglesPerLeaf triangles each
AABBTreeBuilder::AABBTreeBuilder(TriangleSplitter &inSplitter, uint inMaxTrianglesPerLeaf) :
	mTriangleSplitter(inSplitter),
	mMaxTrianglesPerLeaf(inMaxTrianglesPerLeaf)
{
}
// Build the full tree and fill outStats; returns a pointer to the root node inside mNodes
AABBTreeBuilder::Node *AABBTreeBuilder::Build(AABBTreeBuilderStats &outStats)
{
	TriangleSplitter::Range initial = mTriangleSplitter.GetInitialRange();

	// Worst case for number of nodes: 1 leaf node per triangle. At each level above, the number of nodes is half that of the level below.
	// This means that at most we'll be allocating 2x the number of triangles in nodes.
	mNodes.reserve(2 * initial.Count());
	mTriangles.reserve(initial.Count());

	// Build the tree
	// Note: the reference is taken only after BuildInternal has finished growing mNodes
	Node &root = mNodes[BuildInternal(initial)];

	// Collect stats
	float avg_triangles_per_leaf;
	uint min_triangles_per_leaf, max_triangles_per_leaf;
	root.GetTriangleCountPerNode(mNodes, avg_triangles_per_leaf, min_triangles_per_leaf, max_triangles_per_leaf);
	mTriangleSplitter.GetStats(outStats.mSplitterStats);
	outStats.mSAHCost = root.CalculateSAHCost(mNodes, 1.0f, 1.0f);
	outStats.mMinDepth = root.GetMinDepth(mNodes);
	outStats.mMaxDepth = root.GetMaxDepth(mNodes);
	outStats.mNodeCount = root.GetNodeCount(mNodes);
	outStats.mLeafNodeCount = root.GetLeafNodeCount(mNodes);
	outStats.mMaxTrianglesPerLeaf = mMaxTrianglesPerLeaf;
	outStats.mTreeMinTrianglesPerLeaf = min_triangles_per_leaf;
	outStats.mTreeMaxTrianglesPerLeaf = max_triangles_per_leaf;
	outStats.mTreeAvgTrianglesPerLeaf = avg_triangles_per_leaf;

	// The returned pointer points into mNodes and stays valid as long as mNodes is not modified
	return &root;
}
// Recursively build the subtree for inTriangles; returns the index of the created node in mNodes
uint AABBTreeBuilder::BuildInternal(const TriangleSplitter::Range &inTriangles)
{
	// Check if there are too many triangles left
	if (inTriangles.Count() > mMaxTrianglesPerLeaf)
	{
		// Split triangles in two batches
		TriangleSplitter::Range left, right;
		if (!mTriangleSplitter.Split(inTriangles, left, right))
		{
			// When the trace below triggers:
			//
			// This code builds a tree structure to accelerate collision detection.
			// At top level it will start with all triangles in a mesh and then divides the triangles into two batches.
			// This process repeats until the batch size is smaller than mMaxTrianglePerLeaf.
			//
			// It uses a TriangleSplitter to find a good split. When this warning triggers, the splitter was not able
			// to create a reasonable split for the triangles. This usually happens when the triangles in a batch are
			// intersecting. They could also be overlapping when projected on the 3 coordinate axis.
			//
			// To solve this issue, you could try to pass your mesh through a mesh cleaning / optimization algorithm.
			// You could also inspect the triangles that cause this issue and see if that part of the mesh can be fixed manually.
			//
			// When you do not fix this warning, the tree will be less efficient for collision detection, but it will still work.
			JPH_IF_DEBUG(Trace("AABBTreeBuilder: Doing random split for %d triangles (max per node: %u)!", (int)inTriangles.Count(), mMaxTrianglesPerLeaf);)

			// Fall back to an arbitrary halfway split
			int half = inTriangles.Count() / 2;
			JPH_ASSERT(half > 0);
			left = TriangleSplitter::Range(inTriangles.mBegin, inTriangles.mBegin + half);
			right = TriangleSplitter::Range(inTriangles.mBegin + half, inTriangles.mEnd);
		}

		// Recursively build
		const uint node_index = (uint)mNodes.size();
		mNodes.push_back(Node());
		uint left_index = BuildInternal(left);
		uint right_index = BuildInternal(right);
		// Fetch the reference only after the recursive calls: they push into mNodes and may invalidate references
		Node &node = mNodes[node_index];
		node.mChild[0] = left_index;
		node.mChild[1] = right_index;
		node.mBounds = mNodes[node.mChild[0]].mBounds;
		node.mBounds.Encapsulate(mNodes[node.mChild[1]].mBounds);
		return node_index;
	}

	// Create leaf node
	const uint node_index = (uint)mNodes.size();
	mNodes.push_back(Node());
	Node &node = mNodes.back();
	node.mTrianglesBegin = (uint)mTriangles.size();
	node.mNumTriangles = inTriangles.mEnd - inTriangles.mBegin;
	const VertexList &v = mTriangleSplitter.GetVertices();
	for (uint i = inTriangles.mBegin; i < inTriangles.mEnd; ++i)
	{
		const IndexedTriangle &t = mTriangleSplitter.GetTriangle(i);
		mTriangles.push_back(t);
		node.mBounds.Encapsulate(v, t);
	}
	return node_index;
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,121 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/TriangleSplitter/TriangleSplitter.h>
#include <Jolt/Geometry/AABox.h>
#include <Jolt/Core/NonCopyable.h>
JPH_NAMESPACE_BEGIN
/// Statistics gathered while building an AABB tree, filled in by AABBTreeBuilder::Build
struct AABBTreeBuilderStats
{
	///@name Splitter stats
	TriangleSplitter::Stats mSplitterStats; ///< Stats returned by the triangle splitter algorithm

	///@name Tree structure
	float mSAHCost = 0.0f; ///< Surface Area Heuristic cost of this tree
	int mMinDepth = 0; ///< Minimal depth of tree (number of nodes)
	int mMaxDepth = 0; ///< Maximum depth of tree (number of nodes)
	int mNodeCount = 0; ///< Number of nodes in the tree
	int mLeafNodeCount = 0; ///< Number of leaf nodes (that contain triangles)

	///@name Configured stats
	int mMaxTrianglesPerLeaf = 0; ///< Configured max triangles per leaf

	///@name Actual stats
	int mTreeMinTrianglesPerLeaf = 0; ///< Minimal amount of triangles in a leaf
	int mTreeMaxTrianglesPerLeaf = 0; ///< Maximal amount of triangles in a leaf
	float mTreeAvgTrianglesPerLeaf = 0.0f; ///< Average amount of triangles in leaf nodes
};
/// Helper class to build an AABB tree
class JPH_EXPORT AABBTreeBuilder
{
public:
	/// A node in the tree, contains the AABox for the tree and any child nodes or triangles
	class Node
	{
	public:
		JPH_OVERRIDE_NEW_DELETE

		/// Indicates that there is no child
		static constexpr uint cInvalidNodeIndex = ~uint(0);

		/// Get number of triangles in this node (non-zero only for leaf nodes)
		inline uint GetTriangleCount() const { return mNumTriangles; }

		/// Check if this node has any children
		inline bool HasChildren() const { return mChild[0] != cInvalidNodeIndex || mChild[1] != cInvalidNodeIndex; }

		/// Get child node, returns nullptr when there is no child at inIdx
		inline const Node * GetChild(uint inIdx, const Array<Node> &inNodes) const { return mChild[inIdx] != cInvalidNodeIndex? &inNodes[mChild[inIdx]] : nullptr; }

		/// Min depth of tree
		uint GetMinDepth(const Array<Node> &inNodes) const;

		/// Max depth of tree
		uint GetMaxDepth(const Array<Node> &inNodes) const;

		/// Number of nodes in tree
		uint GetNodeCount(const Array<Node> &inNodes) const;

		/// Number of leaf nodes in tree
		uint GetLeafNodeCount(const Array<Node> &inNodes) const;

		/// Get triangle count in tree
		uint GetTriangleCountInTree(const Array<Node> &inNodes) const;

		/// Calculate min and max triangles per node
		void GetTriangleCountPerNode(const Array<Node> &inNodes, float &outAverage, uint &outMin, uint &outMax) const;

		/// Calculate the total cost of the tree using the surface area heuristic
		float CalculateSAHCost(const Array<Node> &inNodes, float inCostTraversal, float inCostLeaf) const;

		/// Recursively get children (breadth first) to get in total inN children (or less if there are no more)
		void GetNChildren(const Array<Node> &inNodes, uint inN, Array<const Node *> &outChildren) const;

		/// Bounding box
		AABox mBounds;

		/// Triangles (if no child nodes)
		uint mTrianglesBegin; // Index into AABBTreeBuilder::mTriangles
		uint mNumTriangles = 0;

		/// Child node indices (if no triangles)
		uint mChild[2] = { cInvalidNodeIndex, cInvalidNodeIndex };

	private:
		friend class AABBTreeBuilder;

		/// Recursive helper function to calculate cost of the tree
		float CalculateSAHCostInternal(const Array<Node> &inNodes, float inCostTraversalDivSurfaceArea, float inCostLeafDivSurfaceArea) const;

		/// Recursive helper function to calculate min and max triangles per node
		void GetTriangleCountPerNodeInternal(const Array<Node> &inNodes, float &outAverage, uint &outAverageDivisor, uint &outMin, uint &outMax) const;
	};

	/// Constructor
	explicit AABBTreeBuilder(TriangleSplitter &inSplitter, uint inMaxTrianglesPerLeaf = 16);

	/// Recursively build tree, returns the root node of the tree (the pointer points into GetNodes())
	Node * Build(AABBTreeBuilderStats &outStats);

	/// Get all nodes
	const Array<Node> & GetNodes() const { return mNodes; }

	/// Get all triangles
	const Array<IndexedTriangle> &GetTriangles() const { return mTriangles; }

private:
	uint BuildInternal(const TriangleSplitter::Range &inTriangles);

	TriangleSplitter & mTriangleSplitter;
	const uint mMaxTrianglesPerLeaf;
	Array<Node> mNodes;
	Array<IndexedTriangle> mTriangles;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,296 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/AABBTree/AABBTreeBuilder.h>
#include <Jolt/Core/ByteBuffer.h>
#include <Jolt/Geometry/IndexedTriangle.h>
JPH_NAMESPACE_BEGIN
/// Conversion algorithm that converts an AABB tree to an optimized binary buffer
template <class TriangleCodec, class NodeCodec>
class AABBTreeToBuffer
{
public:
	/// Header for the tree
	using NodeHeader = typename NodeCodec::Header;

	/// Size in bytes of the header of the tree
	static const int HeaderSize = NodeCodec::HeaderSize;

	/// Maximum number of children per node in the tree
	static const int NumChildrenPerNode = NodeCodec::NumChildrenPerNode;

	/// Header for the triangles
	using TriangleHeader = typename TriangleCodec::TriangleHeader;

	/// Size in bytes of the header for the triangles
	static const int TriangleHeaderSize = TriangleCodec::TriangleHeaderSize;

	/// Convert AABB tree. Returns false if failed.
	/// Works in two passes: a first walk estimates the exact buffer and queue sizes, a second
	/// walk emits the nodes and triangles. Both walks must visit nodes in the same order.
	bool Convert(const Array<IndexedTriangle> &inTriangles, const Array<AABBTreeBuilder::Node> &inNodes, const VertexList &inVertices, const AABBTreeBuilder::Node *inRoot, bool inStoreUserData, const char *&outError)
	{
		typename NodeCodec::EncodingContext node_ctx;
		typename TriangleCodec::EncodingContext tri_ctx(inVertices);

		// Child nodes out of loop so we don't constantly realloc it
		Array<const AABBTreeBuilder::Node *> child_nodes;
		child_nodes.reserve(NumChildrenPerNode);

		// First calculate how big the tree is going to be.
		// Since the tree can be huge for very large meshes, we don't want
		// to reallocate the buffer as it may cause out of memory situations.
		// This loop mimics the construction loop below.
		uint64 total_size = HeaderSize + TriangleHeaderSize;
		size_t node_count = 1; // Start with root node
		size_t to_process_max_size = 1; // Track size of queues so we can do a single reserve below
		size_t to_process_triangles_max_size = 0;
		{ // A scope to free the memory associated with to_estimate and to_estimate_triangles
			Array<const AABBTreeBuilder::Node *> to_estimate;
			Array<const AABBTreeBuilder::Node *> to_estimate_triangles;
			to_estimate.push_back(inRoot);
			for (;;)
			{
				while (!to_estimate.empty())
				{
					// Get the next node to process
					const AABBTreeBuilder::Node *node = to_estimate.back();
					to_estimate.pop_back();

					// Update total size
					node_ctx.PrepareNodeAllocate(node, total_size);

					if (node->HasChildren())
					{
						// Collect the first NumChildrenPerNode sub-nodes in the tree
						child_nodes.clear(); // Won't free the memory
						node->GetNChildren(inNodes, NumChildrenPerNode, child_nodes);

						// Increment the number of nodes we're going to store
						node_count += child_nodes.size();

						// Insert in reverse order so we estimate left child first when taking nodes from the back
						for (int idx = int(child_nodes.size()) - 1; idx >= 0; --idx)
						{
							// Store triangles in separate list so we process them last
							const AABBTreeBuilder::Node *child = child_nodes[idx];
							if (child->HasChildren())
							{
								to_estimate.push_back(child);
								to_process_max_size = max(to_estimate.size(), to_process_max_size);
							}
							else
							{
								to_estimate_triangles.push_back(child);
								to_process_triangles_max_size = max(to_estimate_triangles.size(), to_process_triangles_max_size);
							}
						}
					}
					else
					{
						// Update total size
						tri_ctx.PreparePack(&inTriangles[node->mTrianglesBegin], node->mNumTriangles, inStoreUserData, total_size);
					}
				}

				// If we've got triangles to estimate, loop again with just the triangles
				if (to_estimate_triangles.empty())
					break;
				else
					to_estimate.swap(to_estimate_triangles);
			}
		}

		// Finalize the prepare stage for the triangle context
		tri_ctx.FinalizePreparePack(total_size);

		// Reserve the buffer
		// Note: this round-trip comparison detects truncation when size_t is narrower than uint64 (32 bit builds)
		if (size_t(total_size) != total_size)
		{
			outError = "AABBTreeToBuffer: Out of memory!";
			return false;
		}
		mTree.reserve(size_t(total_size));

		// Add headers
		NodeHeader *header = HeaderSize > 0? mTree.Allocate<NodeHeader>() : nullptr;
		TriangleHeader *triangle_header = TriangleHeaderSize > 0? mTree.Allocate<TriangleHeader>() : nullptr;

		// Per-node bookkeeping used while emitting the tree
		struct NodeData
		{
			const AABBTreeBuilder::Node * mNode = nullptr; // Node that this entry belongs to
			Vec3 mNodeBoundsMin; // Quantized node bounds
			Vec3 mNodeBoundsMax;
			size_t mNodeStart = size_t(-1); // Start of node in mTree
			size_t mTriangleStart = size_t(-1); // Start of the triangle data in mTree
			size_t mChildNodeStart[NumChildrenPerNode]; // Start of the children of the node in mTree
			size_t mChildTrianglesStart[NumChildrenPerNode]; // Start of the triangle data in mTree
			size_t * mParentChildNodeStart = nullptr; // Where to store mNodeStart (to patch mChildNodeStart of my parent)
			size_t * mParentTrianglesStart = nullptr; // Where to store mTriangleStart (to patch mChildTrianglesStart of my parent)
			uint mNumChildren = 0; // Number of children
		};

		Array<NodeData *> to_process;
		to_process.reserve(to_process_max_size);
		Array<NodeData *> to_process_triangles;
		to_process_triangles.reserve(to_process_triangles_max_size);

		Array<NodeData> node_list;
		node_list.reserve(node_count); // Needed to ensure that array is not reallocated, so we can keep pointers in the array

		NodeData root;
		root.mNode = inRoot;
		root.mNodeBoundsMin = inRoot->mBounds.mMin;
		root.mNodeBoundsMax = inRoot->mBounds.mMax;
		node_list.push_back(root);
		to_process.push_back(&node_list.back());

		for (;;)
		{
			while (!to_process.empty())
			{
				// Get the next node to process
				NodeData *node_data = to_process.back();
				to_process.pop_back();

				// Due to quantization box could have become bigger, not smaller
				JPH_ASSERT(AABox(node_data->mNodeBoundsMin, node_data->mNodeBoundsMax).Contains(node_data->mNode->mBounds), "AABBTreeToBuffer: Bounding box became smaller!");

				// Collect the first NumChildrenPerNode sub-nodes in the tree
				child_nodes.clear(); // Won't free the memory
				node_data->mNode->GetNChildren(inNodes, NumChildrenPerNode, child_nodes);
				node_data->mNumChildren = (uint)child_nodes.size();

				// Fill in default child bounds
				Vec3 child_bounds_min[NumChildrenPerNode], child_bounds_max[NumChildrenPerNode];
				for (size_t i = 0; i < NumChildrenPerNode; ++i)
					if (i < child_nodes.size())
					{
						child_bounds_min[i] = child_nodes[i]->mBounds.mMin;
						child_bounds_max[i] = child_nodes[i]->mBounds.mMax;
					}
					else
					{
						child_bounds_min[i] = Vec3::sZero();
						child_bounds_max[i] = Vec3::sZero();
					}

				// Start a new node
				node_data->mNodeStart = node_ctx.NodeAllocate(node_data->mNode, node_data->mNodeBoundsMin, node_data->mNodeBoundsMax, child_nodes, child_bounds_min, child_bounds_max, mTree, outError);
				if (node_data->mNodeStart == size_t(-1))
					return false;

				if (node_data->mNode->HasChildren())
				{
					// Insert in reverse order so we process left child first when taking nodes from the back
					for (int idx = int(child_nodes.size()) - 1; idx >= 0; --idx)
					{
						const AABBTreeBuilder::Node *child_node = child_nodes[idx];

						// Due to quantization box could have become bigger, not smaller
						JPH_ASSERT(AABox(child_bounds_min[idx], child_bounds_max[idx]).Contains(child_node->mBounds), "AABBTreeToBuffer: Bounding box became smaller!");

						// Add child to list of nodes to be processed
						NodeData child;
						child.mNode = child_node;
						child.mNodeBoundsMin = child_bounds_min[idx];
						child.mNodeBoundsMax = child_bounds_max[idx];
						child.mParentChildNodeStart = &node_data->mChildNodeStart[idx];
						child.mParentTrianglesStart = &node_data->mChildTrianglesStart[idx];
						node_list.push_back(child);

						// Store triangles in separate list so we process them last
						if (child_node->HasChildren())
							to_process.push_back(&node_list.back());
						else
							to_process_triangles.push_back(&node_list.back());
					}
				}
				else
				{
					// Add triangles
					node_data->mTriangleStart = tri_ctx.Pack(&inTriangles[node_data->mNode->mTrianglesBegin], node_data->mNode->mNumTriangles, inStoreUserData, mTree, outError);
					if (node_data->mTriangleStart == size_t(-1))
						return false;
				}

				// Patch offset into parent
				if (node_data->mParentChildNodeStart != nullptr)
				{
					*node_data->mParentChildNodeStart = node_data->mNodeStart;
					*node_data->mParentTrianglesStart = node_data->mTriangleStart;
				}
			}

			// If we've got triangles to process, loop again with just the triangles
			if (to_process_triangles.empty())
				break;
			else
				to_process.swap(to_process_triangles);
		}

		// Assert that our reservation was correct (we don't know if we swapped the arrays or not)
		JPH_ASSERT(to_process_max_size == to_process.capacity() || to_process_triangles_max_size == to_process.capacity());
		JPH_ASSERT(to_process_max_size == to_process_triangles.capacity() || to_process_triangles_max_size == to_process_triangles.capacity());

		// Finalize all nodes
		for (NodeData &n : node_list)
			if (!node_ctx.NodeFinalize(n.mNode, n.mNodeStart, n.mNumChildren, n.mChildNodeStart, n.mChildTrianglesStart, mTree, outError))
				return false;

		// Finalize the triangles
		tri_ctx.Finalize(inVertices, triangle_header, mTree);

		// Validate that our reservations were correct
		if (node_count != node_list.size())
		{
			outError = "Internal Error: Node memory estimate was incorrect, memory corruption!";
			return false;
		}
		if (total_size != mTree.size())
		{
			outError = "Internal Error: Tree memory estimate was incorrect, memory corruption!";
			return false;
		}

		// Finalize the nodes
		return node_ctx.Finalize(header, inRoot, node_list[0].mNodeStart, node_list[0].mTriangleStart, outError);
	}

	/// Get resulting data
	inline const ByteBuffer & GetBuffer() const
	{
		return mTree;
	}

	/// Get resulting data
	inline ByteBuffer & GetBuffer()
	{
		return mTree;
	}

	/// Get header for tree
	inline const NodeHeader * GetNodeHeader() const
	{
		return mTree.Get<NodeHeader>(0);
	}

	/// Get header for triangles
	inline const TriangleHeader * GetTriangleHeader() const
	{
		return mTree.Get<TriangleHeader>(HeaderSize);
	}

	/// Get root of resulting tree
	inline const void * GetRoot() const
	{
		return mTree.Get<void>(HeaderSize + TriangleHeaderSize);
	}

private:
	ByteBuffer mTree; ///< Resulting tree structure
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,323 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/ByteBuffer.h>
#include <Jolt/Math/HalfFloat.h>
#include <Jolt/AABBTree/AABBTreeBuilder.h>
JPH_NAMESPACE_BEGIN
class NodeCodecQuadTreeHalfFloat
{
public:
/// Number of child nodes of this node
static constexpr int NumChildrenPerNode = 4;
/// Header for the tree
struct Header
{
	Float3 mRootBoundsMin; ///< Bounding box of the root node
	Float3 mRootBoundsMax;
	uint32 mRootProperties; ///< Packed offset + triangle count for the root (same layout as Node::mNodeProperties)
	uint8 mBlockIDBits; ///< Number of bits to address a triangle block
	uint8 mPadding[3] = { 0 }; ///< Explicitly zeroed padding
};
/// Size of the header (an empty struct is always > 0 bytes so this needs a separate variable)
static constexpr int HeaderSize = sizeof(Header);
/// Stack size to use during DecodingContext::sWalkTree
static constexpr int StackSize = 128;
/// Node properties
/// Each 32 bit property word packs a triangle count in the top TRIANGLE_COUNT_BITS
/// and a buffer offset (with the lowest OFFSET_NON_SIGNIFICANT_BITS dropped) in the lower OFFSET_BITS
enum : uint32
{
	TRIANGLE_COUNT_BITS = 4,
	TRIANGLE_COUNT_SHIFT = 28,
	TRIANGLE_COUNT_MASK = (1 << TRIANGLE_COUNT_BITS) - 1,
	OFFSET_BITS = 28,
	OFFSET_MASK = (1 << OFFSET_BITS) - 1,
	OFFSET_NON_SIGNIFICANT_BITS = 2, ///< Offsets must have these low bits zero; they are shifted out before storing
	OFFSET_NON_SIGNIFICANT_MASK = (1 << OFFSET_NON_SIGNIFICANT_BITS) - 1,
};
/// Node structure
/// Child bounds are stored component-wise (all X mins together, etc.) as half floats
struct Node
{
	HalfFloat mBoundsMinX[4]; ///< 4 child bounding boxes
	HalfFloat mBoundsMinY[4];
	HalfFloat mBoundsMinZ[4];
	HalfFloat mBoundsMaxX[4];
	HalfFloat mBoundsMaxY[4];
	HalfFloat mBoundsMaxZ[4];
	uint32 mNodeProperties[4]; ///< 4 child node properties (packed triangle count + offset, see enum above)
};
static_assert(sizeof(Node) == 64, "Node should be 64 bytes");
/// This class encodes and compresses quad tree nodes
class EncodingContext
{
public:
	/// Mimics the size a call to NodeAllocate() would add to the buffer
	void PrepareNodeAllocate(const AABBTreeBuilder::Node *inNode, uint64 &ioBufferSize) const
	{
		// We don't emit nodes for leafs
		if (!inNode->HasChildren())
			return;

		// Add size of node
		ioBufferSize += sizeof(Node);
	}

	/// Allocate a new node for inNode.
	/// Algorithm can modify the order of ioChildren to indicate in which order children should be compressed
	/// Algorithm can enlarge the bounding boxes of the children during compression and returns these in outChildBoundsMin, outChildBoundsMax
	/// inNodeBoundsMin, inNodeBoundsMax is the bounding box of inNode possibly widened by compressing the parent node
	/// Returns size_t(-1) on error and reports the error in outError
	size_t NodeAllocate(const AABBTreeBuilder::Node *inNode, Vec3Arg inNodeBoundsMin, Vec3Arg inNodeBoundsMax, Array<const AABBTreeBuilder::Node *> &ioChildren, Vec3 outChildBoundsMin[NumChildrenPerNode], Vec3 outChildBoundsMax[NumChildrenPerNode], ByteBuffer &ioBuffer, const char *&outError) const
	{
		// We don't emit nodes for leafs
		if (!inNode->HasChildren())
			return ioBuffer.size();

		// Remember the start of the node
		size_t node_start = ioBuffer.size();

		// Fill in bounds
		Node *node = ioBuffer.Allocate<Node>();
		for (size_t i = 0; i < 4; ++i)
		{
			if (i < ioChildren.size())
			{
				const AABBTreeBuilder::Node *this_node = ioChildren[i];

				// Copy bounding box; min rounds towards -inf and max towards +inf so the
				// half-float box conservatively contains the original child bounds
				node->mBoundsMinX[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_NEG_INF>(this_node->mBounds.mMin.GetX());
				node->mBoundsMinY[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_NEG_INF>(this_node->mBounds.mMin.GetY());
				node->mBoundsMinZ[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_NEG_INF>(this_node->mBounds.mMin.GetZ());
				node->mBoundsMaxX[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_POS_INF>(this_node->mBounds.mMax.GetX());
				node->mBoundsMaxY[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_POS_INF>(this_node->mBounds.mMax.GetY());
				node->mBoundsMaxZ[i] = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_POS_INF>(this_node->mBounds.mMax.GetZ());

				// Store triangle count (zero for children that are internal nodes)
				node->mNodeProperties[i] = this_node->GetTriangleCount() << TRIANGLE_COUNT_SHIFT;
				if (this_node->GetTriangleCount() >= TRIANGLE_COUNT_MASK)
				{
					outError = "NodeCodecQuadTreeHalfFloat: Too many triangles";
					return size_t(-1);
				}
			}
			else
			{
				// Make this an invalid triangle node
				node->mNodeProperties[i] = uint32(TRIANGLE_COUNT_MASK) << TRIANGLE_COUNT_SHIFT;

				// Make bounding box invalid
				node->mBoundsMinX[i] = HALF_FLT_MAX;
				node->mBoundsMinY[i] = HALF_FLT_MAX;
				node->mBoundsMinZ[i] = HALF_FLT_MAX;
				node->mBoundsMaxX[i] = HALF_FLT_MAX;
				node->mBoundsMaxY[i] = HALF_FLT_MAX;
				node->mBoundsMaxZ[i] = HALF_FLT_MAX;
			}
		}

		// Since we don't keep track of the bounding box while descending the tree, we keep the root bounds at all levels for triangle compression
		for (int i = 0; i < NumChildrenPerNode; ++i)
		{
			outChildBoundsMin[i] = inNodeBoundsMin;
			outChildBoundsMax[i] = inNodeBoundsMax;
		}

		return node_start;
	}

	/// Once all nodes have been added, this call finalizes all nodes by patching in the offsets of the child nodes (that were added after the node itself was added)
	bool NodeFinalize(const AABBTreeBuilder::Node *inNode, size_t inNodeStart, uint inNumChildren, const size_t *inChildrenNodeStart, const size_t *inChildrenTrianglesStart, ByteBuffer &ioBuffer, const char *&outError)
	{
		if (!inNode->HasChildren())
			return true;

		Node *node = ioBuffer.Get<Node>(inNodeStart);
		for (uint i = 0; i < inNumChildren; ++i)
		{
			size_t offset;
			// A non-zero property word means NodeAllocate stored a triangle count,
			// i.e. this child is a triangle block rather than a node block
			if (node->mNodeProperties[i] != 0)
			{
				// This is a triangle block
				offset = inChildrenTrianglesStart[i];

				// Store highest block with triangles so we can count the number of bits we need
				mHighestTriangleBlock = max(mHighestTriangleBlock, offset);
			}
			else
			{
				// This is a node block
				offset = inChildrenNodeStart[i];
			}

			// Store offset of next node / triangles
			if (offset & OFFSET_NON_SIGNIFICANT_MASK)
			{
				outError = "NodeCodecQuadTreeHalfFloat: Internal Error: Offset has non-significant bits set";
				return false;
			}
			offset >>= OFFSET_NON_SIGNIFICANT_BITS;
			if (offset > OFFSET_MASK)
			{
				outError = "NodeCodecQuadTreeHalfFloat: Offset too large. Too much data.";
				return false;
			}

			node->mNodeProperties[i] |= uint32(offset);
		}

		return true;
	}

	/// Once all nodes have been finalized, this will finalize the header of the nodes
	bool Finalize(Header *outHeader, const AABBTreeBuilder::Node *inRoot, size_t inRootNodeStart, size_t inRootTrianglesStart, const char *&outError) const
	{
		// Check if we can address the root node
		size_t offset = inRoot->HasChildren()? inRootNodeStart : inRootTrianglesStart;
		if (offset & OFFSET_NON_SIGNIFICANT_MASK)
		{
			outError = "NodeCodecQuadTreeHalfFloat: Internal Error: Offset has non-significant bits set";
			return false;
		}
		offset >>= OFFSET_NON_SIGNIFICANT_BITS;
		if (offset > OFFSET_MASK)
		{
			outError = "NodeCodecQuadTreeHalfFloat: Offset too large. Too much data.";
			return false;
		}

		// If the root has triangles, we need to take that offset instead since the mHighestTriangleBlock will be zero
		size_t highest_triangle_block = inRootTrianglesStart != size_t(-1)? inRootTrianglesStart : mHighestTriangleBlock;
		highest_triangle_block >>= OFFSET_NON_SIGNIFICANT_BITS;

		inRoot->mBounds.mMin.StoreFloat3(&outHeader->mRootBoundsMin);
		inRoot->mBounds.mMax.StoreFloat3(&outHeader->mRootBoundsMax);
		outHeader->mRootProperties = uint32(offset) + (inRoot->GetTriangleCount() << TRIANGLE_COUNT_SHIFT);
		// Number of bits needed to address the highest triangle block ID
		outHeader->mBlockIDBits = uint8(32 - CountLeadingZeros(uint32(highest_triangle_block)));
		if (inRoot->GetTriangleCount() >= TRIANGLE_COUNT_MASK)
		{
			outError = "NodeCodecQuadTreeHalfFloat: Too many triangles";
			return false;
		}

		return true;
	}

private:
	size_t mHighestTriangleBlock = 0; ///< Highest buffer offset holding triangles, used to compute Header::mBlockIDBits
};
/// This class decodes and decompresses quad tree nodes
class DecodingContext
{
public:
/// Get the amount of bits needed to store an ID to a triangle block
inline static uint sTriangleBlockIDBits(const Header *inHeader)
{
	// Computed at build time in EncodingContext::Finalize
	return inHeader->mBlockIDBits;
}
/// Convert a triangle block ID to the start of the triangle buffer
inline static const void * sGetTriangleBlockStart(const uint8 *inBufferStart, uint inTriangleBlockID)
{
	// Block IDs drop the OFFSET_NON_SIGNIFICANT_BITS lowest bits, so shift them back in
	return inBufferStart + (inTriangleBlockID << OFFSET_NON_SIGNIFICANT_BITS);
}
/// Constructor
JPH_INLINE explicit DecodingContext(const Header *inHeader)
{
	// Start with the root node on the stack
	// NOTE(review): relies on the stack top index starting at 0; its declaration is outside this chunk — confirm
	mNodeStack[0] = inHeader->mRootProperties;
}
/// Walk the node tree calling the Visitor::VisitNodes for each node encountered and Visitor::VisitTriangles for each triangle encountered
template <class TriangleContext, class Visitor>
JPH_INLINE void WalkTree(const uint8 *inBufferStart, const TriangleContext &inTriangleContext, Visitor &ioVisitor)
{
do
{
// Test if node contains triangles
uint32 node_properties = mNodeStack[mTop];
uint32 tri_count = node_properties >> TRIANGLE_COUNT_SHIFT;
if (tri_count == 0)
{
const Node *node = reinterpret_cast<const Node *>(inBufferStart + (node_properties << OFFSET_NON_SIGNIFICANT_BITS));
// Unpack bounds
#ifdef JPH_CPU_BIG_ENDIAN
Vec4 bounds_minx = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMinX[0] + (node->mBoundsMinX[1] << 16), node->mBoundsMinX[2] + (node->mBoundsMinX[3] << 16), 0, 0));
Vec4 bounds_miny = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMinY[0] + (node->mBoundsMinY[1] << 16), node->mBoundsMinY[2] + (node->mBoundsMinY[3] << 16), 0, 0));
Vec4 bounds_minz = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMinZ[0] + (node->mBoundsMinZ[1] << 16), node->mBoundsMinZ[2] + (node->mBoundsMinZ[3] << 16), 0, 0));
Vec4 bounds_maxx = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMaxX[0] + (node->mBoundsMaxX[1] << 16), node->mBoundsMaxX[2] + (node->mBoundsMaxX[3] << 16), 0, 0));
Vec4 bounds_maxy = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMaxY[0] + (node->mBoundsMaxY[1] << 16), node->mBoundsMaxY[2] + (node->mBoundsMaxY[3] << 16), 0, 0));
Vec4 bounds_maxz = HalfFloatConversion::ToFloat(UVec4(node->mBoundsMaxZ[0] + (node->mBoundsMaxZ[1] << 16), node->mBoundsMaxZ[2] + (node->mBoundsMaxZ[3] << 16), 0, 0));
#else
UVec4 bounds_minxy = UVec4::sLoadInt4(reinterpret_cast<const uint32 *>(&node->mBoundsMinX[0]));
Vec4 bounds_minx = HalfFloatConversion::ToFloat(bounds_minxy);
Vec4 bounds_miny = HalfFloatConversion::ToFloat(bounds_minxy.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
UVec4 bounds_minzmaxx = UVec4::sLoadInt4(reinterpret_cast<const uint32 *>(&node->mBoundsMinZ[0]));
Vec4 bounds_minz = HalfFloatConversion::ToFloat(bounds_minzmaxx);
Vec4 bounds_maxx = HalfFloatConversion::ToFloat(bounds_minzmaxx.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
UVec4 bounds_maxyz = UVec4::sLoadInt4(reinterpret_cast<const uint32 *>(&node->mBoundsMaxY[0]));
Vec4 bounds_maxy = HalfFloatConversion::ToFloat(bounds_maxyz);
Vec4 bounds_maxz = HalfFloatConversion::ToFloat(bounds_maxyz.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
#endif
// Load properties for 4 children
UVec4 properties = UVec4::sLoadInt4(&node->mNodeProperties[0]);
// Check which sub nodes to visit
int num_results = ioVisitor.VisitNodes(bounds_minx, bounds_miny, bounds_minz, bounds_maxx, bounds_maxy, bounds_maxz, properties, mTop);
// Push them onto the stack
JPH_ASSERT(mTop + 4 < StackSize);
properties.StoreInt4(&mNodeStack[mTop]);
mTop += num_results;
}
else if (tri_count != TRIANGLE_COUNT_MASK) // TRIANGLE_COUNT_MASK indicates a padding node, normally we shouldn't visit these nodes but when querying with a big enough box you could touch HALF_FLT_MAX (about 65K)
{
// Node contains triangles, do individual tests
uint32 triangle_block_id = node_properties & OFFSET_MASK;
const void *triangles = sGetTriangleBlockStart(inBufferStart, triangle_block_id);
ioVisitor.VisitTriangles(inTriangleContext, triangles, tri_count, triangle_block_id);
}
// Check if we're done
if (ioVisitor.ShouldAbort())
break;
// Fetch next node until we find one that the visitor wants to see
do
--mTop;
while (mTop >= 0 && !ioVisitor.ShouldVisitNode(mTop));
}
while (mTop >= 0);
}
/// This can be used to have the visitor early out (ioVisitor.ShouldAbort() returns true) and later continue again (call WalkTree() again)
bool IsDoneWalking() const
{
return mTop < 0;
}
private:
uint32 mNodeStack[StackSize];
int mTop = 0;
};
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,555 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Geometry/RayTriangle.h>
JPH_NAMESPACE_BEGIN
/// Store vertices in 64 bits and indices in 8 bits + 8 bit of flags per triangle like this:
///
/// TriangleBlockHeader,
/// TriangleBlock (4 triangles and their flags in 16 bytes),
/// TriangleBlock...
/// [Optional] UserData (4 bytes per triangle)
///
/// Vertices are stored:
///
/// VertexData (1 vertex in 64 bits),
/// VertexData...
///
/// They're compressed relative to the bounding box as provided by the node codec.
class TriangleCodecIndexed8BitPackSOA4Flags
{
public:
class TriangleHeader
{
public:
Float3 mOffset; ///< Offset of all vertices
Float3 mScale; ///< Scale of all vertices, vertex_position = mOffset + mScale * compressed_vertex_position
};
/// Size of the header (an empty struct is always > 0 bytes so this needs a separate variable)
static constexpr int TriangleHeaderSize = sizeof(TriangleHeader);
/// If this codec could return a different offset than the current buffer size when calling Pack()
static constexpr bool ChangesOffsetOnPack = false;
/// Amount of bits per component
enum EComponentData : uint32
{
COMPONENT_BITS = 21,
COMPONENT_MASK = (1 << COMPONENT_BITS) - 1,
};
/// Packed X and Y coordinate
/// The first 32-bit word holds all 21 bits of X and the low 11 bits of Y
enum EVertexXY : uint32
{
COMPONENT_X = 0,
COMPONENT_Y1 = COMPONENT_BITS,
COMPONENT_Y1_BITS = 32 - COMPONENT_BITS,
};
/// Packed Z and Y coordinate
/// The second 32-bit word holds all 21 bits of Z and the remaining 10 high bits of Y (the top bit stays unused)
enum EVertexZY : uint32
{
COMPONENT_Z = 0,
COMPONENT_Y2 = COMPONENT_BITS,
COMPONENT_Y2_BITS = 31 - COMPONENT_BITS,
};
/// A single packed vertex
struct VertexData
{
uint32 mVertexXY; ///< X and the low bits of Y, see EVertexXY
uint32 mVertexZY; ///< Z and the high bits of Y, see EVertexZY
};
static_assert(sizeof(VertexData) == 8, "Compiler added padding");
/// A block of 4 triangles
struct TriangleBlock
{
uint8 mIndices[3][4]; ///< 8 bit indices to triangle vertices for 4 triangles in the form mIndices[vertex][triangle] where vertex in [0, 2] and triangle in [0, 3]
uint8 mFlags[4]; ///< Triangle flags (could contain material and active edges)
};
static_assert(sizeof(TriangleBlock) == 16, "Compiler added padding");
/// Bit layout of TriangleBlockHeader::mFlags
enum ETriangleBlockHeaderFlags : uint32
{
OFFSET_TO_VERTICES_BITS = 29, ///< Offset from current block to start of vertices in bytes
OFFSET_TO_VERTICES_MASK = (1 << OFFSET_TO_VERTICES_BITS) - 1,
OFFSET_NON_SIGNIFICANT_BITS = 2, ///< The offset from the current block to the start of the vertices must be a multiple of 4 bytes
OFFSET_NON_SIGNIFICANT_MASK = (1 << OFFSET_NON_SIGNIFICANT_BITS) - 1,
OFFSET_TO_USERDATA_BITS = 3, ///< When user data is stored, this is the number of blocks to skip to get to the user data (0 = no user data)
OFFSET_TO_USERDATA_MASK = (1 << OFFSET_TO_USERDATA_BITS) - 1,
};
/// A triangle block header, will be followed by one or more TriangleBlocks
struct TriangleBlockHeader
{
const VertexData * GetVertexData() const { return reinterpret_cast<const VertexData *>(reinterpret_cast<const uint8 *>(this) + ((mFlags & OFFSET_TO_VERTICES_MASK) << OFFSET_NON_SIGNIFICANT_BITS)); }
const TriangleBlock * GetTriangleBlock() const { return reinterpret_cast<const TriangleBlock *>(reinterpret_cast<const uint8 *>(this) + sizeof(TriangleBlockHeader)); }
const uint32 * GetUserData() const { uint32 offset = mFlags >> OFFSET_TO_VERTICES_BITS; return offset == 0? nullptr : reinterpret_cast<const uint32 *>(GetTriangleBlock() + offset); }
uint32 mFlags; ///< Low OFFSET_TO_VERTICES_BITS bits: offset to vertex data in 4 byte units, remaining high bits: number of TriangleBlocks to skip to reach the per-triangle user data (0 = none)
};
static_assert(sizeof(TriangleBlockHeader) == 4, "Compiler added padding");
/// This class is used to validate that the triangle data will not be degenerate after compression
class ValidationContext
{
public:
/// Constructor
ValidationContext(const IndexedTriangleList &inTriangles, const VertexList &inVertices) :
mVertices(inVertices)
{
// Only used the referenced triangles, just like EncodingContext::Finalize does
for (const IndexedTriangle &i : inTriangles)
for (uint32 idx : i.mIdx)
mBounds.Encapsulate(Vec3(inVertices[idx]));
}
/// Test if a triangle will be degenerate after quantization
bool IsDegenerate(const IndexedTriangle &inTriangle) const
{
// Quantize the triangle in the same way as EncodingContext::Finalize does
UVec4 quantized_vertex[3];
Vec3 compress_scale = Vec3::sReplicate(COMPONENT_MASK) / Vec3::sMax(mBounds.GetSize(), Vec3::sReplicate(1.0e-20f));
for (int i = 0; i < 3; ++i)
quantized_vertex[i] = ((Vec3(mVertices[inTriangle.mIdx[i]]) - mBounds.mMin) * compress_scale + Vec3::sReplicate(0.5f)).ToInt();
return quantized_vertex[0] == quantized_vertex[1] || quantized_vertex[1] == quantized_vertex[2] || quantized_vertex[0] == quantized_vertex[2];
}
private:
const VertexList & mVertices;
AABox mBounds;
};
/// This class is used to encode and compress triangle data into a byte buffer.
/// Usage: call PreparePack() per batch, then FinalizePreparePack(), then Pack() per batch (same batches, same order), then Finalize().
class EncodingContext
{
public:
/// Indicates a vertex hasn't been seen yet in the triangle list
static constexpr uint32 cNotFound = 0xffffffff;
/// Construct the encoding context
explicit EncodingContext(const VertexList &inVertices) :
mVertexMap(inVertices.size(), cNotFound)
{
}
/// Mimics the size a call to Pack() would add to the buffer.
/// Note: this must stay in sync with Pack(); it performs the same vertex deduplication in order to predict the final vertex count.
void PreparePack(const IndexedTriangle *inTriangles, uint inNumTriangles, bool inStoreUserData, uint64 &ioBufferSize)
{
// Add triangle block header
ioBufferSize += sizeof(TriangleBlockHeader);
// Compute first vertex that this batch will use (ensuring there's enough room if none of the vertices are shared)
// Vertex indices in a TriangleBlock are 8 bit, so a batch can address a window of at most 256 vertices
uint start_vertex = Clamp((int)mVertexCount - 256 + (int)inNumTriangles * 3, 0, (int)mVertexCount);
// Pack vertices
// Triangles are stored in groups of 4, the count is rounded up (padding is filled with degenerate triangles)
uint padded_triangle_count = AlignUp(inNumTriangles, 4);
for (uint t = 0; t < padded_triangle_count; t += 4)
{
// Add triangle block
ioBufferSize += sizeof(TriangleBlock);
for (uint vertex_nr = 0; vertex_nr < 3; ++vertex_nr)
for (uint block_tri_idx = 0; block_tri_idx < 4; ++block_tri_idx)
{
// Fetch vertex index. Create degenerate triangles for padding triangles.
bool triangle_available = t + block_tri_idx < inNumTriangles;
uint32 src_vertex_index = triangle_available? inTriangles[t + block_tri_idx].mIdx[vertex_nr] : inTriangles[inNumTriangles - 1].mIdx[0];
// Check if we've seen this vertex before and if it is in the range that we can encode
uint32 &vertex_index = mVertexMap[src_vertex_index];
if (vertex_index == cNotFound || vertex_index < start_vertex)
{
// Add vertex
vertex_index = mVertexCount;
mVertexCount++;
}
}
}
// Add user data
if (inStoreUserData)
ioBufferSize += inNumTriangles * sizeof(uint32);
}
/// Mimics the size the Finalize() call would add to ioBufferSize
void FinalizePreparePack(uint64 &ioBufferSize)
{
// Remember where the vertices are going to start in the output buffer
JPH_ASSERT(IsAligned(ioBufferSize, 4));
mVerticesStartIdx = size_t(ioBufferSize);
// Add vertices to buffer
ioBufferSize += uint64(mVertexCount) * sizeof(VertexData);
// Reserve the amount of memory we need for the vertices
mVertices.reserve(mVertexCount);
// Set vertex map back to 'not found' so that Pack() can repeat the exact same deduplication that PreparePack() did
for (uint32 &v : mVertexMap)
v = cNotFound;
}
/// Pack the triangles in inContainer to ioBuffer. This stores the mMaterialIndex of a triangle in the 8 bit flags.
/// Returns size_t(-1) on error.
size_t Pack(const IndexedTriangle *inTriangles, uint inNumTriangles, bool inStoreUserData, ByteBuffer &ioBuffer, const char *&outError)
{
JPH_ASSERT(inNumTriangles > 0);
// Determine position of triangles start
size_t triangle_block_start = ioBuffer.size();
// Allocate triangle block header
TriangleBlockHeader *header = ioBuffer.Allocate<TriangleBlockHeader>();
// Compute first vertex that this batch will use (ensuring there's enough room if none of the vertices are shared)
uint start_vertex = Clamp((int)mVertices.size() - 256 + (int)inNumTriangles * 3, 0, (int)mVertices.size());
// Store the start vertex offset relative to TriangleBlockHeader
// The vertex area location was fixed up front by FinalizePreparePack()
size_t offset_to_vertices = mVerticesStartIdx - triangle_block_start + size_t(start_vertex) * sizeof(VertexData);
if (offset_to_vertices & OFFSET_NON_SIGNIFICANT_MASK)
{
outError = "TriangleCodecIndexed8BitPackSOA4Flags: Internal Error: Offset has non-significant bits set";
return size_t(-1);
}
offset_to_vertices >>= OFFSET_NON_SIGNIFICANT_BITS;
if (offset_to_vertices > OFFSET_TO_VERTICES_MASK)
{
outError = "TriangleCodecIndexed8BitPackSOA4Flags: Offset to vertices doesn't fit. Too much data.";
return size_t(-1);
}
header->mFlags = uint32(offset_to_vertices);
// When we store user data we need to store the offset to the user data in TriangleBlocks
uint padded_triangle_count = AlignUp(inNumTriangles, 4);
if (inStoreUserData)
{
uint32 num_blocks = padded_triangle_count >> 2;
JPH_ASSERT(num_blocks <= OFFSET_TO_USERDATA_MASK);
header->mFlags |= num_blocks << OFFSET_TO_VERTICES_BITS;
}
// Pack vertices
for (uint t = 0; t < padded_triangle_count; t += 4)
{
TriangleBlock *block = ioBuffer.Allocate<TriangleBlock>();
for (uint vertex_nr = 0; vertex_nr < 3; ++vertex_nr)
for (uint block_tri_idx = 0; block_tri_idx < 4; ++block_tri_idx)
{
// Fetch vertex index. Create degenerate triangles for padding triangles.
bool triangle_available = t + block_tri_idx < inNumTriangles;
uint32 src_vertex_index = triangle_available? inTriangles[t + block_tri_idx].mIdx[vertex_nr] : inTriangles[inNumTriangles - 1].mIdx[0];
// Check if we've seen this vertex before and if it is in the range that we can encode
uint32 &vertex_index = mVertexMap[src_vertex_index];
if (vertex_index == cNotFound || vertex_index < start_vertex)
{
// Add vertex
vertex_index = (uint32)mVertices.size();
mVertices.push_back(src_vertex_index);
}
// Store vertex index as an 8 bit offset relative to the batch's start vertex
uint32 vertex_offset = vertex_index - start_vertex;
if (vertex_offset > 0xff)
{
outError = "TriangleCodecIndexed8BitPackSOA4Flags: Offset doesn't fit in 8 bit";
return size_t(-1);
}
block->mIndices[vertex_nr][block_tri_idx] = (uint8)vertex_offset;
// Store flags
uint32 flags = triangle_available? inTriangles[t + block_tri_idx].mMaterialIndex : 0;
if (flags > 0xff)
{
outError = "TriangleCodecIndexed8BitPackSOA4Flags: Material index doesn't fit in 8 bit";
return size_t(-1);
}
block->mFlags[block_tri_idx] = (uint8)flags;
}
}
// Store user data
if (inStoreUserData)
{
uint32 *user_data = ioBuffer.Allocate<uint32>(inNumTriangles);
for (uint t = 0; t < inNumTriangles; ++t)
user_data[t] = inTriangles[t].mUserData;
}
return triangle_block_start;
}
/// After all triangles have been packed, this finalizes the header and triangle buffer
void Finalize(const VertexList &inVertices, TriangleHeader *ioHeader, ByteBuffer &ioBuffer) const
{
// Assert that our reservations were correct
JPH_ASSERT(mVertices.size() == mVertexCount);
JPH_ASSERT(ioBuffer.size() == mVerticesStartIdx);
// Check if anything to do
if (mVertices.empty())
return;
// Calculate bounding box
AABox bounds;
for (uint32 v : mVertices)
bounds.Encapsulate(Vec3(inVertices[v]));
// Compress vertices
VertexData *vertices = ioBuffer.Allocate<VertexData>(mVertices.size());
Vec3 compress_scale = Vec3::sReplicate(COMPONENT_MASK) / Vec3::sMax(bounds.GetSize(), Vec3::sReplicate(1.0e-20f));
for (uint32 v : mVertices)
{
UVec4 c = ((Vec3(inVertices[v]) - bounds.mMin) * compress_scale + Vec3::sReplicate(0.5f)).ToInt();
JPH_ASSERT(c.GetX() <= COMPONENT_MASK);
JPH_ASSERT(c.GetY() <= COMPONENT_MASK);
JPH_ASSERT(c.GetZ() <= COMPONENT_MASK);
// X + low bits of Y go in the first word, Z + remaining high bits of Y in the second (see EVertexXY / EVertexZY)
vertices->mVertexXY = c.GetX() + (c.GetY() << COMPONENT_Y1);
vertices->mVertexZY = c.GetZ() + ((c.GetY() >> COMPONENT_Y1_BITS) << COMPONENT_Y2);
++vertices;
}
// Store decompression information
bounds.mMin.StoreFloat3(&ioHeader->mOffset);
(bounds.GetSize() / Vec3::sReplicate(COMPONENT_MASK)).StoreFloat3(&ioHeader->mScale);
}
private:
using VertexMap = Array<uint32>;
uint32 mVertexCount = 0; ///< Number of vertices calculated during PreparePack
size_t mVerticesStartIdx = 0; ///< Start of the vertices in the output buffer, calculated during PreparePack
Array<uint32> mVertices; ///< Output vertices as an index into the original vertex list (inVertices), sorted according to occurrence
VertexMap mVertexMap; ///< Maps from the original mesh vertex index (inVertices) to the index in our output vertices (mVertices)
};
/// This class is used to decode and decompress triangle data packed by the EncodingContext
class DecodingContext
{
private:
/// Private helper function to unpack the 1 vertex of 4 triangles (outX contains the x coordinate of triangle 0 .. 3 etc.)
JPH_INLINE void Unpack(const VertexData *inVertices, UVec4Arg inIndex, Vec4 &outX, Vec4 &outY, Vec4 &outZ) const
{
// Get compressed data (gather one 32-bit word per triangle, stride of 8 bytes = sizeof(VertexData))
UVec4 c1 = UVec4::sGatherInt4<8>(&inVertices->mVertexXY, inIndex);
UVec4 c2 = UVec4::sGatherInt4<8>(&inVertices->mVertexZY, inIndex);
// Unpack the x y and z component
// Y is split over both words: its low bits live in c1, its high bits in c2 (see EVertexXY / EVertexZY)
UVec4 xc = UVec4::sAnd(c1, UVec4::sReplicate(COMPONENT_MASK));
UVec4 yc = UVec4::sOr(c1.LogicalShiftRight<COMPONENT_Y1>(), c2.LogicalShiftRight<COMPONENT_Y2>().LogicalShiftLeft<COMPONENT_Y1_BITS>());
UVec4 zc = UVec4::sAnd(c2, UVec4::sReplicate(COMPONENT_MASK));
// Convert to float, undoing the quantization: position = compressed * scale + offset
outX = Vec4::sFusedMultiplyAdd(xc.ToFloat(), mScaleX, mOffsetX);
outY = Vec4::sFusedMultiplyAdd(yc.ToFloat(), mScaleY, mOffsetY);
outZ = Vec4::sFusedMultiplyAdd(zc.ToFloat(), mScaleZ, mOffsetZ);
}
/// Private helper function to unpack 4 triangles from a triangle block
JPH_INLINE void Unpack(const TriangleBlock *inBlock, const VertexData *inVertices, Vec4 &outX1, Vec4 &outY1, Vec4 &outZ1, Vec4 &outX2, Vec4 &outY2, Vec4 &outZ2, Vec4 &outX3, Vec4 &outY3, Vec4 &outZ3) const
{
// Get the indices for the three vertices (reads 4 bytes extra, but these are the flags so that's ok)
UVec4 indices = UVec4::sLoadInt4(reinterpret_cast<const uint32 *>(&inBlock->mIndices[0]));
UVec4 iv1 = indices.Expand4Byte0();
UVec4 iv2 = indices.Expand4Byte4();
UVec4 iv3 = indices.Expand4Byte8();
#ifdef JPH_CPU_BIG_ENDIAN
// On big endian systems we need to reverse the bytes
iv1 = iv1.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>();
iv2 = iv2.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>();
iv3 = iv3.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>();
#endif
// Decompress the triangle data
Unpack(inVertices, iv1, outX1, outY1, outZ1);
Unpack(inVertices, iv2, outX2, outY2, outZ2);
Unpack(inVertices, iv3, outX3, outY3, outZ3);
}
public:
/// Constructor, loads the dequantization offset and scale from the triangle header written by EncodingContext::Finalize
JPH_INLINE explicit DecodingContext(const TriangleHeader *inHeader) :
mOffsetX(Vec4::sReplicate(inHeader->mOffset.x)),
mOffsetY(Vec4::sReplicate(inHeader->mOffset.y)),
mOffsetZ(Vec4::sReplicate(inHeader->mOffset.z)),
mScaleX(Vec4::sReplicate(inHeader->mScale.x)),
mScaleY(Vec4::sReplicate(inHeader->mScale.y)),
mScaleZ(Vec4::sReplicate(inHeader->mScale.z))
{
}
/// Unpacks triangles in the format t1v1,t1v2,t1v3, t2v1,t2v2,t2v3, ...
JPH_INLINE void Unpack(const void *inTriangleStart, uint32 inNumTriangles, Vec3 *outTriangles) const
{
JPH_ASSERT(inNumTriangles > 0);
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const VertexData *vertices = header->GetVertexData();
const TriangleBlock *t = header->GetTriangleBlock();
const TriangleBlock *end = t + ((inNumTriangles + 3) >> 2); // Blocks hold 4 triangles, round up
int triangles_left = inNumTriangles;
do
{
// Unpack the vertices for 4 triangles
Vec4 v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z;
Unpack(t, vertices, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
// Transpose it so we get normal vectors
Mat44 v1 = Mat44(v1x, v1y, v1z, Vec4::sZero()).Transposed();
Mat44 v2 = Mat44(v2x, v2y, v2z, Vec4::sZero()).Transposed();
Mat44 v3 = Mat44(v3x, v3y, v3z, Vec4::sZero()).Transposed();
// Store triangle data (padding triangles in the last block are skipped via triangles_left)
for (int i = 0; i < 4 && triangles_left > 0; ++i, --triangles_left)
{
*outTriangles++ = v1.GetColumn3(i);
*outTriangles++ = v2.GetColumn3(i);
*outTriangles++ = v3.GetColumn3(i);
}
++t;
}
while (t < end);
}
/// Tests a ray against the packed triangles.
/// Returns the closest hit fraction found (inClosest if nothing closer was hit) and the corresponding triangle index in outClosestTriangleIndex.
JPH_INLINE float TestRay(Vec3Arg inRayOrigin, Vec3Arg inRayDirection, const void *inTriangleStart, uint32 inNumTriangles, float inClosest, uint32 &outClosestTriangleIndex) const
{
JPH_ASSERT(inNumTriangles > 0);
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const VertexData *vertices = header->GetVertexData();
const TriangleBlock *t = header->GetTriangleBlock();
const TriangleBlock *end = t + ((inNumTriangles + 3) >> 2);
Vec4 closest = Vec4::sReplicate(inClosest);
UVec4 closest_triangle_idx = UVec4::sZero();
UVec4 start_triangle_idx = UVec4::sZero();
do
{
// Unpack the vertices for 4 triangles
Vec4 v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z;
Unpack(t, vertices, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
// Perform ray vs triangle test (4 triangles at a time)
Vec4 distance = RayTriangle4(inRayOrigin, inRayDirection, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
// Update closest with the smaller values
UVec4 smaller = Vec4::sLess(distance, closest);
closest = Vec4::sSelect(closest, distance, smaller);
// Update triangle index with the smallest values
UVec4 triangle_idx = start_triangle_idx + UVec4(0, 1, 2, 3);
closest_triangle_idx = UVec4::sSelect(closest_triangle_idx, triangle_idx, smaller);
// Next block
++t;
start_triangle_idx += UVec4::sReplicate(4);
}
while (t < end);
// Get the smallest component (sorts the index vector along with the distances)
Vec4::sSort4(closest, closest_triangle_idx);
outClosestTriangleIndex = closest_triangle_idx.GetX();
return closest.GetX();
}
/// Decode a single triangle
inline void GetTriangle(const void *inTriangleStart, uint32 inTriangleIdx, Vec3 &outV1, Vec3 &outV2, Vec3 &outV3) const
{
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const VertexData *vertices = header->GetVertexData();
const TriangleBlock *block = header->GetTriangleBlock() + (inTriangleIdx >> 2); // 4 triangles per block
uint32 block_triangle_idx = inTriangleIdx & 0b11;
// Get the 3 vertices
const VertexData &v1 = vertices[block->mIndices[0][block_triangle_idx]];
const VertexData &v2 = vertices[block->mIndices[1][block_triangle_idx]];
const VertexData &v3 = vertices[block->mIndices[2][block_triangle_idx]];
// Gather the compressed words of the 3 vertices into vectors (4th lane unused)
UVec4 c1(v1.mVertexXY, v2.mVertexXY, v3.mVertexXY, 0);
UVec4 c2(v1.mVertexZY, v2.mVertexZY, v3.mVertexZY, 0);
// Unpack the x y and z component
UVec4 xc = UVec4::sAnd(c1, UVec4::sReplicate(COMPONENT_MASK));
UVec4 yc = UVec4::sOr(c1.LogicalShiftRight<COMPONENT_Y1>(), c2.LogicalShiftRight<COMPONENT_Y2>().LogicalShiftLeft<COMPONENT_Y1_BITS>());
UVec4 zc = UVec4::sAnd(c2, UVec4::sReplicate(COMPONENT_MASK));
// Convert to float
Vec4 vx = Vec4::sFusedMultiplyAdd(xc.ToFloat(), mScaleX, mOffsetX);
Vec4 vy = Vec4::sFusedMultiplyAdd(yc.ToFloat(), mScaleY, mOffsetY);
Vec4 vz = Vec4::sFusedMultiplyAdd(zc.ToFloat(), mScaleZ, mOffsetZ);
// Transpose it so we get normal vectors
Mat44 trans = Mat44(vx, vy, vz, Vec4::sZero()).Transposed();
outV1 = trans.GetAxisX();
outV2 = trans.GetAxisY();
outV3 = trans.GetAxisZ();
}
/// Get user data for a triangle
JPH_INLINE uint32 GetUserData(const void *inTriangleStart, uint32 inTriangleIdx) const
{
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const uint32 *user_data = header->GetUserData();
// GetUserData() returns nullptr when the mesh was packed without user data
return user_data != nullptr? user_data[inTriangleIdx] : 0;
}
/// Get flags for entire triangle block
JPH_INLINE static void sGetFlags(const void *inTriangleStart, uint32 inNumTriangles, uint8 *outTriangleFlags)
{
JPH_ASSERT(inNumTriangles > 0);
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const TriangleBlock *t = header->GetTriangleBlock();
const TriangleBlock *end = t + ((inNumTriangles + 3) >> 2);
int triangles_left = inNumTriangles;
do
{
for (int i = 0; i < 4 && triangles_left > 0; ++i, --triangles_left)
*outTriangleFlags++ = t->mFlags[i];
++t;
}
while (t < end);
}
/// Get flags for a particular triangle
JPH_INLINE static uint8 sGetFlags(const void *inTriangleStart, int inTriangleIndex)
{
const TriangleBlockHeader *header = reinterpret_cast<const TriangleBlockHeader *>(inTriangleStart);
const TriangleBlock *first_block = header->GetTriangleBlock();
return first_block[inTriangleIndex >> 2].mFlags[inTriangleIndex & 0b11];
}
/// Unpacks triangles and flags, convenience function
JPH_INLINE void Unpack(const void *inTriangleStart, uint32 inNumTriangles, Vec3 *outTriangles, uint8 *outTriangleFlags) const
{
Unpack(inTriangleStart, inNumTriangles, outTriangles);
sGetFlags(inTriangleStart, inNumTriangles, outTriangleFlags);
}
private:
// Dequantization constants replicated across all 4 SIMD lanes
Vec4 mOffsetX;
Vec4 mOffsetY;
Vec4 mOffsetZ;
Vec4 mScaleX;
Vec4 mScaleY;
Vec4 mScaleZ;
};
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,36 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_CPU_COMPUTE
#include <Jolt/Compute/CPU/ComputeBufferCPU.h>
JPH_NAMESPACE_BEGIN
ComputeBufferCPU::ComputeBufferCPU(EType inType, uint64 inSize, uint inStride, const void *inData) :
	ComputeBuffer(inType, inSize, inStride)
{
	// Total size of the backing store in bytes
	const size_t num_bytes = size_t(mSize) * mStride;
	mData = Allocate(num_bytes);

	// Copy the initial contents when provided (otherwise the memory is left uninitialized)
	if (inData != nullptr)
		memcpy(mData, inData, num_bytes);
}
ComputeBufferCPU::~ComputeBufferCPU()
{
// Release the backing store that was allocated in the constructor
Free(mData);
}
ComputeBufferResult ComputeBufferCPU::CreateReadBackBuffer() const
{
// The CPU can read this buffer's memory directly, so no separate read-back copy is needed:
// the buffer acts as its own read-back buffer. The const_cast is needed because the result
// type holds a non-const pointer; callers are expected to only read from it.
ComputeBufferResult result;
result.Set(const_cast<ComputeBufferCPU *>(this));
return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,36 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeBuffer.h>
#ifdef JPH_USE_CPU_COMPUTE
JPH_NAMESPACE_BEGIN
/// Buffer that can be used with the CPU compute system.
/// Backed by a single block of memory obtained via Allocate(); mapping is trivial since the CPU can access it directly.
class JPH_EXPORT ComputeBufferCPU final : public ComputeBuffer
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Constructor / destructor
ComputeBufferCPU(EType inType, uint64 inSize, uint inStride, const void *inData);
virtual ~ComputeBufferCPU() override;
// See: ComputeBuffer::CreateReadBackBuffer. Returns this buffer itself since its memory is directly readable.
ComputeBufferResult CreateReadBackBuffer() const override;
/// Direct access to the underlying memory
void * GetData() const { return mData; }
private:
// Mapping is a no-op for a CPU buffer, the backing memory is returned directly
virtual void * MapInternal(EMode inMode) override { return mData; }
virtual void UnmapInternal() override { /* Nothing to do */ }
void * mData; ///< Backing store, allocated in the constructor and freed in the destructor
};
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,101 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_CPU_COMPUTE
#include <Jolt/Compute/CPU/ComputeQueueCPU.h>
#include <Jolt/Compute/CPU/ComputeShaderCPU.h>
#include <Jolt/Compute/CPU/ComputeBufferCPU.h>
#include <Jolt/Compute/CPU/ShaderWrapper.h>
#include <Jolt/Compute/CPU/HLSLToCPP.h>
JPH_NAMESPACE_BEGIN
ComputeQueueCPU::~ComputeQueueCPU()
{
// SetShader() sets these and Dispatch() clears them again, so both should be null
// by the time the queue is destroyed (i.e. no dangling un-dispatched shader state)
JPH_ASSERT(mShader == nullptr && mWrapper == nullptr);
}
void ComputeQueueCPU::SetShader(const ComputeShader *inShader)
{
JPH_ASSERT(mShader == nullptr && mWrapper == nullptr);
mShader = static_cast<const ComputeShaderCPU *>(inShader);
mWrapper = mShader->CreateWrapper();
}
void ComputeQueueCPU::SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::ConstantBuffer);
const ComputeBufferCPU *buffer = static_cast<const ComputeBufferCPU *>(inBuffer);
mWrapper->Bind(inName, buffer->GetData(), buffer->GetSize() * buffer->GetStride());
mUsedBuffers.insert(buffer);
}
void ComputeQueueCPU::SetBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::UploadBuffer || inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
const ComputeBufferCPU *buffer = static_cast<const ComputeBufferCPU *>(inBuffer);
mWrapper->Bind(inName, buffer->GetData(), buffer->GetSize() * buffer->GetStride());
mUsedBuffers.insert(buffer);
}
void ComputeQueueCPU::SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
const ComputeBufferCPU *buffer = static_cast<const ComputeBufferCPU *>(inBuffer);
mWrapper->Bind(inName, buffer->GetData(), buffer->GetSize() * buffer->GetStride());
mUsedBuffers.insert(buffer);
}
void ComputeQueueCPU::ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc)
{
// The CPU backend has no separate device memory: ComputeBufferCPU::CreateReadBackBuffer
// returns the source buffer itself, so there is no copy to schedule here
/* Nothing to read back */
}
void ComputeQueueCPU::Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ)
{
	// Precondition: SetShader() must have been called before dispatching,
	// otherwise mShader / mWrapper below would be dereferenced as null
	JPH_ASSERT(mShader != nullptr && mWrapper != nullptr);

	// Total number of threads per axis = thread groups * group size (mirrors GPU dispatch semantics)
	uint nx = inThreadGroupsX * mShader->GetGroupSizeX();
	uint ny = inThreadGroupsY * mShader->GetGroupSizeY();
	uint nz = inThreadGroupsZ * mShader->GetGroupSizeZ();

	// Run every thread serially, passing the dispatch thread id to the shader entry point
	for (uint z = 0; z < nz; ++z)
		for (uint y = 0; y < ny; ++y)
			for (uint x = 0; x < nx; ++x)
			{
				HLSLToCPP::uint3 tid { x, y, z };
				mWrapper->Main(tid);
			}

	// The work completed synchronously, release the shader state and the buffers we retained
	delete mWrapper;
	mWrapper = nullptr;
	mUsedBuffers.clear();
	mShader = nullptr;
}
void ComputeQueueCPU::Execute()
{
// The work already ran synchronously inside Dispatch(), so there is nothing queued to submit
/* Nothing to do */
}
void ComputeQueueCPU::Wait()
{
// Dispatch() is synchronous for the CPU backend, so all work has already completed
/* Nothing to do */
}
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,43 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeQueue.h>
#ifdef JPH_USE_CPU_COMPUTE
#include <Jolt/Compute/CPU/ComputeShaderCPU.h>
#include <Jolt/Core/UnorderedSet.h>
JPH_NAMESPACE_BEGIN
/// A command queue for the CPU compute system.
/// Unlike a GPU queue, the work runs synchronously inside Dispatch(); Execute() and Wait() are no-ops.
class JPH_EXPORT ComputeQueueCPU final : public ComputeQueue
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Destructor
virtual ~ComputeQueueCPU() override;
// See: ComputeQueue
virtual void SetShader(const ComputeShader *inShader) override;
virtual void SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
virtual void SetBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
virtual void SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier = EBarrier::Yes) override;
virtual void ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc) override;
virtual void Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ) override;
virtual void Execute() override;
virtual void Wait() override;
private:
RefConst<ComputeShaderCPU> mShader = nullptr; ///< Current active shader, set by SetShader and released by Dispatch
ShaderWrapper * mWrapper = nullptr; ///< The active shader wrapper, created by SetShader and deleted by Dispatch
UnorderedSet<RefConst<ComputeBuffer>> mUsedBuffers; ///< Buffers that are in use by the current execution, these will be retained until execution is finished so that we don't free buffers that are in use
};
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,42 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeShader.h>
#ifdef JPH_USE_CPU_COMPUTE
JPH_NAMESPACE_BEGIN
class ShaderWrapper;
/// Compute shader handle for CPU compute.
/// Holds a factory function that creates a fresh ShaderWrapper instance per dispatch.
class JPH_EXPORT ComputeShaderCPU : public ComputeShader
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Signature of the factory function that instantiates the shader wrapper
using CreateShader = ShaderWrapper *(*)();
/// Constructor
ComputeShaderCPU(CreateShader inCreateShader, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) :
ComputeShader(inGroupSizeX, inGroupSizeY, inGroupSizeZ),
mCreateShader(inCreateShader)
{
}
/// Create an instance of the shader wrapper (the caller takes ownership, see ComputeQueueCPU::Dispatch which deletes it)
ShaderWrapper * CreateWrapper() const
{
return mCreateShader();
}
private:
CreateShader mCreateShader; ///< Factory that instantiates the wrapper
};
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,56 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_CPU_COMPUTE
#include <Jolt/Compute/CPU/ComputeSystemCPU.h>
#include <Jolt/Compute/CPU/ComputeQueueCPU.h>
#include <Jolt/Compute/CPU/ComputeBufferCPU.h>
JPH_NAMESPACE_BEGIN
// RTTI definition for ComputeSystemCPU, registering ComputeSystem as its base class
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemCPU)
{
JPH_ADD_BASE_CLASS(ComputeSystemCPU, ComputeSystem)
}
ComputeShaderResult ComputeSystemCPU::CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ)
{
	ComputeShaderResult result;

	// Look up the factory function that was registered under this name (see RegisterShader)
	ShaderRegistry::const_iterator entry = mShaderRegistry.find(inName);
	if (entry == mShaderRegistry.end())
	{
		result.SetError("Compute shader not found");
		return result;
	}

	// Wrap the factory and the thread group dimensions in a shader handle
	result.Set(new ComputeShaderCPU(entry->second, inGroupSizeX, inGroupSizeY, inGroupSizeZ));
	return result;
}
/// Create a CPU-side compute buffer. This implementation never reports an error.
ComputeBufferResult ComputeSystemCPU::CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData)
{
	// Wrap a newly allocated CPU buffer in a result object
	ComputeBufferResult buffer_result;
	buffer_result.Set(new ComputeBufferCPU(inType, inSize, inStride, inData));
	return buffer_result;
}
/// Create a queue that executes compute work on the CPU. This implementation never reports an error.
ComputeQueueResult ComputeSystemCPU::CreateComputeQueue()
{
	// Wrap a newly allocated CPU queue in a result object
	ComputeQueueResult queue_result;
	queue_result.Set(new ComputeQueueCPU());
	return queue_result;
}
/// Factory function for the CPU compute backend. This implementation never reports an error.
ComputeSystemResult CreateComputeSystemCPU()
{
	// Wrap a newly allocated CPU compute system in a result object
	ComputeSystemResult system_result;
	system_result.Set(new ComputeSystemCPU());
	return system_result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,52 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeSystem.h>
#ifdef JPH_USE_CPU_COMPUTE
#include <Jolt/Core/UnorderedMap.h>
#include <Jolt/Compute/CPU/ComputeShaderCPU.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the CPU
/// This is intended mainly for debugging purposes and is not optimized for performance
class JPH_EXPORT ComputeSystemCPU : public ComputeSystem
{
public:
JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemCPU)
// See: ComputeSystem
virtual ComputeShaderResult CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) override;
virtual ComputeBufferResult CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData = nullptr) override;
virtual ComputeQueueResult CreateComputeQueue() override;
/// Factory function type that creates the C++ wrapper object for a shader (see ComputeShaderCPU)
using CreateShader = ComputeShaderCPU::CreateShader;
/// Register a shader factory under a name so that CreateComputeShader can find it later.
/// NOTE(review): the registry key is a string_view referencing inName's characters, so inName must
/// outlive this system (string literals are fine) -- confirm callers never pass temporary strings.
void RegisterShader(const char *inName, CreateShader inCreateShader)
{
mShaderRegistry[inName] = inCreateShader;
}
private:
using ShaderRegistry = UnorderedMap<string_view, CreateShader>;
ShaderRegistry mShaderRegistry; ///< Maps shader name -> factory function
};
// Internal helpers:
// JPH_SHADER_WRAPPER_FUNCTION_NAME(Foo) expands to the identifier RegisterShaderFoo, and
// JPH_SHADER_WRAPPER_FUNCTION declares/defines that function taking the system to register with
#define JPH_SHADER_WRAPPER_FUNCTION_NAME(name) RegisterShader##name
#define JPH_SHADER_WRAPPER_FUNCTION(sys, name) void JPH_EXPORT JPH_SHADER_WRAPPER_FUNCTION_NAME(name)(ComputeSystemCPU *sys)
/// Macro to declare a shader register function. Usable outside the JPH namespace (it opens namespace JPH
/// and forward declares ComputeSystemCPU); the empty first argument leaves the parameter unnamed.
#define JPH_DECLARE_REGISTER_SHADER(name) namespace JPH { class ComputeSystemCPU; JPH_SHADER_WRAPPER_FUNCTION(, name); }
/// Macro to register a shader with a ComputeSystemCPU instance
#define JPH_REGISTER_SHADER(sys, name) JPH::JPH_SHADER_WRAPPER_FUNCTION_NAME(name)(sys)
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,525 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Emulates HLSL vector types and operations in C++.
/// Note doesn't emulate things like barriers and group shared memory.
namespace HLSLToCPP {
using std::sqrt;
using std::min;
using std::max;
using std::round;
//////////////////////////////////////////////////////////////////////////////////////////
// float2
//////////////////////////////////////////////////////////////////////////////////////////
/// 2-component float vector emulating HLSL's float2.
/// Components are stored contiguously so operator [] can index through the address of x.
struct float2
{
// Constructors
inline float2() = default;
constexpr float2(float inX, float inY) : x(inX), y(inY) { }
// Splat constructor: replicates the scalar into both components
explicit constexpr float2(float inS) : x(inS), y(inS) { }
// Operators
constexpr float2 & operator += (const float2 &inRHS) { x += inRHS.x; y += inRHS.y; return *this; }
constexpr float2 & operator -= (const float2 &inRHS) { x -= inRHS.x; y -= inRHS.y; return *this; }
constexpr float2 & operator *= (float inRHS) { x *= inRHS; y *= inRHS; return *this; }
constexpr float2 & operator /= (float inRHS) { x /= inRHS; y /= inRHS; return *this; }
constexpr float2 & operator *= (const float2 &inRHS) { x *= inRHS.x; y *= inRHS.y; return *this; }
constexpr float2 & operator /= (const float2 &inRHS) { x /= inRHS.x; y /= inRHS.y; return *this; }
// Equality (exact component-wise floating point comparison)
constexpr bool operator == (const float2 &inRHS) const { return x == inRHS.x && y == inRHS.y; }
constexpr bool operator != (const float2 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 or 1)
const float & operator [] (uint inIndex) const { return (&x)[inIndex]; }
float & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const float2 swizzle_xy() const { return float2(x, y); }
const float2 swizzle_yx() const { return float2(y, x); }
float x, y;
};
// Operators
constexpr float2 operator - (const float2 &inA) { return float2(-inA.x, -inA.y); }
constexpr float2 operator + (const float2 &inA, const float2 &inB) { return float2(inA.x + inB.x, inA.y + inB.y); }
constexpr float2 operator - (const float2 &inA, const float2 &inB) { return float2(inA.x - inB.x, inA.y - inB.y); }
constexpr float2 operator * (const float2 &inA, const float2 &inB) { return float2(inA.x * inB.x, inA.y * inB.y); }
constexpr float2 operator / (const float2 &inA, const float2 &inB) { return float2(inA.x / inB.x, inA.y / inB.y); }
constexpr float2 operator * (const float2 &inA, float inS) { return float2(inA.x * inS, inA.y * inS); }
constexpr float2 operator * (float inS, const float2 &inA) { return inA * inS; }
constexpr float2 operator / (const float2 &inA, float inS) { return float2(inA.x / inS, inA.y / inS); }
// Dot product
constexpr float dot(const float2 &inA, const float2 &inB) { return inA.x * inB.x + inA.y * inB.y; }
// Component-wise minimum
constexpr float2 min(const float2 &inA, const float2 &inB) { return float2(min(inA.x, inB.x), min(inA.y, inB.y)); }
// Component-wise maximum
constexpr float2 max(const float2 &inA, const float2 &inB) { return float2(max(inA.x, inB.x), max(inA.y, inB.y)); }
// Length (Euclidean norm)
inline float length(const float2 &inV) { return sqrt(dot(inV, inV)); }
// Normalization (divides by length, no guard against zero-length input)
inline float2 normalize(const float2 &inV) { return inV / length(inV); }
// Round each component to the nearest integral value (result stays a float2)
inline float2 round(const float2 &inV) { return float2(round(inV.x), round(inV.y)); }
//////////////////////////////////////////////////////////////////////////////////////////
// float3
//////////////////////////////////////////////////////////////////////////////////////////
struct uint3;
/// 3-component float vector emulating HLSL's float3.
/// Components are stored contiguously so operator [] can index through the address of x.
struct float3
{
// Constructors
inline float3() = default;
constexpr float3(const float2 &inV, float inZ) : x(inV.x), y(inV.y), z(inZ) { }
constexpr float3(float inX, float inY, float inZ) : x(inX), y(inY), z(inZ) { }
// Splat constructor: replicates the scalar into all three components
explicit constexpr float3(float inS) : x(inS), y(inS), z(inS) { }
// Conversion from uint3, defined at the end of this header once uint3 is complete
explicit constexpr float3(const uint3 &inV);
// Operators
constexpr float3 & operator += (const float3 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; return *this; }
constexpr float3 & operator -= (const float3 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; return *this; }
constexpr float3 & operator *= (float inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; return *this; }
constexpr float3 & operator /= (float inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; return *this; }
constexpr float3 & operator *= (const float3 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; return *this; }
constexpr float3 & operator /= (const float3 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; return *this; }
// Equality (exact component-wise floating point comparison)
constexpr bool operator == (const float3 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z; }
constexpr bool operator != (const float3 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 2)
const float & operator [] (uint inIndex) const { return (&x)[inIndex]; }
float & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const float2 swizzle_xy() const { return float2(x, y); }
const float2 swizzle_yx() const { return float2(y, x); }
const float3 swizzle_xyz() const { return float3(x, y, z); }
const float3 swizzle_xzy() const { return float3(x, z, y); }
const float3 swizzle_yxz() const { return float3(y, x, z); }
const float3 swizzle_yzx() const { return float3(y, z, x); }
const float3 swizzle_zxy() const { return float3(z, x, y); }
const float3 swizzle_zyx() const { return float3(z, y, x); }
float x, y, z;
};
// Operators
constexpr float3 operator - (const float3 &inA) { return float3(-inA.x, -inA.y, -inA.z); }
constexpr float3 operator + (const float3 &inA, const float3 &inB) { return float3(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z); }
constexpr float3 operator - (const float3 &inA, const float3 &inB) { return float3(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z); }
constexpr float3 operator * (const float3 &inA, const float3 &inB) { return float3(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z); }
constexpr float3 operator / (const float3 &inA, const float3 &inB) { return float3(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z); }
constexpr float3 operator * (const float3 &inA, float inS) { return float3(inA.x * inS, inA.y * inS, inA.z * inS); }
constexpr float3 operator * (float inS, const float3 &inA) { return inA * inS; }
constexpr float3 operator / (const float3 &inA, float inS) { return float3(inA.x / inS, inA.y / inS, inA.z / inS); }
// Dot product
constexpr float dot(const float3 &inA, const float3 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z; }
// Component-wise minimum
constexpr float3 min(const float3 &inA, const float3 &inB) { return float3(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z)); }
// Component-wise maximum
constexpr float3 max(const float3 &inA, const float3 &inB) { return float3(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z)); }
// Length (Euclidean norm)
inline float length(const float3 &inV) { return sqrt(dot(inV, inV)); }
// Normalization (divides by length, no guard against zero-length input)
inline float3 normalize(const float3 &inV) { return inV / length(inV); }
// Round each component to the nearest integral value (result stays a float3)
inline float3 round(const float3 &inV) { return float3(round(inV.x), round(inV.y), round(inV.z)); }
// Cross product (right-handed: x cross y = z)
constexpr float3 cross(const float3 &inA, const float3 &inB) { return float3(inA.y * inB.z - inA.z * inB.y, inA.z * inB.x - inA.x * inB.z, inA.x * inB.y - inA.y * inB.x); }
//////////////////////////////////////////////////////////////////////////////////////////
// float4
//////////////////////////////////////////////////////////////////////////////////////////
struct int4;
/// 4-component float vector emulating HLSL's float4.
/// Components are stored contiguously so operator [] can index through the address of x.
struct float4
{
// Constructors
inline float4() = default;
constexpr float4(const float3 &inV, float inW) : x(inV.x), y(inV.y), z(inV.z), w(inW) { }
constexpr float4(float inX, float inY, float inZ, float inW) : x(inX), y(inY), z(inZ), w(inW) { }
// Splat constructor: replicates the scalar into all four components
explicit constexpr float4(float inS) : x(inS), y(inS), z(inS), w(inS) { }
// Conversion from int4, defined at the end of this header once int4 is complete
explicit constexpr float4(const int4 &inV);
// Operators
constexpr float4 & operator += (const float4 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; w += inRHS.w; return *this; }
constexpr float4 & operator -= (const float4 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; w -= inRHS.w; return *this; }
constexpr float4 & operator *= (float inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; w *= inRHS; return *this; }
constexpr float4 & operator /= (float inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; w /= inRHS; return *this; }
constexpr float4 & operator *= (const float4 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; w *= inRHS.w; return *this; }
constexpr float4 & operator /= (const float4 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; w /= inRHS.w; return *this; }
// Equality (exact component-wise floating point comparison)
constexpr bool operator == (const float4 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z && w == inRHS.w; }
constexpr bool operator != (const float4 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 3)
const float & operator [] (uint inIndex) const { return (&x)[inIndex]; }
float & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const float2 swizzle_xy() const { return float2(x, y); }
const float2 swizzle_yx() const { return float2(y, x); }
const float3 swizzle_xyz() const { return float3(x, y, z); }
const float3 swizzle_xzy() const { return float3(x, z, y); }
const float3 swizzle_yxz() const { return float3(y, x, z); }
const float3 swizzle_yzx() const { return float3(y, z, x); }
const float3 swizzle_zxy() const { return float3(z, x, y); }
const float3 swizzle_zyx() const { return float3(z, y, x); }
const float4 swizzle_xywz() const { return float4(x, y, w, z); }
const float4 swizzle_xwyz() const { return float4(x, w, y, z); }
const float4 swizzle_wxyz() const { return float4(w, x, y, z); }
float x, y, z, w;
};
// Operators
constexpr float4 operator - (const float4 &inA) { return float4(-inA.x, -inA.y, -inA.z, -inA.w); }
constexpr float4 operator + (const float4 &inA, const float4 &inB) { return float4(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z, inA.w + inB.w); }
constexpr float4 operator - (const float4 &inA, const float4 &inB) { return float4(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z, inA.w - inB.w); }
constexpr float4 operator * (const float4 &inA, const float4 &inB) { return float4(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z, inA.w * inB.w); }
constexpr float4 operator / (const float4 &inA, const float4 &inB) { return float4(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z, inA.w / inB.w); }
constexpr float4 operator * (const float4 &inA, float inS) { return float4(inA.x * inS, inA.y * inS, inA.z * inS, inA.w * inS); }
constexpr float4 operator * (float inS, const float4 &inA) { return inA * inS; }
constexpr float4 operator / (const float4 &inA, float inS) { return float4(inA.x / inS, inA.y / inS, inA.z / inS, inA.w / inS); }
// Dot product
constexpr float dot(const float4 &inA, const float4 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z + inA.w * inB.w; }
// Component-wise minimum
constexpr float4 min(const float4 &inA, const float4 &inB) { return float4(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z), min(inA.w, inB.w)); }
// Component-wise maximum
constexpr float4 max(const float4 &inA, const float4 &inB) { return float4(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z), max(inA.w, inB.w)); }
// Length (Euclidean norm)
inline float length(const float4 &inV) { return sqrt(dot(inV, inV)); }
// Normalization (divides by length, no guard against zero-length input)
inline float4 normalize(const float4 &inV) { return inV / length(inV); }
// Round each component to the nearest integral value (result stays a float4)
inline float4 round(const float4 &inV) { return float4(round(inV.x), round(inV.y), round(inV.z), round(inV.w)); }
//////////////////////////////////////////////////////////////////////////////////////////
// uint3
//////////////////////////////////////////////////////////////////////////////////////////
/// 3-component unsigned integer vector emulating HLSL's uint3.
/// Components are stored contiguously so operator [] can index through the address of x.
struct uint3
{
inline uint3() = default;
constexpr uint3(uint32 inX, uint32 inY, uint32 inZ) : x(inX), y(inY), z(inZ) { }
// Conversion from float3 by truncation of each component
explicit constexpr uint3(const float3 &inV) : x(uint32(inV.x)), y(uint32(inV.y)), z(uint32(inV.z)) { }
// Operators (note: no unary minus, the components are unsigned; arithmetic wraps modulo 2^32)
constexpr uint3 & operator += (const uint3 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; return *this; }
constexpr uint3 & operator -= (const uint3 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; return *this; }
constexpr uint3 & operator *= (uint32 inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; return *this; }
constexpr uint3 & operator /= (uint32 inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; return *this; }
constexpr uint3 & operator *= (const uint3 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; return *this; }
constexpr uint3 & operator /= (const uint3 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; return *this; }
// Equality
constexpr bool operator == (const uint3 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z; }
constexpr bool operator != (const uint3 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 2)
const uint32 & operator [] (uint inIndex) const { return (&x)[inIndex]; }
uint32 & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const uint3 swizzle_xyz() const { return uint3(x, y, z); }
const uint3 swizzle_xzy() const { return uint3(x, z, y); }
const uint3 swizzle_yxz() const { return uint3(y, x, z); }
const uint3 swizzle_yzx() const { return uint3(y, z, x); }
const uint3 swizzle_zxy() const { return uint3(z, x, y); }
const uint3 swizzle_zyx() const { return uint3(z, y, x); }
uint32 x, y, z;
};
// Operators
constexpr uint3 operator + (const uint3 &inA, const uint3 &inB) { return uint3(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z); }
constexpr uint3 operator - (const uint3 &inA, const uint3 &inB) { return uint3(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z); }
constexpr uint3 operator * (const uint3 &inA, const uint3 &inB) { return uint3(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z); }
constexpr uint3 operator / (const uint3 &inA, const uint3 &inB) { return uint3(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z); }
constexpr uint3 operator * (const uint3 &inA, uint32 inS) { return uint3(inA.x * inS, inA.y * inS, inA.z * inS); }
constexpr uint3 operator * (uint32 inS, const uint3 &inA) { return inA * inS; }
constexpr uint3 operator / (const uint3 &inA, uint32 inS) { return uint3(inA.x / inS, inA.y / inS, inA.z / inS); }
// Dot product
constexpr uint32 dot(const uint3 &inA, const uint3 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z; }
// Component-wise minimum
constexpr uint3 min(const uint3 &inA, const uint3 &inB) { return uint3(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z)); }
// Component-wise maximum
constexpr uint3 max(const uint3 &inA, const uint3 &inB) { return uint3(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z)); }
//////////////////////////////////////////////////////////////////////////////////////////
// uint4
//////////////////////////////////////////////////////////////////////////////////////////
/// 4-component unsigned integer vector emulating HLSL's uint4.
/// Components are stored contiguously so operator [] can index through the address of x.
struct uint4
{
// Constructors
inline uint4() = default;
constexpr uint4(const uint3 &inV, uint32 inW) : x(inV.x), y(inV.y), z(inV.z), w(inW) { }
constexpr uint4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW) : x(inX), y(inY), z(inZ), w(inW) { }
// Splat constructor: replicates the scalar into all four components
explicit constexpr uint4(uint32 inS) : x(inS), y(inS), z(inS), w(inS) { }
// Operators (note: no unary minus, the components are unsigned; arithmetic wraps modulo 2^32)
constexpr uint4 & operator += (const uint4 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; w += inRHS.w; return *this; }
constexpr uint4 & operator -= (const uint4 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; w -= inRHS.w; return *this; }
constexpr uint4 & operator *= (uint32 inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; w *= inRHS; return *this; }
constexpr uint4 & operator /= (uint32 inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; w /= inRHS; return *this; }
constexpr uint4 & operator *= (const uint4 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; w *= inRHS.w; return *this; }
constexpr uint4 & operator /= (const uint4 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; w /= inRHS.w; return *this; }
// Equality
constexpr bool operator == (const uint4 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z && w == inRHS.w; }
constexpr bool operator != (const uint4 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 3)
const uint32 & operator [] (uint inIndex) const { return (&x)[inIndex]; }
uint32 & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const uint3 swizzle_xyz() const { return uint3(x, y, z); }
const uint3 swizzle_xzy() const { return uint3(x, z, y); }
const uint3 swizzle_yxz() const { return uint3(y, x, z); }
const uint3 swizzle_yzx() const { return uint3(y, z, x); }
const uint3 swizzle_zxy() const { return uint3(z, x, y); }
const uint3 swizzle_zyx() const { return uint3(z, y, x); }
const uint4 swizzle_xywz() const { return uint4(x, y, w, z); }
const uint4 swizzle_xwyz() const { return uint4(x, w, y, z); }
const uint4 swizzle_wxyz() const { return uint4(w, x, y, z); }
uint32 x, y, z, w;
};
// Operators
constexpr uint4 operator + (const uint4 &inA, const uint4 &inB) { return uint4(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z, inA.w + inB.w); }
constexpr uint4 operator - (const uint4 &inA, const uint4 &inB) { return uint4(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z, inA.w - inB.w); }
constexpr uint4 operator * (const uint4 &inA, const uint4 &inB) { return uint4(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z, inA.w * inB.w); }
constexpr uint4 operator / (const uint4 &inA, const uint4 &inB) { return uint4(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z, inA.w / inB.w); }
constexpr uint4 operator * (const uint4 &inA, uint32 inS) { return uint4(inA.x * inS, inA.y * inS, inA.z * inS, inA.w * inS); }
constexpr uint4 operator * (uint32 inS, const uint4 &inA) { return inA * inS; }
constexpr uint4 operator / (const uint4 &inA, uint32 inS) { return uint4(inA.x / inS, inA.y / inS, inA.z / inS, inA.w / inS); }
// Dot product
constexpr uint32 dot(const uint4 &inA, const uint4 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z + inA.w * inB.w; }
// Component-wise minimum
constexpr uint4 min(const uint4 &inA, const uint4 &inB) { return uint4(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z), min(inA.w, inB.w)); }
// Component-wise maximum
constexpr uint4 max(const uint4 &inA, const uint4 &inB) { return uint4(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z), max(inA.w, inB.w)); }
//////////////////////////////////////////////////////////////////////////////////////////
// int3
//////////////////////////////////////////////////////////////////////////////////////////
/// 3-component signed integer vector emulating HLSL's int3.
/// Components are stored contiguously so operator [] can index through the address of x.
struct int3
{
inline int3() = default;
constexpr int3(int inX, int inY, int inZ) : x(inX), y(inY), z(inZ) { }
// Conversion from float3 by truncation of each component
explicit constexpr int3(const float3 &inV) : x(int(inV.x)), y(int(inV.y)), z(int(inV.z)) { }
// Operators
constexpr int3 & operator += (const int3 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; return *this; }
constexpr int3 & operator -= (const int3 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; return *this; }
constexpr int3 & operator *= (int inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; return *this; }
constexpr int3 & operator /= (int inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; return *this; }
constexpr int3 & operator *= (const int3 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; return *this; }
constexpr int3 & operator /= (const int3 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; return *this; }
// Equality
constexpr bool operator == (const int3 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z; }
constexpr bool operator != (const int3 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 2)
const int & operator [] (uint inIndex) const { return (&x)[inIndex]; }
int & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const int3 swizzle_xyz() const { return int3(x, y, z); }
const int3 swizzle_xzy() const { return int3(x, z, y); }
const int3 swizzle_yxz() const { return int3(y, x, z); }
const int3 swizzle_yzx() const { return int3(y, z, x); }
const int3 swizzle_zxy() const { return int3(z, x, y); }
const int3 swizzle_zyx() const { return int3(z, y, x); }
int x, y, z;
};
// Operators
constexpr int3 operator - (const int3 &inA) { return int3(-inA.x, -inA.y, -inA.z); }
constexpr int3 operator + (const int3 &inA, const int3 &inB) { return int3(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z); }
constexpr int3 operator - (const int3 &inA, const int3 &inB) { return int3(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z); }
constexpr int3 operator * (const int3 &inA, const int3 &inB) { return int3(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z); }
constexpr int3 operator / (const int3 &inA, const int3 &inB) { return int3(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z); }
constexpr int3 operator * (const int3 &inA, int inS) { return int3(inA.x * inS, inA.y * inS, inA.z * inS); }
constexpr int3 operator * (int inS, const int3 &inA) { return inA * inS; }
constexpr int3 operator / (const int3 &inA, int inS) { return int3(inA.x / inS, inA.y / inS, inA.z / inS); }
// Dot product
constexpr int dot(const int3 &inA, const int3 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z; }
// Component-wise minimum
constexpr int3 min(const int3 &inA, const int3 &inB) { return int3(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z)); }
// Component-wise maximum
constexpr int3 max(const int3 &inA, const int3 &inB) { return int3(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z)); }
//////////////////////////////////////////////////////////////////////////////////////////
// int4
//////////////////////////////////////////////////////////////////////////////////////////
/// 4-component signed integer vector emulating HLSL's int4.
/// Components are stored contiguously so operator [] can index through the address of x.
struct int4
{
// Constructors
inline int4() = default;
constexpr int4(const int3 &inV, int inW) : x(inV.x), y(inV.y), z(inV.z), w(inW) { }
constexpr int4(int inX, int inY, int inZ, int inW) : x(inX), y(inY), z(inZ), w(inW) { }
// Splat constructor: replicates the scalar into all four components
explicit constexpr int4(int inS) : x(inS), y(inS), z(inS), w(inS) { }
// Conversion from float4 by truncation of each component
explicit constexpr int4(const float4 &inV) : x(int(inV.x)), y(int(inV.y)), z(int(inV.z)), w(int(inV.w)) { }
// Operators
constexpr int4 & operator += (const int4 &inRHS) { x += inRHS.x; y += inRHS.y; z += inRHS.z; w += inRHS.w; return *this; }
constexpr int4 & operator -= (const int4 &inRHS) { x -= inRHS.x; y -= inRHS.y; z -= inRHS.z; w -= inRHS.w; return *this; }
constexpr int4 & operator *= (int inRHS) { x *= inRHS; y *= inRHS; z *= inRHS; w *= inRHS; return *this; }
constexpr int4 & operator /= (int inRHS) { x /= inRHS; y /= inRHS; z /= inRHS; w /= inRHS; return *this; }
constexpr int4 & operator *= (const int4 &inRHS) { x *= inRHS.x; y *= inRHS.y; z *= inRHS.z; w *= inRHS.w; return *this; }
constexpr int4 & operator /= (const int4 &inRHS) { x /= inRHS.x; y /= inRHS.y; z /= inRHS.z; w /= inRHS.w; return *this; }
// Equality
constexpr bool operator == (const int4 &inRHS) const { return x == inRHS.x && y == inRHS.y && z == inRHS.z && w == inRHS.w; }
constexpr bool operator != (const int4 &inRHS) const { return !(*this == inRHS); }
// Component access (no bounds checking, inIndex must be 0 .. 3)
const int & operator [] (uint inIndex) const { return (&x)[inIndex]; }
int & operator [] (uint inIndex) { return (&x)[inIndex]; }
// Swizzling (note return value is const to prevent assignment to swizzled results)
const int3 swizzle_xyz() const { return int3(x, y, z); }
const int3 swizzle_xzy() const { return int3(x, z, y); }
const int3 swizzle_yxz() const { return int3(y, x, z); }
const int3 swizzle_yzx() const { return int3(y, z, x); }
const int3 swizzle_zxy() const { return int3(z, x, y); }
const int3 swizzle_zyx() const { return int3(z, y, x); }
const int4 swizzle_xywz() const { return int4(x, y, w, z); }
const int4 swizzle_xwyz() const { return int4(x, w, y, z); }
const int4 swizzle_wxyz() const { return int4(w, x, y, z); }
int x, y, z, w;
};
// Operators
constexpr int4 operator - (const int4 &inA) { return int4(-inA.x, -inA.y, -inA.z, -inA.w); }
constexpr int4 operator + (const int4 &inA, const int4 &inB) { return int4(inA.x + inB.x, inA.y + inB.y, inA.z + inB.z, inA.w + inB.w); }
constexpr int4 operator - (const int4 &inA, const int4 &inB) { return int4(inA.x - inB.x, inA.y - inB.y, inA.z - inB.z, inA.w - inB.w); }
constexpr int4 operator * (const int4 &inA, const int4 &inB) { return int4(inA.x * inB.x, inA.y * inB.y, inA.z * inB.z, inA.w * inB.w); }
constexpr int4 operator / (const int4 &inA, const int4 &inB) { return int4(inA.x / inB.x, inA.y / inB.y, inA.z / inB.z, inA.w / inB.w); }
constexpr int4 operator * (const int4 &inA, int inS) { return int4(inA.x * inS, inA.y * inS, inA.z * inS, inA.w * inS); }
constexpr int4 operator * (int inS, const int4 &inA) { return inA * inS; }
constexpr int4 operator / (const int4 &inA, int inS) { return int4(inA.x / inS, inA.y / inS, inA.z / inS, inA.w / inS); }
// Dot product
constexpr int dot(const int4 &inA, const int4 &inB) { return inA.x * inB.x + inA.y * inB.y + inA.z * inB.z + inA.w * inB.w; }
// Component-wise minimum
constexpr int4 min(const int4 &inA, const int4 &inB) { return int4(min(inA.x, inB.x), min(inA.y, inB.y), min(inA.z, inB.z), min(inA.w, inB.w)); }
// Component-wise maximum
constexpr int4 max(const int4 &inA, const int4 &inB) { return int4(max(inA.x, inB.x), max(inA.y, inB.y), max(inA.z, inB.z), max(inA.w, inB.w)); }
//////////////////////////////////////////////////////////////////////////////////////////
// Mat44
//////////////////////////////////////////////////////////////////////////////////////////
/// 4x4 float matrix stored as four float4 columns (see the 'Columns' accessor below)
struct Mat44
{
// Constructors
inline Mat44() = default;
constexpr Mat44(const float4 &inC0, const float4 &inC1, const float4 &inC2, const float4 &inC3) : c { inC0, inC1, inC2, inC3 } { }
// Columns (no bounds checking, inIndex must be 0 .. 3)
float4 & operator [] (uint inIndex) { return c[inIndex]; }
const float4 & operator [] (uint inIndex) const { return c[inIndex]; }
private:
float4 c[4];
};
//////////////////////////////////////////////////////////////////////////////////////////
// Other types
//////////////////////////////////////////////////////////////////////////////////////////
// Quaternions and planes are represented as plain 4-component vectors (x, y, z, w)
using Quat = float4;
using Plane = float4;
// Clamp value to the range [inMinValue, inMaxValue].
// Works for scalars and for the vector types above (component-wise through their min/max overloads).
template <class T>
constexpr T clamp(const T &inValue, const T &inMinValue, const T &inMaxValue)
{
return min(max(inValue, inMinValue), inMaxValue);
}
// Atomic add.
// Returns the post-add value (fetch_add yields the pre-add value, so inValue is added back).
// NOTE(review): reinterpret_casting a plain T to std::atomic<T> assumes both share size and layout;
// that holds on common implementations but is not guaranteed by the C++ standard -- confirm acceptable here.
template <class T>
T JPH_AtomicAdd(T &ioT, const T &inValue)
{
std::atomic<T> *value = reinterpret_cast<std::atomic<T> *>(&ioT);
return value->fetch_add(inValue) + inValue;
}
// Bitcast float4 to int4 (reinterprets the bits of each component, no numeric conversion)
inline int4 asint(const float4 &inV) { return int4(BitCast<int>(inV.x), BitCast<int>(inV.y), BitCast<int>(inV.z), BitCast<int>(inV.w)); }
// Functions that couldn't be declared earlier because they need the complete definitions of both types involved
constexpr float3::float3(const uint3 &inV) : x(float(inV.x)), y(float(inV.y)), z(float(inV.z)) { }
constexpr float4::float4(const int4 &inV) : x(float(inV.x)), y(float(inV.y)), z(float(inV.z)), w(float(inV.w)) { }
// Swizzle operators.
// These token-replacement macros make HLSL-style member swizzles (e.g. v.xyz) compile in C++ by
// rewriting them into the swizzle_*() member calls above. Note that the preprocessor replaces ANY
// occurrence of these identifiers after this point, so code included later must not use names
// like xy, xyz or wxyz for anything else.
#define xy swizzle_xy()
#define yx swizzle_yx()
#define xyz swizzle_xyz()
#define xzy swizzle_xzy()
#define yxz swizzle_yxz()
#define yzx swizzle_yzx()
#define zxy swizzle_zxy()
#define zyx swizzle_zyx()
#define xywz swizzle_xywz()
#define xwyz swizzle_xwyz()
#define wxyz swizzle_wxyz()
} // HLSLToCPP
JPH_NAMESPACE_END

View File

@ -0,0 +1,29 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_CPU_COMPUTE
JPH_NAMESPACE_BEGIN
namespace HLSLToCPP { struct uint3; }
/// Wraps a compute shader to allow calling it from C++.
/// Concrete wrappers are generated from the shader source via WrapShaderBegin.h / WrapShaderBindings.h.
class ShaderWrapper
{
public:
/// Virtual destructor so wrappers can be destroyed through this base class
virtual ~ShaderWrapper() = default;
/// Bind buffer to shader
/// @param inName Name of the binding as it appears in the shader
/// @param inData Pointer to the buffer contents (presumably kept alive by the caller for the duration of execution -- verify implementations)
/// @param inSize Size of the buffer in bytes
virtual void Bind(const char *inName, void *inData, uint64 inSize) = 0;
/// Execute a single shader thread
/// @param inThreadID Thread id for this invocation (see JPH_SHADER_PARAM_THREAD_ID in WrapShaderBegin.h)
virtual void Main(const HLSLToCPP::uint3 &inThreadID) = 0;
};
JPH_NAMESPACE_END
#endif // JPH_USE_CPU_COMPUTE

View File

@ -0,0 +1,75 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Core/HashCombine.h>
#include <Jolt/Compute/CPU/ComputeSystemCPU.h>
#include <Jolt/Compute/CPU/ShaderWrapper.h>
#include <Jolt/Compute/CPU/HLSLToCPP.h>
/// @cond INTERNAL
JPH_NAMESPACE_BEGIN
JPH_MSVC_SUPPRESS_WARNING(5031) // #pragma warning(pop): likely mismatch, popping warning state pushed in different file
// Tell the shader source that its macros are provided externally and that a C++ wrapper class is being generated
#define JPH_SHADER_OVERRIDE_MACROS
#define JPH_SHADER_GENERATE_WRAPPER
// A shader compile-time constant becomes a class-level C++ constant
#define JPH_SHADER_CONSTANT(type, name, value) inline static constexpr type name = value;
#define JPH_SHADER_CONSTANTS_BEGIN(type, name) struct type { alignas(16) int dummy; } name; // Ensure that the first constant is 16 byte aligned
#define JPH_SHADER_CONSTANTS_MEMBER(type, name) type c##name;
#define JPH_SHADER_CONSTANTS_END(type)
// Read-only / read-write shader buffers map to (const) pointers
#define JPH_SHADER_BUFFER(type) const type *
#define JPH_SHADER_RW_BUFFER(type) type *
// Buffer bindings become pointer members of the wrapper class (intended to be filled in by Bind, see WrapShaderBindings.h)
#define JPH_SHADER_BIND_BEGIN(name)
#define JPH_SHADER_BIND_END(name)
#define JPH_SHADER_BIND_BUFFER(type, name) const type *name = nullptr;
#define JPH_SHADER_BIND_RW_BUFFER(type, name) type *name = nullptr;
// The shader entry point becomes the wrapper's Main() override; the group size arguments are not used in the expansion
#define JPH_SHADER_FUNCTION_BEGIN(return_type, name, group_size_x, group_size_y, group_size_z) \
virtual void Main(
#define JPH_SHADER_PARAM_THREAD_ID(name) const HLSLToCPP::uint3 &name
#define JPH_SHADER_FUNCTION_END ) override
// Shader structs become regular C++ structs with an 'm' prefix on their members
#define JPH_SHADER_STRUCT_BEGIN(name) struct name {
#define JPH_SHADER_STRUCT_MEMBER(type, name) type m##name;
#define JPH_SHADER_STRUCT_END(name) };
// Stringizing helper (two levels so macro arguments are expanded first)
#define JPH_TO_STRING(name) JPH_TO_STRING2(name)
#define JPH_TO_STRING2(name) #name
// JPH_SHADER_CLASS_NAME(Foo) expands to the identifier FooShaderWrapper
#define JPH_SHADER_CLASS_NAME(name) JPH_SHADER_CLASS_NAME2(name)
#define JPH_SHADER_CLASS_NAME2(name) name##ShaderWrapper
// Parameter passing conventions for shader function arguments
#define JPH_IN(type) const type &
#define JPH_OUT(type) type &
#define JPH_IN_OUT(type) type &
// Namespace to prevent 'using' from leaking out
namespace ShaderWrappers {
using namespace HLSLToCPP;
class JPH_SHADER_CLASS_NAME(JPH_SHADER_NAME) : public ShaderWrapper
{
public:
// Define types
using JPH_float = float;
using JPH_float3 = HLSLToCPP::float3;
using JPH_float4 = HLSLToCPP::float4;
using JPH_uint = uint;
using JPH_uint3 = HLSLToCPP::uint3;
using JPH_uint4 = HLSLToCPP::uint4;
using JPH_int = int;
using JPH_int3 = HLSLToCPP::int3;
using JPH_int4 = HLSLToCPP::int4;
using JPH_Quat = HLSLToCPP::Quat;
using JPH_Plane = HLSLToCPP::Plane;
using JPH_Mat44 = HLSLToCPP::Mat44;
// Now the shader code should be included followed by WrapShaderBindings.h
/// @endcond

View File

@ -0,0 +1,40 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
/// @cond INTERNAL
// First WrapShaderBegin.h should have been included, then the shader code
/// Bind a buffer to the shader
virtual void Bind(const char *inName, void *inData, uint64 inSize) override
{
// Don't redefine constants
#undef JPH_SHADER_CONSTANT
#define JPH_SHADER_CONSTANT(type, name, value)
// Don't redefine structs
#undef JPH_SHADER_STRUCT_BEGIN
#undef JPH_SHADER_STRUCT_MEMBER
#undef JPH_SHADER_STRUCT_END
#define JPH_SHADER_STRUCT_BEGIN(name)
#define JPH_SHADER_STRUCT_MEMBER(type, name)
#define JPH_SHADER_STRUCT_END(name)
// When a constant buffer is bound, copy the data into the members
#undef JPH_SHADER_CONSTANTS_BEGIN
#undef JPH_SHADER_CONSTANTS_MEMBER
#define JPH_SHADER_CONSTANTS_BEGIN(type, name) case HashString(#name): memcpy(&name + 1, inData, size_t(inSize)); break; // Very hacky way to get the address of the first constant and to copy the entire block of constants
#define JPH_SHADER_CONSTANTS_MEMBER(type, name)
// When a buffer is bound, set the pointer
#undef JPH_SHADER_BIND_BUFFER
#undef JPH_SHADER_BIND_RW_BUFFER
#define JPH_SHADER_BIND_BUFFER(type, name) case HashString(#name): name = (const type *)inData; break;
#define JPH_SHADER_BIND_RW_BUFFER(type, name) case HashString(#name): name = (type *)inData; break;
// Dispatch on the hash of the buffer name; every JPH_SHADER_BIND_* expansion above contributes one case label.
// NOTE: the switch is deliberately left open here and is closed in WrapShaderEnd.h.
switch (HashString(inName))
{
// Now include the shader bindings followed by WrapShaderEnd.h
/// @endcond
/// @endcond

View File

@ -0,0 +1,61 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
/// @cond INTERNAL
// WrapShaderBindings.h should have been included followed by the shader bindings
// Closes the switch that WrapShaderBindings.h opened: any name that didn't match a binding is an error
default:
JPH_ASSERT(false, "Buffer cannot be bound to this shader");
break;
}
}
/// Factory function to create a shader wrapper for this shader
static ShaderWrapper * sCreate()
{
return new JPH_SHADER_CLASS_NAME(JPH_SHADER_NAME)();
}
};
} // ShaderWrappers
/// @endcond
// Stop clang from complaining that the register function is missing a prototype
JPH_SHADER_WRAPPER_FUNCTION(, JPH_SHADER_NAME);
/// Register this wrapper
JPH_SHADER_WRAPPER_FUNCTION(inComputeSystem, JPH_SHADER_NAME)
{
inComputeSystem->RegisterShader(JPH_TO_STRING(JPH_SHADER_NAME), ShaderWrappers::JPH_SHADER_CLASS_NAME(JPH_SHADER_NAME)::sCreate);
}
// Clean up all helper macros so the next shader can be wrapped with a fresh JPH_SHADER_NAME
#undef JPH_SHADER_OVERRIDE_MACROS
#undef JPH_SHADER_GENERATE_WRAPPER
#undef JPH_SHADER_CONSTANT
#undef JPH_SHADER_CONSTANTS_BEGIN
#undef JPH_SHADER_CONSTANTS_MEMBER
#undef JPH_SHADER_CONSTANTS_END
#undef JPH_SHADER_BUFFER
#undef JPH_SHADER_RW_BUFFER
#undef JPH_SHADER_BIND_BEGIN
#undef JPH_SHADER_BIND_END
#undef JPH_SHADER_BIND_BUFFER
#undef JPH_SHADER_BIND_RW_BUFFER
#undef JPH_SHADER_FUNCTION_BEGIN
#undef JPH_SHADER_PARAM_THREAD_ID
#undef JPH_SHADER_FUNCTION_END
#undef JPH_SHADER_STRUCT_BEGIN
#undef JPH_SHADER_STRUCT_MEMBER
#undef JPH_SHADER_STRUCT_END
#undef JPH_TO_STRING
#undef JPH_TO_STRING2
#undef JPH_SHADER_CLASS_NAME
#undef JPH_SHADER_CLASS_NAME2
#undef JPH_OUT
#undef JPH_IN_OUT
#undef JPH_SHADER_NAME
JPH_NAMESPACE_END

View File

@ -0,0 +1,69 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/Reference.h>
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/Result.h>
JPH_NAMESPACE_BEGIN
class ComputeBuffer;
using ComputeBufferResult = Result<Ref<ComputeBuffer>>;
/// Buffer that can be read from / written to by a compute shader
class JPH_EXPORT ComputeBuffer : public RefTarget<ComputeBuffer>, public NonCopyable
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Type of buffer
enum class EType
{
UploadBuffer, ///< Buffer that can be written on the CPU and then uploaded to the GPU.
ReadbackBuffer, ///< Buffer to be sent from the GPU to the CPU, used to read back data.
ConstantBuffer, ///< A smallish buffer that is used to pass constants to a shader.
Buffer, ///< Buffer that can be read from by a shader. Must be initialized with data at construction time and is read only thereafter.
RWBuffer, ///< Buffer that can be read from and written to by a shader.
};
/// Constructor / Destructor
/// @param inType Usage type of the buffer, see EType
/// @param inSize Number of elements in the buffer (the templated Map() asserts sizeof(T) == inStride, so this counts elements, not bytes)
/// @param inStride Size of a single element in bytes
ComputeBuffer(EType inType, uint64 inSize, uint inStride) : mType(inType), mSize(inSize), mStride(inStride) { }
virtual ~ComputeBuffer() { JPH_ASSERT(!mIsMapped); } // A buffer must be unmapped before it is destroyed
/// Properties
EType GetType() const { return mType; }
uint64 GetSize() const { return mSize; }
uint GetStride() const { return mStride; }
/// Mode in which the buffer is accessed
enum class EMode
{
Read, ///< Read only access to the buffer
Write, ///< Write only access to the buffer (this will discard all previous data in the buffer)
};
/// Map / unmap buffer (get pointer to data).
/// Only one Map may be active at a time (tracked via mIsMapped in assert-enabled builds); call Unmap before mapping again.
void * Map(EMode inMode) { JPH_ASSERT(!mIsMapped); JPH_IF_ENABLE_ASSERTS(mIsMapped = true;) return MapInternal(inMode); }
/// Typed variant of Map; asserts that the element type matches the stride the buffer was created with.
template <typename T> T * Map(EMode inMode) { JPH_ASSERT(!mIsMapped); JPH_IF_ENABLE_ASSERTS(mIsMapped = true;) JPH_ASSERT(sizeof(T) == mStride); return reinterpret_cast<T *>(MapInternal(inMode)); }
void Unmap() { JPH_ASSERT(mIsMapped); JPH_IF_ENABLE_ASSERTS(mIsMapped = false;) UnmapInternal(); }
/// Create a readback buffer of the same size and stride that can be used to read the data stored in this buffer on CPU.
/// Note that this could also be implemented as 'return this' in case the underlying implementation allows locking GPU data on CPU directly.
virtual ComputeBufferResult CreateReadBackBuffer() const = 0;
protected:
EType mType; ///< Usage type of this buffer
uint64 mSize; ///< Number of elements
uint mStride; ///< Size of a single element in bytes
#ifdef JPH_ENABLE_ASSERTS
bool mIsMapped = false; ///< Tracks Map/Unmap pairing (assert-enabled builds only)
#endif // JPH_ENABLE_ASSERTS
/// Backend specific map / unmap implementations
virtual void * MapInternal(EMode inMode) = 0;
virtual void UnmapInternal() = 0;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,83 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/Reference.h>
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/Result.h>
JPH_NAMESPACE_BEGIN
class ComputeShader;
class ComputeBuffer;
/// A command queue for executing compute workloads on the GPU.
///
/// Note that only a single thread should be using a ComputeQueue at any time (although an implementation could be made that is thread safe).
class JPH_EXPORT ComputeQueue : public RefTarget<ComputeQueue>, public NonCopyable
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Destructor
virtual ~ComputeQueue() = default;
/// Activate a shader. Shader must be set first before buffers can be bound.
/// After every Dispatch call, the shader must be set again and all buffers must be bound again.
virtual void SetShader(const ComputeShader *inShader) = 0;
/// If a barrier should be placed before accessing the buffer
enum class EBarrier
{
Yes, ///< Insert a barrier so that previous writes to the buffer are visible
No ///< Skip the barrier (caller guarantees previous work does not conflict)
};
/// Bind a constant buffer to the shader. Note that the contents of the buffer cannot be modified until execution finishes.
/// A reference to the buffer is added to make sure it stays alive until execution finishes.
/// @param inName Name of the buffer as specified in the shader.
/// @param inBuffer The buffer to bind.
virtual void SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer) = 0;
/// Bind a read only buffer to the shader. Note that the contents of the buffer cannot be modified on CPU until execution finishes (only relevant for buffers of type UploadBuffer).
/// A reference to the buffer is added to make sure it stays alive until execution finishes.
/// @param inName Name of the buffer as specified in the shader.
/// @param inBuffer The buffer to bind.
virtual void SetBuffer(const char *inName, const ComputeBuffer *inBuffer) = 0;
/// Bind a read/write buffer to the shader.
/// A reference to the buffer is added to make sure it stays alive until execution finishes.
/// @param inName Name of the buffer as specified in the shader.
/// @param inBuffer The buffer to bind.
/// @param inBarrier If set to Yes, a barrier will be placed before accessing the buffer to ensure all previous writes to the buffer are visible.
virtual void SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier = EBarrier::Yes) = 0;
/// Dispatch a compute shader with the specified number of thread groups
virtual void Dispatch(uint inThreadGroupsX, uint inThreadGroupsY = 1, uint inThreadGroupsZ = 1) = 0;
/// Schedule buffer to be copied from GPU to CPU.
/// A reference to the buffers is added to make sure they stay alive until execution finishes.
virtual void ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc) = 0;
/// Execute accumulated command list.
/// No more commands can be added until Wait is called.
virtual void Execute() = 0;
/// After executing, this waits until execution is done.
/// This also makes sure that any readback operations have completed and the data is available on CPU.
virtual void Wait() = 0;
/// Execute and wait for the command list to finish
/// @see Execute, Wait
void ExecuteAndWait()
{
// Convenience wrapper: submit the recorded commands and block until the GPU is done
Execute();
Wait();
}
};
using ComputeQueueResult = Result<Ref<ComputeQueue>>;
JPH_NAMESPACE_END

View File

@ -0,0 +1,41 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/Reference.h>
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/Result.h>
JPH_NAMESPACE_BEGIN
/// Compute shader handle.
/// Stores the thread group dimensions the shader was compiled with; backends derive from
/// this class to attach their API specific pipeline objects.
class JPH_EXPORT ComputeShader : public RefTarget<ComputeShader>, public NonCopyable
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Construct with the thread group dimensions of the shader
	ComputeShader(uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) :
		mGroupSize { inGroupSizeX, inGroupSizeY, inGroupSizeZ }
	{
	}

	/// Virtual destructor so backend specific shaders are destroyed correctly
	virtual ~ComputeShader() = default;

	/// Get group sizes
	uint32 GetGroupSizeX() const { return mGroupSize[0]; }
	uint32 GetGroupSizeY() const { return mGroupSize[1]; }
	uint32 GetGroupSizeZ() const { return mGroupSize[2]; }

private:
	uint32 mGroupSize[3]; ///< Thread group size in X, Y and Z
};
using ComputeShaderResult = Result<Ref<ComputeShader>>;
JPH_NAMESPACE_END

View File

@ -0,0 +1,15 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Compute/ComputeSystem.h>
JPH_NAMESPACE_BEGIN
// RTTI registration for the abstract base class; no reflected members
JPH_IMPLEMENT_RTTI_ABSTRACT_BASE(ComputeSystem)
{
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,78 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeShader.h>
#include <Jolt/Compute/ComputeBuffer.h>
#include <Jolt/Compute/ComputeQueue.h>
#include <Jolt/Core/RTTI.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the GPU
class JPH_EXPORT ComputeSystem : public RefTarget<ComputeSystem>, public NonCopyable
{
public:
JPH_DECLARE_RTTI_ABSTRACT_BASE(JPH_EXPORT, ComputeSystem)
/// Destructor
virtual ~ComputeSystem() = default;
/// Compile a compute shader
/// @param inName Name of the shader; the backend resolves it to source/bytecode through mShaderLoader
/// @param inGroupSizeX Thread group size in X (Y and Z default to 1)
virtual ComputeShaderResult CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY = 1, uint32 inGroupSizeZ = 1) = 0;
/// Create a buffer for use with a compute shader
/// @param inData Optional initial contents; required for EType::Buffer (see ComputeBuffer)
virtual ComputeBufferResult CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData = nullptr) = 0;
/// Create a queue for executing compute shaders
virtual ComputeQueueResult CreateComputeQueue() = 0;
/// Callback used when loading shaders.
/// Must fill outData with the shader data for inName and return true, or set outError and return false.
/// NOTE(review): the exact format of outData (source text vs compiled blob) is backend specific -- confirm against the backend in use.
using ShaderLoader = std::function<bool(const char *inName, Array<uint8> &outData, String &outError)>;
ShaderLoader mShaderLoader = [](const char *, Array<uint8> &, String &outError) { JPH_ASSERT(false, "Override this function"); outError = "Not implemented"; return false; }; // Default loader always fails; applications must install their own
};
using ComputeSystemResult = Result<Ref<ComputeSystem>>;
#ifdef JPH_USE_VK
/// Factory function to create a compute system using Vulkan
extern JPH_EXPORT ComputeSystemResult CreateComputeSystemVK();
#endif
#ifdef JPH_USE_CPU_COMPUTE
/// Factory function to create a compute system that falls back to CPU.
/// This is intended mainly for debugging purposes and is not optimized for performance
extern JPH_EXPORT ComputeSystemResult CreateComputeSystemCPU();
#endif
// Platform preference order for the default factory: DX12, then Metal, then Vulkan
#ifdef JPH_USE_DX12
/// Factory function to create a compute system using DirectX 12
extern JPH_EXPORT ComputeSystemResult CreateComputeSystemDX12();
/// Factory function to create the default compute system for this platform
inline ComputeSystemResult CreateComputeSystem() { return CreateComputeSystemDX12(); }
#elif defined(JPH_USE_MTL)
/// Factory function to create a compute system using Metal
extern JPH_EXPORT ComputeSystemResult CreateComputeSystemMTL();
/// Factory function to create the default compute system for this platform
inline ComputeSystemResult CreateComputeSystem() { return CreateComputeSystemMTL(); }
#elif defined(JPH_USE_VK)
/// Factory function to create the default compute system for this platform
inline ComputeSystemResult CreateComputeSystem() { return CreateComputeSystemVK(); }
#else
/// Fallback implementation when no compute system is available
inline ComputeSystemResult CreateComputeSystem() { ComputeSystemResult result; result.SetError("Not implemented"); return result; }
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,167 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/ComputeBufferDX12.h>
#include <Jolt/Compute/DX12/ComputeSystemDX12.h>
JPH_NAMESPACE_BEGIN
// Store the owning compute system; the actual D3D12 resources are created in Initialize()
ComputeBufferDX12::ComputeBufferDX12(ComputeSystemDX12 *inComputeSystem, EType inType, uint64 inSize, uint inStride) :
ComputeBuffer(inType, inSize, inStride),
mComputeSystem(inComputeSystem)
{
}
// Allocate the D3D12 resources for this buffer and optionally upload initial data.
// Returns false if any resource allocation failed.
bool ComputeBufferDX12::Initialize(const void *inData)
{
uint64 buffer_size = mSize * mStride; // Total size in bytes (mSize elements of mStride bytes each)
switch (mType)
{
case EType::UploadBuffer:
// CPU writable staging copy plus a GPU copy; SyncCPUToGPU copies between them on use
mBufferCPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE, buffer_size);
mBufferGPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_FLAG_NONE, buffer_size);
if (mBufferCPU == nullptr || mBufferGPU == nullptr)
return false;
break;
case EType::ConstantBuffer:
// Constant buffers live only in the upload heap; the GPU reads them directly (see ComputeQueueDX12::SetConstantBuffer)
mBufferCPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE, buffer_size);
if (mBufferCPU == nullptr)
return false;
break;
case EType::ReadbackBuffer:
JPH_ASSERT(inData == nullptr, "Can't upload data to a readback buffer");
// Readback heap resource, filled by ScheduleReadback copies
mBufferCPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_READBACK, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_NONE, buffer_size);
if (mBufferCPU == nullptr)
return false;
break;
case EType::Buffer:
JPH_ASSERT(inData != nullptr);
mBufferCPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE, buffer_size);
mBufferGPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_FLAG_NONE, buffer_size);
if (mBufferCPU == nullptr || mBufferGPU == nullptr)
return false;
// The initial data must be copied to the GPU resource on first use
mNeedsSync = true;
break;
case EType::RWBuffer:
if (inData != nullptr)
{
// Only need a staging copy when there is initial data to upload
mBufferCPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE, buffer_size);
if (mBufferCPU == nullptr)
return false;
mNeedsSync = true;
}
// UAV capable GPU resource so shaders can write to it
mBufferGPU = mComputeSystem->CreateD3DResource(D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, buffer_size);
if (mBufferGPU == nullptr)
return false;
break;
}
// Copy data to upload buffer
if (inData != nullptr)
{
void *data = nullptr;
D3D12_RANGE range = { 0, 0 }; // We're not going to read
mBufferCPU->Map(0, &range, &data);
memcpy(data, inData, size_t(buffer_size));
mBufferCPU->Unmap(0, nullptr);
}
return true;
}
// Transition the GPU resource to state inTo if needed.
// Returns false only when the buffer was already in the requested state. Returns true when a
// transition barrier was recorded, or when the buffer has no GPU copy that can change state
// (upload/readback/constant heap resources keep their creation state); callers use this to
// decide whether an explicit UAV barrier is still required (see ComputeQueueDX12::SetRWBuffer).
bool ComputeBufferDX12::Barrier(ID3D12GraphicsCommandList *inCommandList, D3D12_RESOURCE_STATES inTo) const
{
// Check if state changed
if (mCurrentState == inTo)
return false;
// Only buffers in GPU memory can change state
if (mType != ComputeBuffer::EType::Buffer && mType != ComputeBuffer::EType::RWBuffer)
return true;
D3D12_RESOURCE_BARRIER barrier;
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
barrier.Transition.pResource = GetResourceGPU();
barrier.Transition.StateBefore = mCurrentState;
barrier.Transition.StateAfter = inTo;
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
inCommandList->ResourceBarrier(1, &barrier);
mCurrentState = inTo; // mCurrentState is mutable so this const function can track GPU state
return true;
}
// Insert a UAV barrier: all preceding unordered access writes to this resource must complete
// before any subsequent UAV reads/writes on this command list execute.
void ComputeBufferDX12::RWBarrier(ID3D12GraphicsCommandList *inCommandList)
{
	// A UAV barrier only makes sense while the resource is in the unordered access state
	JPH_ASSERT(mCurrentState == D3D12_RESOURCE_STATE_UNORDERED_ACCESS);

	D3D12_RESOURCE_BARRIER barrier;
	barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
	barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
	// For a UAV barrier the UAV member of the union must be filled in, not Transition.
	// (The previous code wrote barrier.Transition.pResource, which only happened to work
	// because pResource is the first member of both union alternatives.)
	barrier.UAV.pResource = GetResourceGPU();
	inCommandList->ResourceBarrier(1, &barrier);
}
// Copy the CPU (upload heap) resource to the GPU resource if there are pending CPU writes.
// Returns true if a copy was recorded, false if the buffer was already in sync.
bool ComputeBufferDX12::SyncCPUToGPU(ID3D12GraphicsCommandList *inCommandList) const
{
if (!mNeedsSync)
return false;
// The GPU resource must be in the copy destination state before CopyResource
Barrier(inCommandList, D3D12_RESOURCE_STATE_COPY_DEST);
inCommandList->CopyResource(GetResourceGPU(), GetResourceCPU());
mNeedsSync = false;
return true;
}
// Map the CPU visible resource. Read mode is only valid for readback buffers, write mode only
// for upload/constant buffers. Returns nullptr if the D3D12 map call failed.
void *ComputeBufferDX12::MapInternal(EMode inMode)
{
void *mapped_resource = nullptr;
switch (inMode)
{
case EMode::Read:
JPH_ASSERT(mType == EType::ReadbackBuffer);
// Passing nullptr for the read range means the entire buffer may be read
if (HRFailed(mBufferCPU->Map(0, nullptr, &mapped_resource)))
return nullptr;
break;
case EMode::Write:
{
JPH_ASSERT(mType == EType::UploadBuffer || mType == EType::ConstantBuffer);
D3D12_RANGE range = { 0, 0 }; // We're not going to read
if (HRFailed(mBufferCPU->Map(0, &range, &mapped_resource)))
return nullptr;
// The GPU copy (if any) is now stale and must be re-uploaded on next use
mNeedsSync = true;
}
break;
}
return mapped_resource;
}
// Unmap the CPU visible resource; nullptr for the written range means the whole buffer may have been modified
void ComputeBufferDX12::UnmapInternal()
{
mBufferCPU->Unmap(0, nullptr);
}
// Create a readback buffer with the same element count and stride as this buffer
ComputeBufferResult ComputeBufferDX12::CreateReadBackBuffer() const
{
return mComputeSystem->CreateComputeBuffer(EType::ReadbackBuffer, mSize, mStride);
}
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,51 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeBuffer.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/IncludeDX12.h>
JPH_NAMESPACE_BEGIN
class ComputeSystemDX12;
/// Buffer that can be read from / written to by a compute shader
class JPH_EXPORT ComputeBufferDX12 final : public ComputeBuffer
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Constructor
ComputeBufferDX12(ComputeSystemDX12 *inComputeSystem, EType inType, uint64 inSize, uint inStride);
/// Allocate the D3D12 resources (and upload inData if non-null); returns false on allocation failure
bool Initialize(const void *inData);
/// Access to the underlying D3D12 resources (CPU = upload/readback heap, GPU = default heap)
ID3D12Resource * GetResourceCPU() const { return mBufferCPU.Get(); }
ID3D12Resource * GetResourceGPU() const { return mBufferGPU.Get(); }
/// Transfers ownership of the CPU staging resource to the caller.
/// NOTE(review): moves out of the (mutable) member even though the method is const -- the buffer no longer has a CPU copy afterwards.
ComPtr<ID3D12Resource> ReleaseResourceCPU() const { return std::move(mBufferCPU); }
/// Transition the GPU resource to inTo; see the .cpp for the return value semantics
bool Barrier(ID3D12GraphicsCommandList *inCommandList, D3D12_RESOURCE_STATES inTo) const;
/// Insert a UAV barrier so prior unordered access writes complete before later accesses
void RWBarrier(ID3D12GraphicsCommandList *inCommandList);
/// Copy pending CPU writes to the GPU resource; returns true if a copy was recorded
bool SyncCPUToGPU(ID3D12GraphicsCommandList *inCommandList) const;
ComputeBufferResult CreateReadBackBuffer() const override;
private:
virtual void * MapInternal(EMode inMode) override;
virtual void UnmapInternal() override;
ComputeSystemDX12 * mComputeSystem; ///< System that owns the device this buffer was created on
mutable ComPtr<ID3D12Resource> mBufferCPU; ///< CPU accessible copy (upload or readback heap); mutable so ReleaseResourceCPU can move it out
ComPtr<ID3D12Resource> mBufferGPU; ///< GPU copy (default heap), only present for Upload/Buffer/RWBuffer types
mutable bool mNeedsSync = false; ///< If this buffer needs to be synced from CPU to GPU
mutable D3D12_RESOURCE_STATES mCurrentState = D3D12_RESOURCE_STATE_COPY_DEST; ///< State of the GPU buffer so we can do proper barriers
};
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,221 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/ComputeQueueDX12.h>
#include <Jolt/Compute/DX12/ComputeShaderDX12.h>
#include <Jolt/Compute/DX12/ComputeBufferDX12.h>
JPH_NAMESPACE_BEGIN
ComputeQueueDX12::~ComputeQueueDX12()
{
// Make sure any in-flight command list has finished before releasing the D3D12 objects
Wait();
// Release the fence event if it was created
if (mFenceEvent != INVALID_HANDLE_VALUE)
CloseHandle(mFenceEvent);
}
// Create the D3D12 command queue, allocator, command list and fence objects.
// On failure outResult holds the error and false is returned.
bool ComputeQueueDX12::Initialize(ID3D12Device *inDevice, D3D12_COMMAND_LIST_TYPE inType, ComputeQueueResult &outResult)
{
	// Create a high priority queue of the requested type
	D3D12_COMMAND_QUEUE_DESC queue_desc = {};
	queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
	queue_desc.Type = inType;
	queue_desc.Priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH;
	if (HRFailed(inDevice->CreateCommandQueue(&queue_desc, IID_PPV_ARGS(&mCommandQueue)), outResult))
		return false;

	// Allocator that backs the command list's memory
	if (HRFailed(inDevice->CreateCommandAllocator(inType, IID_PPV_ARGS(&mCommandAllocator)), outResult))
		return false;

	// Create the command list
	if (HRFailed(inDevice->CreateCommandList(0, inType, mCommandAllocator.Get(), nullptr, IID_PPV_ARGS(&mCommandList)), outResult))
		return false;

	// Command lists are created in the recording state, but there is nothing to record yet. The main loop expects it to be closed, so close it now
	if (HRFailed(mCommandList->Close(), outResult))
		return false;

	// Create synchronization object
	if (HRFailed(inDevice->CreateFence(mFenceValue, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&mFence)), outResult))
		return false;

	// Increment fence value so we don't skip waiting the first time a command list is executed
	mFenceValue++;

	// Create an event handle to use for frame synchronization.
	// CreateEvent returns NULL on failure (not INVALID_HANDLE_VALUE), so test the handle itself
	// rather than relying on GetLastError(), which can hold a stale code from an earlier call
	// that succeeded.
	mFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
	if (mFenceEvent == nullptr)
	{
		mFenceEvent = INVALID_HANDLE_VALUE; // Restore the 'not created' sentinel the destructor checks for
		if (!HRFailed(HRESULT_FROM_WIN32(GetLastError()), outResult))
			outResult.SetError("Failed to create fence event");
		return false;
	}

	return true;
}
// Lazily (re)open the command list for recording. Safe to call repeatedly while building one
// batch; returns nullptr if resetting the allocator or command list failed.
ID3D12GraphicsCommandList *ComputeQueueDX12::Start()
{
// Cannot record new commands while a previously executed list is still in flight
JPH_ASSERT(!mIsExecuting);
if (!mIsStarted)
{
// Reset the allocator
if (HRFailed(mCommandAllocator->Reset()))
return nullptr;
// Reset the command list
if (HRFailed(mCommandList->Reset(mCommandAllocator.Get(), nullptr)))
return nullptr;
// Now we have started recording commands
mIsStarted = true;
}
return mCommandList.Get();
}
// Make inShader the active pipeline; must be called before buffers are bound or Dispatch is called
void ComputeQueueDX12::SetShader(const ComputeShader *inShader)
{
ID3D12GraphicsCommandList *command_list = Start();
// Downcast: this queue assumes it only receives shaders created by the DX12 backend (not checked at runtime)
mShader = static_cast<const ComputeShaderDX12 *>(inShader);
command_list->SetPipelineState(mShader->GetPipelineState());
command_list->SetComputeRootSignature(mShader->GetRootSignature());
}
// Record a CPU-to-GPU copy for inBuffer if it has pending CPU writes
void ComputeQueueDX12::SyncCPUToGPU(const ComputeBufferDX12 *inBuffer)
{
// Ensure that any CPU writes are visible to the GPU
if (inBuffer->SyncCPUToGPU(mCommandList.Get())
&& (inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer))
{
// After the first upload, the CPU buffer is no longer needed for Buffer and RWBuffer types
// (it is kept alive in mDelayedFreedBuffers until Wait() confirms the GPU copy has completed)
mDelayedFreedBuffers.emplace_back(inBuffer->ReleaseResourceCPU());
}
}
// Bind a constant buffer to the active shader; binding a null buffer is a no-op
void ComputeQueueDX12::SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::ConstantBuffer);
ID3D12GraphicsCommandList *command_list = Start();
const ComputeBufferDX12 *buffer = static_cast<const ComputeBufferDX12 *>(inBuffer);
// Constant buffers have no GPU copy; the GPU reads the upload heap resource directly
command_list->SetComputeRootConstantBufferView(mShader->NameToIndex(inName), buffer->GetResourceCPU()->GetGPUVirtualAddress());
// Keep the buffer alive until execution finishes
mUsedBuffers.insert(buffer);
}
// Bind a read only buffer (SRV) to the active shader; binding a null buffer is a no-op
void ComputeQueueDX12::SetBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::UploadBuffer || inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
ID3D12GraphicsCommandList *command_list = Start();
const ComputeBufferDX12 *buffer = static_cast<const ComputeBufferDX12 *>(inBuffer);
uint parameter_index = mShader->NameToIndex(inName);
// Upload pending CPU writes, then transition the GPU copy to a shader-readable state
SyncCPUToGPU(buffer);
buffer->Barrier(command_list, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
command_list->SetComputeRootShaderResourceView(parameter_index, buffer->GetResourceGPU()->GetGPUVirtualAddress());
// Keep the buffer alive until execution finishes
mUsedBuffers.insert(buffer);
}
// Bind a read/write buffer (UAV) to the active shader; binding a null buffer is a no-op
void ComputeQueueDX12::SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
ID3D12GraphicsCommandList *command_list = Start();
ComputeBufferDX12 *buffer = static_cast<ComputeBufferDX12 *>(inBuffer);
uint parameter_index = mShader->NameToIndex(inName);
// Upload pending CPU writes first
SyncCPUToGPU(buffer);
// If the buffer was already in the UAV state no transition barrier was recorded (Barrier returned false);
// in that case, unless the caller opted out, insert a UAV barrier so writes from a previous dispatch
// complete before this dispatch accesses the buffer
if (!buffer->Barrier(command_list, D3D12_RESOURCE_STATE_UNORDERED_ACCESS) && inBarrier == EBarrier::Yes)
buffer->RWBarrier(command_list);
command_list->SetComputeRootUnorderedAccessView(parameter_index, buffer->GetResourceGPU()->GetGPUVirtualAddress());
// Keep the buffer alive until execution finishes
mUsedBuffers.insert(buffer);
}
// Record a copy of inSrc's GPU resource into inDst's readback resource; null arguments are a no-op
void ComputeQueueDX12::ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc)
{
if (inDst == nullptr || inSrc == nullptr)
return;
JPH_ASSERT(inDst->GetType() == ComputeBuffer::EType::ReadbackBuffer);
ID3D12GraphicsCommandList *command_list = Start();
ComputeBufferDX12 *dst = static_cast<ComputeBufferDX12 *>(inDst);
const ComputeBufferDX12 *src = static_cast<const ComputeBufferDX12 *>(inSrc);
// Put both resources in the states CopyResource requires
dst->Barrier(command_list, D3D12_RESOURCE_STATE_COPY_DEST);
src->Barrier(command_list, D3D12_RESOURCE_STATE_COPY_SOURCE);
command_list->CopyResource(dst->GetResourceCPU(), src->GetResourceGPU());
// Keep both buffers alive until execution finishes
mUsedBuffers.insert(src);
mUsedBuffers.insert(dst);
}
// Record a dispatch of the active shader with the given number of thread groups
void ComputeQueueDX12::Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ)
{
ID3D12GraphicsCommandList *command_list = Start();
command_list->Dispatch(inThreadGroupsX, inThreadGroupsY, inThreadGroupsZ);
}
// Submit the recorded command list to the queue; Wait() must be called before recording again
void ComputeQueueDX12::Execute()
{
JPH_ASSERT(mIsStarted);
JPH_ASSERT(!mIsExecuting);
// Close the command list
if (HRFailed(mCommandList->Close()))
return;
// Execute the command list
ID3D12CommandList *command_lists[] = { mCommandList.Get() };
mCommandQueue->ExecuteCommandLists((UINT)std::size(command_lists), command_lists);
// Schedule a Signal command in the queue so Wait() can detect completion via mFence
if (HRFailed(mCommandQueue->Signal(mFence.Get(), mFenceValue)))
return;
// Clear the current shader
mShader = nullptr;
// Mark that we're executing
mIsExecuting = true;
}
// Block until the last Execute() has completed on the GPU, then release retained buffers.
// Safe to call when nothing is executing (no-op in that case).
void ComputeQueueDX12::Wait()
{
// Check if we've been started
if (mIsExecuting)
{
if (mFence->GetCompletedValue() < mFenceValue)
{
// Wait until the fence has been processed
if (HRFailed(mFence->SetEventOnCompletion(mFenceValue, mFenceEvent)))
return;
WaitForSingleObjectEx(mFenceEvent, INFINITE, FALSE);
}
// Increment the fence value so the next Execute signals a new value
mFenceValue++;
// Buffers can be freed now
mUsedBuffers.clear();
// Free buffers
mDelayedFreedBuffers.clear();
// Done executing
mIsExecuting = false;
mIsStarted = false;
}
}
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,61 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_DX12
#include <Jolt/Compute/ComputeQueue.h>
#include <Jolt/Compute/DX12/ComputeShaderDX12.h>
#include <Jolt/Core/UnorderedSet.h>
JPH_NAMESPACE_BEGIN
class ComputeBufferDX12;
/// A command queue for DirectX for executing compute workloads on the GPU.
class JPH_EXPORT ComputeQueueDX12 final : public ComputeQueue
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Destructor
virtual ~ComputeQueueDX12() override;
/// Initialize the queue
/// @param inDevice Device to create the queue / allocator / command list / fence on
/// @param inType Type of command list to create (e.g. compute)
/// @param outResult Receives the error on failure; returns false in that case
bool Initialize(ID3D12Device *inDevice, D3D12_COMMAND_LIST_TYPE inType, ComputeQueueResult &outResult);
/// Start the command list (requires waiting until the previous one is finished)
/// Returns the open command list, or nullptr if resetting it failed
ID3D12GraphicsCommandList * Start();
// See: ComputeQueue
virtual void SetShader(const ComputeShader *inShader) override;
virtual void SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
virtual void SetBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
virtual void SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier = EBarrier::Yes) override;
virtual void ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc) override;
virtual void Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ) override;
virtual void Execute() override;
virtual void Wait() override;
private:
/// Copy the CPU buffer to the GPU buffer if needed
void SyncCPUToGPU(const ComputeBufferDX12 *inBuffer);
ComPtr<ID3D12CommandQueue> mCommandQueue; ///< The command queue that will hold command lists
ComPtr<ID3D12CommandAllocator> mCommandAllocator; ///< Allocator that holds the memory for the commands
ComPtr<ID3D12GraphicsCommandList> mCommandList; ///< The command list that will hold the render commands / state changes
HANDLE mFenceEvent = INVALID_HANDLE_VALUE; ///< Fence event, used to wait for rendering to complete
ComPtr<ID3D12Fence> mFence; ///< Fence object, used to signal the fence event
UINT64 mFenceValue = 0; ///< Current fence value, each time we need to wait we will signal the fence with this value, wait for it and then increase the value
RefConst<ComputeShaderDX12> mShader = nullptr; ///< Current active shader
bool mIsStarted = false; ///< If the command list has been started (reset) and is ready to record commands
bool mIsExecuting = false; ///< If a command list is currently executing on the queue
UnorderedSet<RefConst<ComputeBuffer>> mUsedBuffers; ///< Buffers that are in use by the current execution, these will be retained until execution is finished so that we don't free buffers that are in use
Array<ComPtr<ID3D12Resource>> mDelayedFreedBuffers; ///< Buffers freed during the execution
};
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,54 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_DX12
#include <Jolt/Compute/ComputeShader.h>
#include <Jolt/Compute/DX12/IncludeDX12.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_NAMESPACE_BEGIN
/// Compute shader handle for DirectX
class JPH_EXPORT ComputeShaderDX12 : public ComputeShader
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Constructor
/// @param inShader Compiled shader bytecode (kept alive for the lifetime of the pipeline state)
/// @param inRootSignature Root signature describing the shader's bindings
/// @param inPipelineState Compiled compute pipeline state object
/// @param inBindingNames Owned storage for binding names; the string_views in inNameToIndex point into these strings
/// @param inNameToIndex Maps binding name to root parameter index
/// @param inGroupSizeX,inGroupSizeY,inGroupSizeZ Thread group dimensions
ComputeShaderDX12(ComPtr<ID3DBlob> inShader, ComPtr<ID3D12RootSignature> inRootSignature, ComPtr<ID3D12PipelineState> inPipelineState, Array<String> &&inBindingNames, UnorderedMap<string_view, uint> &&inNameToIndex, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) :
ComputeShader(inGroupSizeX, inGroupSizeY, inGroupSizeZ),
mShader(inShader),
mRootSignature(inRootSignature),
mPipelineState(inPipelineState),
mBindingNames(std::move(inBindingNames)),
mNameToIndex(std::move(inNameToIndex))
{
}
/// Get index of shader parameter; asserts (and returns an invalid iterator's value) if the name is unknown
uint NameToIndex(const char *inName) const
{
UnorderedMap<string_view, uint>::const_iterator it = mNameToIndex.find(inName);
JPH_ASSERT(it != mNameToIndex.end());
return it->second;
}
/// Getters
ID3D12PipelineState * GetPipelineState() const { return mPipelineState.Get(); }
ID3D12RootSignature * GetRootSignature() const { return mRootSignature.Get(); }
private:
ComPtr<ID3DBlob> mShader; ///< The compiled shader
ComPtr<ID3D12RootSignature> mRootSignature; ///< The root signature for this shader
ComPtr<ID3D12PipelineState> mPipelineState; ///< The pipeline state object for this shader
Array<String> mBindingNames; ///< A list of binding names, mNameToIndex points to these strings
UnorderedMap<string_view, uint> mNameToIndex; ///< Maps names to indices for the shader parameters, using a string_view so we can do find() without an allocation
};
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,443 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/ComputeSystemDX12.h>
#include <Jolt/Compute/DX12/ComputeQueueDX12.h>
#include <Jolt/Compute/DX12/ComputeShaderDX12.h>
#include <Jolt/Compute/DX12/ComputeBufferDX12.h>
#include <Jolt/Core/StringTools.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
JPH_MSVC_SUPPRESS_WARNING(5204) // 'X': class has virtual functions, but its trivial destructor is not virtual; instances of objects derived from this class may not be destructed correctly
JPH_MSVC2026_PLUS_SUPPRESS_WARNING(4865) // wingdi.h(2806,1): '<unnamed-enum-DISPLAYCONFIG_OUTPUT_TECHNOLOGY_OTHER>': the underlying type will change from 'int' to '__int64' when '/Zc:enumTypes' is specified on the command line
#include <fstream>
#include <d3dcompiler.h>
#include <dxcapi.h>
#ifdef JPH_DEBUG
#include <d3d12sdklayers.h>
#endif
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// RTTI registration: exposes ComputeSystemDX12 to Jolt's reflection system with ComputeSystem as base class
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemDX12)
{
JPH_ADD_BASE_CLASS(ComputeSystemDX12, ComputeSystem)
}
// Stores the externally created device (the ComPtr assignment takes a reference) and the debug
// mode that CreateComputeShader will use when compiling shaders.
void ComputeSystemDX12::Initialize(ID3D12Device *inDevice, EDebug inDebug)
{
mDevice = inDevice;
mDebug = inDebug;
}
// Releases the reference to the device that was taken in Initialize
void ComputeSystemDX12::Shutdown()
{
mDevice.Reset();
}
/// Creates a committed buffer resource on the requested heap.
/// @param inHeapType Heap to allocate from (default / upload / readback).
/// @param inResourceState Initial resource state of the buffer.
/// @param inFlags Extra resource flags (e.g. allow unordered access).
/// @param inSize Size of the buffer in bytes.
/// @return The new resource, or null when creation failed (HRFailed traces the error).
ComPtr<ID3D12Resource> ComputeSystemDX12::CreateD3DResource(D3D12_HEAP_TYPE inHeapType, D3D12_RESOURCE_STATES inResourceState, D3D12_RESOURCE_FLAGS inFlags, uint64 inSize)
{
	// Describe a 1D buffer of inSize bytes (buffers use an unknown format and row major layout)
	D3D12_RESOURCE_DESC resource_desc = {};
	resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
	resource_desc.Alignment = 0;
	resource_desc.Width = inSize;
	resource_desc.Height = 1;
	resource_desc.DepthOrArraySize = 1;
	resource_desc.MipLevels = 1;
	resource_desc.Format = DXGI_FORMAT_UNKNOWN;
	resource_desc.SampleDesc.Count = 1;
	resource_desc.SampleDesc.Quality = 0;
	resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
	resource_desc.Flags = inFlags;

	// Describe the heap the resource should live on
	D3D12_HEAP_PROPERTIES heap_props = {};
	heap_props.Type = inHeapType;
	heap_props.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
	heap_props.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
	heap_props.CreationNodeMask = 1;
	heap_props.VisibleNodeMask = 1;

	// Create the committed resource; return null on failure
	ComPtr<ID3D12Resource> new_resource;
	return HRFailed(mDevice->CreateCommittedResource(&heap_props, D3D12_HEAP_FLAG_NONE, &resource_desc, inResourceState, nullptr, IID_PPV_ARGS(&new_resource)))? nullptr : new_resource;
}
/// Loads "<inName>.hlsl" through mShaderLoader, compiles it with FXC (cs_5_0) or DXC (cs_6_0)
/// depending on JPH_USE_DXC, derives a root signature from shader reflection (one root
/// CBV/SRV/UAV descriptor per bound resource) and returns everything wrapped in a
/// ComputeShaderDX12. On any failure the returned result carries an error message instead.
ComputeShaderResult ComputeSystemDX12::CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ)
{
ComputeShaderResult result;
// Read shader source file
Array<uint8> data;
String error;
String file_name = String(inName) + ".hlsl";
if (!mShaderLoader(file_name.c_str(), data, error))
{
result.SetError(error);
return result;
}
// ----- Legacy FXC (D3DCompile) path -----
#ifndef JPH_USE_DXC // Use FXC, the old shader compiler?
UINT flags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_WARNINGS_ARE_ERRORS | D3DCOMPILE_ALL_RESOURCES_BOUND;
#ifdef JPH_DEBUG
flags |= D3DCOMPILE_SKIP_OPTIMIZATION;
#else
flags |= D3DCOMPILE_OPTIMIZATION_LEVEL3;
#endif
if (mDebug == EDebug::DebugSymbols)
flags |= D3DCOMPILE_DEBUG;
// No preprocessor defines, the list is terminated by a null entry
const D3D_SHADER_MACRO defines[] =
{
{ nullptr, nullptr }
};
// Handles loading include files through the shader loader
struct IncludeHandler : public ID3DInclude
{
IncludeHandler(const ShaderLoader &inShaderLoader) : mShaderLoader(inShaderLoader) { }
virtual ~IncludeHandler() = default;
// Called by the compiler for every #include; the returned memory is freed in Close below
STDMETHOD (Open)(D3D_INCLUDE_TYPE, LPCSTR inFileName, LPCVOID, LPCVOID *outData, UINT *outNumBytes) override
{
// Read the header file
Array<uint8> file_data;
String error;
if (!mShaderLoader(inFileName, file_data, error))
return E_FAIL;
if (file_data.empty())
{
*outData = nullptr;
*outNumBytes = 0;
return S_OK;
}
// Copy to a new memory block
void *mem = CoTaskMemAlloc(file_data.size());
if (mem == nullptr)
return E_OUTOFMEMORY;
memcpy(mem, file_data.data(), file_data.size());
*outData = mem;
*outNumBytes = (UINT)file_data.size();
return S_OK;
}
STDMETHOD (Close)(LPCVOID inData) override
{
if (inData != nullptr)
CoTaskMemFree(const_cast<void *>(inData));
return S_OK;
}
private:
const ShaderLoader & mShaderLoader;
};
IncludeHandler include_handler(mShaderLoader);
// Compile source
ComPtr<ID3DBlob> shader_blob, error_blob;
if (FAILED(D3DCompile(&data[0],
(uint)data.size(),
file_name.c_str(),
defines,
&include_handler,
"main",
"cs_5_0",
flags,
0,
shader_blob.GetAddressOf(),
error_blob.GetAddressOf())))
{
if (error_blob)
result.SetError((const char *)error_blob->GetBufferPointer());
else
result.SetError("Shader compile error");
return result;
}
// Get shader description
ComPtr<ID3D12ShaderReflection> reflector;
if (FAILED(D3DReflect(shader_blob->GetBufferPointer(), shader_blob->GetBufferSize(), IID_PPV_ARGS(&reflector))))
{
result.SetError("Failed to reflect shader");
return result;
}
// ----- Modern DXC path -----
#else
ComPtr<IDxcUtils> utils;
DxcCreateInstance(CLSID_DxcUtils, IID_PPV_ARGS(utils.GetAddressOf()));
// Custom include handler that forwards include loads to mShaderLoader
struct DxcIncludeHandler : public IDxcIncludeHandler
{
DxcIncludeHandler(IDxcUtils *inUtils, const ShaderLoader &inLoader) : mUtils(inUtils), mShaderLoader(inLoader) { }
virtual ~DxcIncludeHandler() = default;
// QueryInterface is not expected to be called; the handler is only used through the pointer we pass to Compile
STDMETHODIMP QueryInterface(REFIID riid, void **ppvObject) override
{
JPH_ASSERT(false);
return E_NOINTERFACE;
}
STDMETHODIMP_(ULONG) AddRef(void) override
{
// Allocated on the stack, we don't do ref counting
return 1;
}
STDMETHODIMP_(ULONG) Release(void) override
{
// Allocated on the stack, we don't do ref counting
return 1;
}
// IDxcIncludeHandler::LoadSource uses IDxcBlob**
STDMETHODIMP LoadSource(LPCWSTR inFilename, IDxcBlob **outIncludeSource) override
{
*outIncludeSource = nullptr;
// Convert to UTF-8
// NOTE(review): conversion into a fixed MAX_PATH buffer, very long include paths would be truncated silently
char file_name[MAX_PATH];
WideCharToMultiByte(CP_UTF8, 0, inFilename, -1, file_name, sizeof(file_name), nullptr, nullptr);
// Load the header
Array<uint8> file_data;
String error;
if (!mShaderLoader(file_name, file_data, error))
return E_FAIL;
// Create a blob from the loaded data
ComPtr<IDxcBlobEncoding> blob_encoder;
HRESULT hr = mUtils->CreateBlob(file_data.empty()? nullptr : file_data.data(), (uint)file_data.size(), CP_UTF8, blob_encoder.GetAddressOf());
if (FAILED(hr))
return hr;
// Return as IDxcBlob
*outIncludeSource = blob_encoder.Detach();
return S_OK;
}
IDxcUtils * mUtils;
const ShaderLoader & mShaderLoader;
};
DxcIncludeHandler include_handler(utils.Get(), mShaderLoader);
ComPtr<IDxcBlobEncoding> source;
if (HRFailed(utils->CreateBlob(data.data(), (uint)data.size(), CP_UTF8, source.GetAddressOf()), result))
return result;
ComPtr<IDxcCompiler3> compiler;
DxcCreateInstance(CLSID_DxcCompiler, IID_PPV_ARGS(compiler.GetAddressOf()));
// Build the command line arguments for the compiler
Array<LPCWSTR> arguments;
arguments.push_back(L"-E");
arguments.push_back(L"main");
arguments.push_back(L"-T");
arguments.push_back(L"cs_6_0");
arguments.push_back(DXC_ARG_WARNINGS_ARE_ERRORS);
arguments.push_back(DXC_ARG_OPTIMIZATION_LEVEL3);
arguments.push_back(DXC_ARG_ALL_RESOURCES_BOUND);
if (mDebug == EDebug::DebugSymbols)
{
arguments.push_back(DXC_ARG_DEBUG);
arguments.push_back(L"-Qembed_debug");
}
// Provide file name so tools know what the original shader was called (the actual source comes from the blob)
wchar_t w_file_name[MAX_PATH];
MultiByteToWideChar(CP_UTF8, 0, file_name.c_str(), -1, w_file_name, MAX_PATH);
arguments.push_back(w_file_name);
// Compile the shader
DxcBuffer source_buffer;
source_buffer.Ptr = source->GetBufferPointer();
source_buffer.Size = source->GetBufferSize();
source_buffer.Encoding = 0;
ComPtr<IDxcResult> compile_result;
if (FAILED(compiler->Compile(&source_buffer, arguments.data(), (uint32)arguments.size(), &include_handler, IID_PPV_ARGS(compile_result.GetAddressOf()))))
{
result.SetError("Failed to compile shader");
return result;
}
// Check for compilation errors
ComPtr<IDxcBlobUtf8> errors;
compile_result->GetOutput(DXC_OUT_ERRORS, IID_PPV_ARGS(errors.GetAddressOf()), nullptr);
if (errors != nullptr && errors->GetStringLength() > 0)
{
result.SetError((const char *)errors->GetBufferPointer());
return result;
}
// Get the compiled shader code
ComPtr<ID3DBlob> shader_blob;
if (HRFailed(compile_result->GetOutput(DXC_OUT_OBJECT, IID_PPV_ARGS(shader_blob.GetAddressOf()), nullptr), result))
return result;
// Get reflection data
ComPtr<IDxcBlob> reflection_data;
if (HRFailed(compile_result->GetOutput(DXC_OUT_REFLECTION, IID_PPV_ARGS(reflection_data.GetAddressOf()), nullptr), result))
return result;
DxcBuffer reflection_buffer;
reflection_buffer.Ptr = reflection_data->GetBufferPointer();
reflection_buffer.Size = reflection_data->GetBufferSize();
reflection_buffer.Encoding = 0;
ComPtr<ID3D12ShaderReflection> reflector;
if (HRFailed(utils->CreateReflection(&reflection_buffer, IID_PPV_ARGS(reflector.GetAddressOf())), result))
return result;
#endif // JPH_USE_DXC
// ----- Common path: reflect bindings, build root signature and pipeline state -----
// Get the shader description
D3D12_SHADER_DESC shader_desc;
if (HRFailed(reflector->GetDesc(&shader_desc), result))
return result;
// Verify that the group sizes match the shader's thread group size
UINT thread_group_size_x, thread_group_size_y, thread_group_size_z;
if (HRFailed(reflector->GetThreadGroupSize(&thread_group_size_x, &thread_group_size_y, &thread_group_size_z), result))
return result;
JPH_ASSERT(inGroupSizeX == thread_group_size_x, "Group size X mismatch");
JPH_ASSERT(inGroupSizeY == thread_group_size_y, "Group size Y mismatch");
JPH_ASSERT(inGroupSizeZ == thread_group_size_z, "Group size Z mismatch");
// Convert parameters to root signature description
// Every bound resource becomes one root descriptor (no descriptor tables); the root parameter
// index doubles as the value stored in name_to_index for later lookup by parameter name
Array<String> binding_names;
binding_names.reserve(shader_desc.BoundResources);
UnorderedMap<string_view, uint> name_to_index;
Array<D3D12_ROOT_PARAMETER1> root_params;
for (UINT i = 0; i < shader_desc.BoundResources; ++i)
{
D3D12_SHADER_INPUT_BIND_DESC bind_desc;
reflector->GetResourceBindingDesc(i, &bind_desc);
D3D12_ROOT_PARAMETER1 param = {};
param.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
switch (bind_desc.Type)
{
case D3D_SIT_CBUFFER:
param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
break;
case D3D_SIT_STRUCTURED:
case D3D_SIT_BYTEADDRESS:
param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV;
break;
case D3D_SIT_UAV_RWTYPED:
case D3D_SIT_UAV_RWSTRUCTURED:
case D3D_SIT_UAV_RWBYTEADDRESS:
case D3D_SIT_UAV_APPEND_STRUCTURED:
case D3D_SIT_UAV_CONSUME_STRUCTURED:
case D3D_SIT_UAV_RWSTRUCTURED_WITH_COUNTER:
param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV;
break;
case D3D_SIT_TBUFFER:
case D3D_SIT_TEXTURE:
case D3D_SIT_SAMPLER:
case D3D_SIT_RTACCELERATIONSTRUCTURE:
case D3D_SIT_UAV_FEEDBACKTEXTURE:
JPH_ASSERT(false, "Unsupported shader input type");
continue;
}
param.Descriptor.RegisterSpace = bind_desc.Space;
param.Descriptor.ShaderRegister = bind_desc.BindPoint;
param.Descriptor.Flags = D3D12_ROOT_DESCRIPTOR_FLAG_DATA_VOLATILE;
binding_names.push_back(bind_desc.Name); // Add all strings to a pool to keep them alive
name_to_index[string_view(binding_names.back())] = (uint)root_params.size();
root_params.push_back(param);
}
// Create the root signature
D3D12_VERSIONED_ROOT_SIGNATURE_DESC root_sig_desc = {};
root_sig_desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
root_sig_desc.Desc_1_1.NumParameters = (UINT)root_params.size();
root_sig_desc.Desc_1_1.pParameters = root_params.data();
root_sig_desc.Desc_1_1.NumStaticSamplers = 0;
root_sig_desc.Desc_1_1.pStaticSamplers = nullptr;
root_sig_desc.Desc_1_1.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
ComPtr<ID3DBlob> serialized_sig;
ComPtr<ID3DBlob> root_sig_error_blob;
if (FAILED(D3D12SerializeVersionedRootSignature(&root_sig_desc, &serialized_sig, &root_sig_error_blob)))
{
if (root_sig_error_blob)
{
error = StringFormat("Failed to create root signature: %s", (const char *)root_sig_error_blob->GetBufferPointer());
result.SetError(error);
}
else
result.SetError("Failed to create root signature");
return result;
}
ComPtr<ID3D12RootSignature> root_sig;
if (FAILED(mDevice->CreateRootSignature(0, serialized_sig->GetBufferPointer(), serialized_sig->GetBufferSize(), IID_PPV_ARGS(&root_sig))))
{
result.SetError("Failed to create root signature");
return result;
}
// Create a pipeline state object from the root signature and the shader
ComPtr<ID3D12PipelineState> pipeline_state;
D3D12_COMPUTE_PIPELINE_STATE_DESC compute_state_desc = {};
compute_state_desc.pRootSignature = root_sig.Get();
compute_state_desc.CS = { shader_blob->GetBufferPointer(), shader_blob->GetBufferSize() };
if (FAILED(mDevice->CreateComputePipelineState(&compute_state_desc, IID_PPV_ARGS(&pipeline_state))))
{
result.SetError("Failed to create compute pipeline state");
return result;
}
// Set name on DX12 objects for easier debugging
wchar_t w_name[1024];
size_t converted_chars = 0;
mbstowcs_s(&converted_chars, w_name, 1024, inName, _TRUNCATE);
pipeline_state->SetName(w_name);
result.Set(new ComputeShaderDX12(shader_blob, root_sig, pipeline_state, std::move(binding_names), std::move(name_to_index), inGroupSizeX, inGroupSizeY, inGroupSizeZ));
return result;
}
/// Creates a GPU buffer of inSize elements of inStride bytes, optionally filled with inData.
/// @return The new buffer on success, an error result when the underlying D3D12 resource could not be created.
ComputeBufferResult ComputeSystemDX12::CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData)
{
	// Construct the wrapper first, then let it allocate the D3D12 resource and upload inData
	Ref<ComputeBufferDX12> new_buffer = new ComputeBufferDX12(this, inType, inSize, inStride);

	ComputeBufferResult result;
	if (new_buffer->Initialize(inData))
		result.Set(new_buffer.GetPtr());
	else
		result.SetError("Failed to create compute buffer");
	return result;
}
/// Creates a command queue on the dedicated compute engine of the device.
/// @return The new queue, or an error result filled in by ComputeQueueDX12::Initialize on failure.
ComputeQueueResult ComputeSystemDX12::CreateComputeQueue()
{
	ComputeQueueResult result;

	// Initialize stores its error message in result when it fails
	Ref<ComputeQueueDX12> new_queue = new ComputeQueueDX12();
	if (new_queue->Initialize(mDevice.Get(), D3D12_COMMAND_LIST_TYPE_COMPUTE, result))
		result.Set(new_queue.GetPtr());
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,52 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/UnorderedMap.h>
#include <Jolt/Compute/ComputeSystem.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/IncludeDX12.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the GPU using DirectX 12.
/// Minimal implementation that can integrate with your own DirectX 12 setup.
class JPH_EXPORT ComputeSystemDX12 : public ComputeSystem
{
public:
JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemDX12)
/// How we want to compile our shaders
enum class EDebug
{
NoDebugSymbols,
DebugSymbols
};
/// Initialize / shutdown
/// Initialize stores a reference to the externally created device, Shutdown releases it again
void Initialize(ID3D12Device *inDevice, EDebug inDebug);
void Shutdown();
// See: ComputeSystem
virtual ComputeShaderResult CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) override;
virtual ComputeBufferResult CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData = nullptr) override;
virtual ComputeQueueResult CreateComputeQueue() override;
/// Access to the DX12 device
ID3D12Device * GetDevice() const { return mDevice.Get(); }
// Function to create a ID3D12Resource on specified heap with specified state
ComPtr<ID3D12Resource> CreateD3DResource(D3D12_HEAP_TYPE inHeapType, D3D12_RESOURCE_STATES inResourceState, D3D12_RESOURCE_FLAGS inFlags, uint64 inSize);
private:
ComPtr<ID3D12Device> mDevice; ///< Device all resources and queues are created on (set by Initialize)
EDebug mDebug = EDebug::NoDebugSymbols; ///< Whether CreateComputeShader compiles with debug symbols
};
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,154 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/ComputeSystemDX12Impl.h>
#ifdef JPH_DEBUG
#include <d3d12sdklayers.h>
#endif
JPH_NAMESPACE_BEGIN
// RTTI registration: exposes ComputeSystemDX12Impl with ComputeSystemDX12 as base class
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemDX12Impl)
{
JPH_ADD_BASE_CLASS(ComputeSystemDX12Impl, ComputeSystemDX12)
}
// Tears down in reverse creation order: first release the device (base class Shutdown),
// then the DXGI factory, so that the debug leak report below runs against a clean state.
ComputeSystemDX12Impl::~ComputeSystemDX12Impl()
{
Shutdown();
mDXGIFactory.Reset();
#ifdef JPH_DEBUG
// Test for leaks
ComPtr<IDXGIDebug1> dxgi_debug;
if (SUCCEEDED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&dxgi_debug))))
dxgi_debug->ReportLiveObjects(DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_ALL);
#endif
}
// Creates the DXGI factory, selects an adapter (hardware adapters are tried first; software
// adapters only when no hardware adapter supports D3D12), creates the D3D12 device and, in
// debug builds, configures break-on-error. Returns false on failure, in which case outResult
// carries the error message.
bool ComputeSystemDX12Impl::Initialize(ComputeSystemResult &outResult)
{
#if defined(JPH_DEBUG)
// Enable the D3D12 debug layer
ComPtr<ID3D12Debug> debug_controller;
if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debug_controller))))
debug_controller->EnableDebugLayer();
#endif
// Create DXGI factory
if (HRFailed(CreateDXGIFactory1(IID_PPV_ARGS(&mDXGIFactory)), outResult))
return false;
// Find adapter
ComPtr<IDXGIAdapter1> adapter;
ComPtr<ID3D12Device> device;
HRESULT result = E_FAIL;
// First check if we have the Windows 1803 IDXGIFactory6 interface
ComPtr<IDXGIFactory6> factory6;
if (SUCCEEDED(mDXGIFactory->QueryInterface(IID_PPV_ARGS(&factory6))))
{
// Pass 0 only considers hardware adapters, pass 1 allows software adapters
for (int search_software = 0; search_software < 2 && device == nullptr; ++search_software)
for (UINT index = 0; factory6->EnumAdapterByGpuPreference(index, DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, IID_PPV_ARGS(&adapter)) != DXGI_ERROR_NOT_FOUND; ++index)
{
DXGI_ADAPTER_DESC1 desc;
adapter->GetDesc1(&desc);
// We don't want software renderers in the first pass
int is_software = (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0? 1 : 0;
if (search_software != is_software)
continue;
// Check to see whether the adapter supports Direct3D 12
#if defined(JPH_PLATFORM_WINDOWS) && defined(_DEBUG)
int prev_state = _CrtSetDbgFlag(0); // Temporarily disable leak detection as this call reports false positives
#endif
result = D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&device));
#if defined(JPH_PLATFORM_WINDOWS) && defined(_DEBUG)
_CrtSetDbgFlag(prev_state);
#endif
if (SUCCEEDED(result))
break;
}
}
else
{
// Fall back to the older method that may not get the fastest GPU
// NOTE(review): intentionally mirrors the loop above, differing only in the enumeration call
for (int search_software = 0; search_software < 2 && device == nullptr; ++search_software)
for (UINT index = 0; mDXGIFactory->EnumAdapters1(index, &adapter) != DXGI_ERROR_NOT_FOUND; ++index)
{
DXGI_ADAPTER_DESC1 desc;
adapter->GetDesc1(&desc);
// We don't want software renderers in the first pass
int is_software = (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0? 1 : 0;
if (search_software != is_software)
continue;
// Check to see whether the adapter supports Direct3D 12
#if defined(JPH_PLATFORM_WINDOWS) && defined(_DEBUG)
int prev_state = _CrtSetDbgFlag(0); // Temporarily disable leak detection as this call reports false positives
#endif
result = D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&device));
#if defined(JPH_PLATFORM_WINDOWS) && defined(_DEBUG)
_CrtSetDbgFlag(prev_state);
#endif
if (SUCCEEDED(result))
break;
}
}
// Check if we managed to obtain a device
if (HRFailed(result, outResult))
return false;
// Initialize the compute interface
ComputeSystemDX12::Initialize(device.Get(), EDebug::DebugSymbols);
#ifdef JPH_DEBUG
// Enable breaking on errors
ComPtr<ID3D12InfoQueue> info_queue;
if (SUCCEEDED(device.As(&info_queue)))
{
info_queue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, TRUE);
info_queue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, TRUE);
info_queue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_WARNING, TRUE);
// Disable an error that triggers on Windows 11 with a hybrid graphic system
// See: https://stackoverflow.com/questions/69805245/directx-12-application-is-crashing-in-windows-11
D3D12_MESSAGE_ID hide[] =
{
D3D12_MESSAGE_ID_RESOURCE_BARRIER_MISMATCHING_COMMAND_LIST_TYPE,
};
D3D12_INFO_QUEUE_FILTER filter = { };
filter.DenyList.NumIDs = static_cast<UINT>(std::size(hide));
filter.DenyList.pIDList = hide;
info_queue->AddStorageFilterEntries(&filter);
}
#endif // JPH_DEBUG
return true;
}
/// Factory function that creates and fully initializes a DX12 compute system.
/// @return The new system, or an error result filled in by Initialize on failure.
ComputeSystemResult CreateComputeSystemDX12()
{
	ComputeSystemResult result;

	// Initialize stores its error message in result when it fails
	Ref<ComputeSystemDX12Impl> new_system = new ComputeSystemDX12Impl();
	if (new_system->Initialize(result))
		result.Set(new_system.GetPtr());
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,33 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_DX12
#include <Jolt/Compute/DX12/ComputeSystemDX12.h>
JPH_NAMESPACE_BEGIN
/// Implementation of ComputeSystemDX12 that fully initializes DirectX 12
class JPH_EXPORT ComputeSystemDX12Impl : public ComputeSystemDX12
{
public:
JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemDX12Impl)
/// Destructor
/// Shuts down the system and, in debug builds, reports leaked D3D/DXGI objects
virtual ~ComputeSystemDX12Impl() override;
/// Initialize the compute system
/// Creates the DXGI factory, selects an adapter and creates the D3D12 device.
/// Returns false on failure, in which case outResult contains the error.
bool Initialize(ComputeSystemResult &outResult);
/// Access to the DXGI factory created by Initialize
IDXGIFactory4 * GetDXGIFactory() const { return mDXGIFactory.Get(); }
private:
ComPtr<IDXGIFactory4> mDXGIFactory; ///< Factory used to enumerate adapters, released in the destructor
};
JPH_NAMESPACE_END
#endif // JPH_USE_DX12

View File

@ -0,0 +1,49 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/IncludeWindows.h>
#include <Jolt/Core/StringTools.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
JPH_MSVC_SUPPRESS_WARNING(4265) // 'X': class has virtual functions, but its non-trivial destructor is not virtual; instances of this class may not be destructed correctly
JPH_MSVC_SUPPRESS_WARNING(4625) // 'X': copy constructor was implicitly defined as deleted
JPH_MSVC_SUPPRESS_WARNING(4626) // 'X': assignment operator was implicitly defined as deleted
JPH_MSVC_SUPPRESS_WARNING(5204) // 'X': class has virtual functions, but its trivial destructor is not virtual; instances of objects derived from this class may not be destructed correctly
JPH_MSVC_SUPPRESS_WARNING(5220) // 'X': a non-static data member with a volatile qualified type no longer implies
JPH_MSVC2026_PLUS_SUPPRESS_WARNING(4865) // wingdi.h(2806,1): '<unnamed-enum-DISPLAYCONFIG_OUTPUT_TECHNOLOGY_OTHER>': the underlying type will change from 'int' to '__int64' when '/Zc:enumTypes' is specified on the command line
#include <d3d12.h>
#include <dxgi1_6.h>
#include <dxgidebug.h>
#include <wrl.h>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
using Microsoft::WRL::ComPtr;
/// Checks a COM HRESULT; on failure stores a formatted error message in outResult and asserts.
/// @param inHR Result code returned by a COM / D3D12 call
/// @param outResult Result object (e.g. ComputeShaderResult) that receives the error message on failure
/// @return True when inHR is a failure code, false on success
template <class Result>
inline bool HRFailed(HRESULT inHR, Result &outResult)
{
	if (SUCCEEDED(inHR))
		return false;

	// Cast to a fixed-width unsigned type: HRESULT is a signed long, which does not match
	// the "%08X" (unsigned int) format specifier
	String error = StringFormat("Call failed with error code: %08X", static_cast<uint32>(inHR));
	outResult.SetError(error);
	JPH_ASSERT(false);
	return true;
}
/// Checks a COM HRESULT; on failure traces the error code and asserts.
/// @param inHR Result code returned by a COM / D3D12 call
/// @return True when inHR is a failure code, false on success
inline bool HRFailed(HRESULT inHR)
{
	if (SUCCEEDED(inHR))
		return false;

	// Cast so the argument matches the "%08X" (unsigned int) format specifier (HRESULT is a signed long)
	Trace("Call failed with error code: %08X", static_cast<uint32>(inHR));
	JPH_ASSERT(false);
	return true;
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,39 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeSystemMTL.h>
JPH_NAMESPACE_BEGIN
/// Buffer that can be read from / written to by a compute shader
class JPH_EXPORT ComputeBufferMTL final : public ComputeBuffer
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor, stores the parameters; the Metal buffer itself is created by Initialize
	ComputeBufferMTL(ComputeSystemMTL *inComputeSystem, EType inType, uint64 inSize, uint inStride);
	virtual ~ComputeBufferMTL() override;

	/// Allocates the Metal buffer, optionally filled with inData. Returns false when allocation failed.
	bool Initialize(const void *inData);

	// See: ComputeBuffer
	virtual ComputeBufferResult CreateReadBackBuffer() const override;

	/// Access to the underlying Metal buffer
	id<MTLBuffer> GetBuffer() const { return mBuffer; }

private:
	virtual void * MapInternal(EMode inMode) override;
	virtual void UnmapInternal() override;

	// This code builds without ARC (the .mm releases mBuffer explicitly), so Objective-C pointer
	// members are not implicitly nil'ed: initialize them here so the destructor's release is safe
	// even when Initialize was never called or failed.
	ComputeSystemMTL * mComputeSystem = nullptr;	///< System that created this buffer (set in the constructor)
	id<MTLBuffer> mBuffer = nil;					///< The Metal buffer, created in Initialize
};
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,52 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeBufferMTL.h>
JPH_NAMESPACE_BEGIN
// Stores type/size/stride in the base class and remembers the owning system;
// the actual GPU allocation is deferred to Initialize
ComputeBufferMTL::ComputeBufferMTL(ComputeSystemMTL *inComputeSystem, EType inType, uint64 inSize, uint inStride) :
ComputeBuffer(inType, inSize, inStride),
mComputeSystem(inComputeSystem)
{
}
/// Allocates the Metal buffer of mSize * mStride bytes, optionally copying inData into it.
/// @return False when the allocation failed.
bool ComputeBufferMTL::Initialize(const void *inData)
{
	// Shared storage so the CPU can map the buffer directly; hazard tracking lets Metal order GPU accesses
	const MTLResourceOptions buffer_options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared | MTLResourceHazardTrackingModeTracked;

	NSUInteger length = NSUInteger(mSize) * mStride;
	mBuffer = inData != nullptr?
		[mComputeSystem->GetDevice() newBufferWithBytes: inData length: length options: buffer_options] :
		[mComputeSystem->GetDevice() newBufferWithLength: length options: buffer_options];
	return mBuffer != nil;
}
// Balances the newBufferWith* allocation from Initialize (this file builds without ARC)
ComputeBufferMTL::~ComputeBufferMTL()
{
[mBuffer release];
}
// Returns a CPU pointer to the buffer contents. inMode is ignored: the buffer was created with
// MTLResourceStorageModeShared, so the same memory is directly visible to CPU and GPU.
void *ComputeBufferMTL::MapInternal(EMode inMode)
{
return mBuffer.contents;
}
// Nothing to do: shared storage mode needs no explicit flush when the CPU is done writing
void ComputeBufferMTL::UnmapInternal()
{
}
// The buffer lives in shared memory, so the CPU can read it directly: return this same buffer
// instead of allocating a staging copy (ComputeQueueMTL::ScheduleReadback asserts on this)
ComputeBufferResult ComputeBufferMTL::CreateReadBackBuffer() const
{
ComputeBufferResult result;
result.Set(const_cast<ComputeBufferMTL *>(this));
return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,49 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_MTL
#include <MetalKit/MetalKit.h>
#include <Jolt/Compute/ComputeQueue.h>
JPH_NAMESPACE_BEGIN
class ComputeShaderMTL;
/// A command queue for Metal for executing compute workloads on the GPU.
class JPH_EXPORT ComputeQueueMTL final : public ComputeQueue
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor / destructor
	ComputeQueueMTL(id<MTLDevice> inDevice);
	virtual ~ComputeQueueMTL() override;

	// See: ComputeQueue
	virtual void SetShader(const ComputeShader *inShader) override;
	virtual void SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
	virtual void SetBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
	virtual void SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier = EBarrier::Yes) override;
	virtual void ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc) override;
	virtual void Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ) override;
	virtual void Execute() override;
	virtual void Wait() override;

private:
	/// Lazily creates a command buffer and compute encoder when none is being recorded
	void BeginCommandBuffer();

	// This code builds without ARC (the .mm releases the queue explicitly), so Objective-C pointer
	// members are NOT implicitly nil'ed. They must be initialized here: BeginCommandBuffer compares
	// mCommandBuffer against nil before anything else ever assigns it.
	id<MTLCommandQueue> mCommandQueue = nil;				///< Queue command buffers are committed to, created in the constructor
	id<MTLCommandBuffer> mCommandBuffer = nil;				///< Command buffer currently being recorded, nil when none
	id<MTLComputeCommandEncoder> mComputeEncoder = nil;		///< Encoder for the current command buffer, nil when none
	RefConst<ComputeShaderMTL> mShader;						///< Currently bound shader (cleared by Execute)
	bool mIsExecuting = false;								///< True between Execute and Wait
};
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,123 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeQueueMTL.h>
#include <Jolt/Compute/MTL/ComputeShaderMTL.h>
#include <Jolt/Compute/MTL/ComputeBufferMTL.h>
#include <Jolt/Compute/MTL/ComputeSystemMTL.h>
JPH_NAMESPACE_BEGIN
// Waits for any committed work to finish before releasing the queue (this file builds without ARC)
ComputeQueueMTL::~ComputeQueueMTL()
{
Wait();
[mCommandQueue release];
}
/// Creates the Metal command queue on the given device.
ComputeQueueMTL::ComputeQueueMTL(id<MTLDevice> inDevice)
{
	// Create the command queue
	mCommandQueue = [inDevice newCommandQueue];

	// Explicitly clear the recording state: this file builds without ARC, so these members are not
	// implicitly nil and BeginCommandBuffer would otherwise read mCommandBuffer uninitialized
	mCommandBuffer = nil;
	mComputeEncoder = nil;
}
// Lazily starts recording: creates a command buffer plus compute encoder on the first call
// after construction or after Execute/Wait cleared the previous one
void ComputeQueueMTL::BeginCommandBuffer()
{
if (mCommandBuffer == nil)
{
// Start a new command buffer
mCommandBuffer = [mCommandQueue commandBuffer];
mComputeEncoder = [mCommandBuffer computeCommandEncoder];
}
}
// Makes inShader the active pipeline state. Must be called before the Set*Buffer/Dispatch
// functions since those dereference mShader for binding indices and group sizes.
void ComputeQueueMTL::SetShader(const ComputeShader *inShader)
{
BeginCommandBuffer();
mShader = static_cast<const ComputeShaderMTL *>(inShader);
[mComputeEncoder setComputePipelineState: mShader->GetPipelineState()];
}
// Binds a constant buffer to the slot named inName of the current shader.
// A null buffer is silently ignored; requires SetShader to have been called first.
void ComputeQueueMTL::SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::ConstantBuffer);
BeginCommandBuffer();
const ComputeBufferMTL *buffer = static_cast<const ComputeBufferMTL *>(inBuffer);
[mComputeEncoder setBuffer: buffer->GetBuffer() offset: 0 atIndex: mShader->NameToBindingIndex(inName)];
}
// Binds a (read-only from the shader's point of view) buffer to the slot named inName.
// A null buffer is silently ignored; requires SetShader to have been called first.
void ComputeQueueMTL::SetBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::UploadBuffer || inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
BeginCommandBuffer();
const ComputeBufferMTL *buffer = static_cast<const ComputeBufferMTL *>(inBuffer);
[mComputeEncoder setBuffer: buffer->GetBuffer() offset: 0 atIndex: mShader->NameToBindingIndex(inName)];
}
// Binds a read/write buffer to the slot named inName. inBarrier is unused here; presumably the
// buffers' MTLResourceHazardTrackingModeTracked mode makes Metal insert the required hazards
// automatically -- confirm before adding untracked resources.
void ComputeQueueMTL::SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier)
{
if (inBuffer == nullptr)
return;
JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);
BeginCommandBuffer();
const ComputeBufferMTL *buffer = static_cast<const ComputeBufferMTL *>(inBuffer);
[mComputeEncoder setBuffer: buffer->GetBuffer() offset: 0 atIndex: mShader->NameToBindingIndex(inName)];
}
// No-op on Metal: buffers use shared storage, so CreateReadBackBuffer returned the source buffer itself
void ComputeQueueMTL::ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc)
{
JPH_ASSERT(inDst == inSrc); // Since ComputeBuffer::CreateReadBackBuffer returns the same buffer, we don't need to copy
}
// Dispatches inThreadGroupsX * inThreadGroupsY * inThreadGroupsZ thread groups of the current
// shader; the per-group thread count comes from mShader, so SetShader must have been called first.
void ComputeQueueMTL::Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ)
{
BeginCommandBuffer();
MTLSize thread_groups = MTLSizeMake(inThreadGroupsX, inThreadGroupsY, inThreadGroupsZ);
MTLSize group_size = MTLSizeMake(mShader->GetGroupSizeX(), mShader->GetGroupSizeY(), mShader->GetGroupSizeZ());
[mComputeEncoder dispatchThreadgroups: thread_groups threadsPerThreadgroup: group_size];
}
// Ends encoding and commits the recorded command buffer to the GPU.
// Does nothing when no commands were recorded since the last Execute/Wait.
void ComputeQueueMTL::Execute()
{
// End command buffer
if (mCommandBuffer == nil)
return;
[mComputeEncoder endEncoding];
[mCommandBuffer commit];
mShader = nullptr; // The shader binding does not carry over to the next command buffer
mIsExecuting = true;
}
// Blocks until the last committed command buffer finished on the GPU, then clears the
// recording state so BeginCommandBuffer starts a fresh command buffer next time.
void ComputeQueueMTL::Wait()
{
if (!mIsExecuting)
return;
[mCommandBuffer waitUntilCompleted];
mComputeEncoder = nil;
mCommandBuffer = nil;
mIsExecuting = false;
}
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,39 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_MTL
#include <MetalKit/MetalKit.h>
#include <Jolt/Compute/ComputeShader.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_NAMESPACE_BEGIN
/// Compute shader handle for Metal
class JPH_EXPORT ComputeShaderMTL : public ComputeShader
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Constructor
/// @param inPipelineState Pipeline state for this shader (ownership is taken, released in the destructor)
/// @param inReflection Reflection data used to build the name -> binding index table
/// @param inGroupSizeX Thread group size X
/// @param inGroupSizeY Thread group size Y
/// @param inGroupSizeZ Thread group size Z
ComputeShaderMTL(id<MTLComputePipelineState> inPipelineState, MTLComputePipelineReflection *inReflection, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ);
virtual ~ComputeShaderMTL() override { [mPipelineState release]; }
/// Access to the function
id<MTLComputePipelineState> GetPipelineState() const { return mPipelineState; }
/// Get index of buffer name
/// Asserts when inName was not found in the reflection data
uint NameToBindingIndex(const char *inName) const;
private:
id<MTLComputePipelineState> mPipelineState; ///< The compiled pipeline state, set in the constructor
UnorderedMap<String, uint> mNameToBindingIndex; ///< Maps a shader parameter name to its Metal binding slot, filled from inReflection
};
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,34 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeShaderMTL.h>
JPH_NAMESPACE_BEGIN
/// Stores the pipeline state and builds the lookup table that maps shader parameter names
/// to Metal binding slots from the pipeline's reflection data.
ComputeShaderMTL::ComputeShaderMTL(id<MTLComputePipelineState> inPipelineState, MTLComputePipelineReflection *inReflection, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) :
	ComputeShader(inGroupSizeX, inGroupSizeY, inGroupSizeZ),
	mPipelineState(inPipelineState)
{
	// One entry per binding reported by the reflection data
	for (id<MTLBinding> shader_binding in inReflection.bindings)
		mNameToBindingIndex[[shader_binding.name UTF8String]] = uint(shader_binding.index);
}
// Look up the Metal binding index that was recorded for buffer inName during reflection
uint ComputeShaderMTL::NameToBindingIndex(const char *inName) const
{
	// The shader is expected to declare a buffer with this name
	auto binding = mNameToBindingIndex.find(inName);
	JPH_ASSERT(binding != mNameToBindingIndex.end());
	return binding->second;
}
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,40 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeSystem.h>
#ifdef JPH_USE_MTL
#include <MetalKit/MetalKit.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the GPU
class JPH_EXPORT ComputeSystemMTL : public ComputeSystem
{
public:
	JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemMTL)

	// Initialize / shutdown the compute system.
	// Initialize retains inDevice; Shutdown releases the device and the lazily loaded shader library.
	bool Initialize(id<MTLDevice> inDevice);
	void Shutdown();

	// See: ComputeSystem
	virtual ComputeShaderResult CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) override;
	virtual ComputeBufferResult CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData = nullptr) override;
	virtual ComputeQueueResult CreateComputeQueue() override;

	/// Get the metal device
	id<MTLDevice> GetDevice() const { return mDevice; }

private:
	id<MTLDevice> mDevice; ///< The Metal device, retained in Initialize
	id<MTLLibrary> mShaderLibrary; ///< Shader library, loaded lazily on the first CreateComputeShader call
};
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,110 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeSystemMTL.h>
#include <Jolt/Compute/MTL/ComputeBufferMTL.h>
#include <Jolt/Compute/MTL/ComputeShaderMTL.h>
#include <Jolt/Compute/MTL/ComputeQueueMTL.h>
JPH_NAMESPACE_BEGIN
// Register RTTI for ComputeSystemMTL with ComputeSystem as its base class
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemMTL)
{
	JPH_ADD_BASE_CLASS(ComputeSystemMTL, ComputeSystem)
}
// Store and retain the Metal device. The matching release happens in Shutdown.
bool ComputeSystemMTL::Initialize(id<MTLDevice> inDevice)
{
	mDevice = [inDevice retain];
	return true;
}
// Release the lazily loaded shader library and the device reference taken in Initialize.
// Sending release to nil is a no-op, so this is safe when the library was never loaded.
void ComputeSystemMTL::Shutdown()
{
	[mShaderLibrary release];
	[mDevice release];
}
// Create a compute shader by name. Lazily loads the shader library (Jolt.metallib) on first use,
// looks up the function and compiles a compute pipeline with reflection info so that buffer names
// can later be mapped to binding indices.
// Fixes vs. previous revision: corrected "laod" typo in the error message, a nil library without an
// NSError is now reported as a failure, and the MTLFunction is released on the success path too
// (this file uses manual retain/release, so keeping it would leak).
ComputeShaderResult ComputeSystemMTL::CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ)
{
	ComputeShaderResult result;

	if (mShaderLibrary == nil)
	{
		// Load the shader library containing all shaders
		Array<uint8> *data = new Array<uint8>();
		String error;
		if (!mShaderLoader("Jolt.metallib", *data, error))
		{
			result.SetError(error);
			delete data;
			return result;
		}

		// Convert to dispatch data; the destructor block frees the byte array once Metal no longer needs it
		dispatch_data_t data_dispatch = dispatch_data_create(data->data(), data->size(), nullptr, ^{ delete data; });

		// Create the library (a nil library without an explicit error is also a failure)
		NSError *ns_error = nullptr;
		mShaderLibrary = [mDevice newLibraryWithData: data_dispatch error: &ns_error];
		if (ns_error != nil || mShaderLibrary == nil)
		{
			result.SetError("Failed to load shader library");
			return result;
		}
	}

	// Get the shader function
	id<MTLFunction> function = [mShaderLibrary newFunctionWithName: [NSString stringWithCString: inName encoding: NSUTF8StringEncoding]];
	if (function == nil)
	{
		result.SetError("Failed to instantiate compute shader");
		return result;
	}

	// Create the pipeline, requesting reflection info so we can map buffer names to binding indices
	NSError *error = nil;
	MTLComputePipelineReflection *reflection = nil;
	id<MTLComputePipelineState> pipeline_state = [mDevice newComputePipelineStateWithFunction: function options: MTLPipelineOptionBindingInfo | MTLPipelineOptionBufferTypeInfo reflection: &reflection error: &error];
	if (error != nil || pipeline_state == nil)
	{
		result.SetError("Failed to create compute pipeline");
		[function release];
		return result;
	}

	// The pipeline state keeps what it needs from the function; drop our reference to avoid leaking it
	[function release];

	result.Set(new ComputeShaderMTL(pipeline_state, reflection, inGroupSizeX, inGroupSizeY, inGroupSizeZ));
	return result;
}
// Create a Metal compute buffer of the requested type, optionally initialized with inData.
// Returns an error result when the underlying buffer allocation fails.
ComputeBufferResult ComputeSystemMTL::CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData)
{
	ComputeBufferResult result;

	// Ref keeps the buffer alive while Initialize runs; on failure it is destroyed when the Ref goes out of scope
	Ref<ComputeBufferMTL> buffer = new ComputeBufferMTL(this, inType, inSize, inStride);
	if (!buffer->Initialize(inData))
	{
		result.SetError("Failed to create compute buffer");
		return result;
	}

	result.Set(buffer.GetPtr());
	return result;
}
// Create a command queue on this system's device for dispatching compute work
ComputeQueueResult ComputeSystemMTL::CreateComputeQueue()
{
	ComputeQueueResult result;
	result.Set(new ComputeQueueMTL(mDevice));
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,28 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeSystemMTL.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the GPU that fully initializes Metal.
/// Unlike ComputeSystemMTL (which wraps a caller-provided device), this class creates the
/// system default Metal device itself and releases it again on destruction.
class JPH_EXPORT ComputeSystemMTLImpl : public ComputeSystemMTL
{
public:
	JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemMTLImpl)

	/// Destructor, shuts down the base system and releases the self-created device
	virtual ~ComputeSystemMTLImpl() override;

	/// Initialize / shutdown the compute system
	bool Initialize();
};
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,49 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_MTL
#include <Jolt/Compute/MTL/ComputeSystemMTLImpl.h>
JPH_NAMESPACE_BEGIN
// Register RTTI for ComputeSystemMTLImpl with ComputeSystemMTL as its base class
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemMTLImpl)
{
	JPH_ADD_BASE_CLASS(ComputeSystemMTLImpl, ComputeSystemMTL)
}
// Destructor. Shutdown releases the retain taken in ComputeSystemMTL::Initialize; the extra release
// here balances the +1 reference from MTLCreateSystemDefaultDevice in Initialize below.
// NOTE(review): GetDevice() is called after Shutdown already released once — this relies on the
// device's refcount still being >= 1 at that point; confirm ordering is intentional.
ComputeSystemMTLImpl::~ComputeSystemMTLImpl()
{
	Shutdown();
	[GetDevice() release];
}
// Create the system default Metal device (returned with +1 retain) and hand it to the base
// class, which retains it again. Both references are released by the destructor.
bool ComputeSystemMTLImpl::Initialize()
{
	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
	return ComputeSystemMTL::Initialize(device);
}
// Factory function: create and initialize a fully self-contained Metal compute system.
// Returns an error result when device creation / initialization fails.
ComputeSystemResult CreateComputeSystemMTL()
{
	ComputeSystemResult result;
	Ref<ComputeSystemMTLImpl> compute = new ComputeSystemMTLImpl;
	if (!compute->Initialize())
	{
		result.SetError("Failed to initialize compute system");
		return result;
	}
	result.Set(compute.GetPtr());
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_MTL

View File

@ -0,0 +1,42 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/VK/IncludeVK.h>
#include <Jolt/Core/Reference.h>
#include <Jolt/Core/NonCopyable.h>
JPH_NAMESPACE_BEGIN
/// Simple wrapper class to manage a Vulkan memory block.
/// Ref-counted so that multiple BufferVK instances can sub-allocate from the same block.
class MemoryVK : public RefTarget<MemoryVK>, public NonCopyable
{
public:
	~MemoryVK()
	{
		// We should have unmapped and freed the block before destruction
		JPH_ASSERT(mMappedCount == 0);
		JPH_ASSERT(mMemory == VK_NULL_HANDLE);
	}

	VkDeviceMemory mMemory = VK_NULL_HANDLE; ///< The Vulkan memory handle
	VkDeviceSize mSize = 0; ///< Size of the memory block
	VkDeviceSize mBufferSize = 0; ///< Size of each of the buffers that this memory block has been divided into
	VkMemoryPropertyFlags mProperties = 0; ///< Vulkan memory properties used to allocate this block
	int mMappedCount = 0; ///< How often buffers using this memory block were mapped
	void * mMappedPtr = nullptr; ///< The CPU address of the memory block when mapped
};
/// Simple wrapper class to manage a Vulkan buffer.
/// Plain value type: copying copies the handles; ownership of the underlying memory is
/// tracked through the ref-counted MemoryVK member.
class BufferVK
{
public:
	Ref<MemoryVK> mMemory; ///< The memory block that contains the buffer (note that filling this in is optional if you do your own buffer allocation)
	VkBuffer mBuffer = VK_NULL_HANDLE; ///< The Vulkan buffer handle
	VkDeviceSize mOffset = 0; ///< Offset in the memory block where the buffer starts
	VkDeviceSize mSize = 0; ///< Real size of the buffer
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,140 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeBufferVK.h>
#include <Jolt/Compute/VK/ComputeSystemVK.h>
JPH_NAMESPACE_BEGIN
// Constructor: stores the owning compute system (used for all buffer allocation / mapping)
// and forwards type, element count and stride to the ComputeBuffer base class.
// Actual Vulkan resources are created in Initialize.
ComputeBufferVK::ComputeBufferVK(ComputeSystemVK *inComputeSystem, EType inType, uint64 inSize, uint inStride) :
	ComputeBuffer(inType, inSize, inStride),
	mComputeSystem(inComputeSystem)
{
}
// Allocate the Vulkan buffers for this compute buffer and optionally upload inData.
// Depending on the type this creates a host-visible staging buffer (mBufferCPU) and/or a
// device-local buffer (mBufferGPU). Returns false when any allocation fails.
bool ComputeBufferVK::Initialize(const void *inData)
{
	// Total byte size of the buffer
	VkDeviceSize buffer_size = VkDeviceSize(mSize * mStride);

	switch (mType)
	{
	case EType::Buffer:
		// A plain Buffer is immutable after creation, so initial data is mandatory
		JPH_ASSERT(inData != nullptr);
		[[fallthrough]];

	case EType::UploadBuffer:
	case EType::RWBuffer:
		// Host-visible staging buffer used as the copy source for CPU -> GPU uploads
		if (!mComputeSystem->CreateBuffer(buffer_size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, mBufferCPU))
			return false;
		// Device-local buffer that the shader actually reads/writes
		if (!mComputeSystem->CreateBuffer(buffer_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, mBufferGPU))
			return false;
		if (inData != nullptr)
		{
			// Write the initial contents into the staging buffer; the actual GPU copy is
			// deferred until the buffer is first bound (see SyncCPUToGPU)
			void *data = mComputeSystem->MapBuffer(mBufferCPU);
			memcpy(data, inData, size_t(buffer_size));
			mComputeSystem->UnmapBuffer(mBufferCPU);
			mNeedsSync = true;
		}
		break;

	case EType::ConstantBuffer:
		// Constant buffers are read directly from host-visible memory, no GPU-local copy
		if (!mComputeSystem->CreateBuffer(buffer_size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, mBufferCPU))
			return false;
		if (inData != nullptr)
		{
			void* data = mComputeSystem->MapBuffer(mBufferCPU);
			memcpy(data, inData, size_t(buffer_size));
			mComputeSystem->UnmapBuffer(mBufferCPU);
		}
		break;

	case EType::ReadbackBuffer:
		JPH_ASSERT(inData == nullptr, "Can't upload data to a readback buffer");
		// Host-visible destination for GPU -> CPU copies
		if (!mComputeSystem->CreateBuffer(buffer_size, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, mBufferCPU))
			return false;
		break;
	}

	return true;
}
// Destructor: return both buffers to the compute system. FreeBuffer on a buffer that was
// never created (or was released through ReleaseBufferCPU) is expected to be a no-op.
ComputeBufferVK::~ComputeBufferVK()
{
	mComputeSystem->FreeBuffer(mBufferGPU);
	mComputeSystem->FreeBuffer(mBufferCPU);
}
// Record a buffer memory barrier transitioning this buffer from its last recorded usage
// (mAccessStage / mAccessFlagBits) to the requested stage / access. Skipped when the state is
// already correct, unless inForce is set (used to order successive read/write dispatches).
void ComputeBufferVK::Barrier(VkCommandBuffer inCommandBuffer, VkPipelineStageFlags inToStage, VkAccessFlagBits inToFlags, bool inForce) const
{
	// Already in the requested state, no barrier needed
	if (mAccessStage == inToStage && mAccessFlagBits == inToFlags && !inForce)
		return;

	VkBufferMemoryBarrier b = {};
	b.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
	b.srcAccessMask = mAccessFlagBits;
	b.dstAccessMask = inToFlags;
	b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	// Barrier applies to the GPU buffer when there is one, otherwise to the CPU-only buffer
	// (constant / readback buffers have no GPU-local copy)
	b.buffer = mBufferGPU.mBuffer != VK_NULL_HANDLE? mBufferGPU.mBuffer : mBufferCPU.mBuffer;
	b.offset = 0;
	b.size = VK_WHOLE_SIZE;
	vkCmdPipelineBarrier(inCommandBuffer, mAccessStage, inToStage, 0, 0, nullptr, 1, &b, 0, nullptr);

	// Remember the new state for the next barrier
	mAccessStage = inToStage;
	mAccessFlagBits = inToFlags;
}
// Record a copy of the staging (CPU) buffer into the device-local (GPU) buffer when the CPU
// side was written since the last sync. Returns true when a copy was recorded.
bool ComputeBufferVK::SyncCPUToGPU(VkCommandBuffer inCommandBuffer) const
{
	// No pending CPU writes
	if (!mNeedsSync)
		return false;

	// Barrier before write
	Barrier(inCommandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, false);

	// Copy from CPU to GPU (full buffer)
	VkBufferCopy copy = {};
	copy.srcOffset = 0;
	copy.dstOffset = 0;
	copy.size = GetSize() * GetStride();
	vkCmdCopyBuffer(inCommandBuffer, mBufferCPU.mBuffer, mBufferGPU.mBuffer, 1, &copy);

	mNeedsSync = false;
	return true;
}
// Map the host-visible staging buffer into CPU address space.
// Read mode is only valid for readback buffers; write mode is only valid for upload /
// constant buffers and marks the buffer as needing a CPU -> GPU sync.
void *ComputeBufferVK::MapInternal(EMode inMode)
{
	switch (inMode)
	{
	case EMode::Read:
		JPH_ASSERT(mType == EType::ReadbackBuffer);
		break;

	case EMode::Write:
		JPH_ASSERT(mType == EType::UploadBuffer || mType == EType::ConstantBuffer);
		// The GPU copy (if any) is stale after this write; sync happens on next bind
		mNeedsSync = true;
		break;
	}

	return mComputeSystem->MapBuffer(mBufferCPU);
}
// Unmap the staging buffer, counterpart of MapInternal
void ComputeBufferVK::UnmapInternal()
{
	mComputeSystem->UnmapBuffer(mBufferCPU);
}
// Create a readback buffer with the same element count and stride as this buffer,
// suitable as the destination of a ScheduleReadback from this buffer.
ComputeBufferResult ComputeBufferVK::CreateReadBackBuffer() const
{
	return mComputeSystem->CreateComputeBuffer(ComputeBuffer::EType::ReadbackBuffer, mSize, mStride);
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,52 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeBuffer.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/BufferVK.h>
JPH_NAMESPACE_BEGIN
class ComputeSystemVK;
/// Buffer that can be read from / written to by a compute shader
class JPH_EXPORT ComputeBufferVK final : public ComputeBuffer
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor / destructor. The destructor returns the underlying buffers to the compute system.
	ComputeBufferVK(ComputeSystemVK *inComputeSystem, EType inType, uint64 inSize, uint inStride);
	virtual ~ComputeBufferVK() override;

	/// Allocate the Vulkan buffers (and optionally upload inData). Returns false on allocation failure.
	bool Initialize(const void *inData);

	// See: ComputeBuffer
	virtual ComputeBufferResult CreateReadBackBuffer() const override;

	/// Access to the host-visible staging buffer and the device-local buffer
	VkBuffer GetBufferCPU() const { return mBufferCPU.mBuffer; }
	VkBuffer GetBufferGPU() const { return mBufferGPU.mBuffer; }

	/// Transfer ownership of the staging buffer to the caller (used to free it once the initial upload is done)
	BufferVK ReleaseBufferCPU() const { BufferVK tmp = mBufferCPU; mBufferCPU = BufferVK(); return tmp; }

	/// Record a memory barrier transitioning the buffer to the given stage / access (see .cpp)
	void Barrier(VkCommandBuffer inCommandBuffer, VkPipelineStageFlags inToStage, VkAccessFlagBits inToFlags, bool inForce) const;

	/// Record a CPU -> GPU copy if the staging buffer was written; returns true when a copy was recorded
	bool SyncCPUToGPU(VkCommandBuffer inCommandBuffer) const;

private:
	virtual void * MapInternal(EMode inMode) override;
	virtual void UnmapInternal() override;

	ComputeSystemVK * mComputeSystem; ///< System that owns the underlying allocations
	mutable BufferVK mBufferCPU; ///< Host-visible staging buffer (mutable: released lazily after first upload)
	BufferVK mBufferGPU; ///< Device-local buffer used by shaders
	mutable bool mNeedsSync = false; ///< If this buffer needs to be synced from CPU to GPU
	mutable VkAccessFlagBits mAccessFlagBits = VK_ACCESS_SHADER_READ_BIT; ///< Access flags of the last usage, used for barriers
	mutable VkPipelineStageFlags mAccessStage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; ///< Pipeline stage of the last usage, used for barriers
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,304 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeQueueVK.h>
#include <Jolt/Compute/VK/ComputeBufferVK.h>
#include <Jolt/Compute/VK/ComputeSystemVK.h>
JPH_NAMESPACE_BEGIN
// Destructor: wait for any in-flight work so the GPU is no longer using our resources,
// then destroy all Vulkan objects created in Initialize.
ComputeQueueVK::~ComputeQueueVK()
{
	// Ensure the GPU is done with the command buffer / descriptor sets before destroying them
	Wait();

	VkDevice device = mComputeSystem->GetDevice();

	// Command buffer must be freed before its pool is destroyed
	if (mCommandBuffer != VK_NULL_HANDLE)
		vkFreeCommandBuffers(device, mCommandPool, 1, &mCommandBuffer);
	if (mCommandPool != VK_NULL_HANDLE)
		vkDestroyCommandPool(device, mCommandPool, nullptr);
	if (mDescriptorPool != VK_NULL_HANDLE)
		vkDestroyDescriptorPool(device, mDescriptorPool, nullptr);
	if (mFence != VK_NULL_HANDLE)
		vkDestroyFence(device, mFence, nullptr);
}
// Create all Vulkan objects this queue needs: the device queue handle, a resettable command
// pool, a descriptor pool (sized for many small dispatches), one primary command buffer and a
// fence used to wait for completion. On failure outResult carries the error and false is returned;
// partially created objects are cleaned up by the destructor.
bool ComputeQueueVK::Initialize(uint32 inComputeQueueIndex, ComputeQueueResult &outResult)
{
	// Get the queue
	VkDevice device = mComputeSystem->GetDevice();
	vkGetDeviceQueue(device, inComputeQueueIndex, 0, &mQueue);

	// Create a command pool; RESET_COMMAND_BUFFER lets us reuse the single command buffer per batch
	VkCommandPoolCreateInfo pool_info = {};
	pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
	pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
	pool_info.queueFamilyIndex = inComputeQueueIndex;
	if (VKFailed(vkCreateCommandPool(device, &pool_info, nullptr, &mCommandPool), outResult))
		return false;

	// Create descriptor pool; one descriptor set is allocated per Dispatch, the pool is reset in Wait
	VkDescriptorPoolSize descriptor_pool_sizes[] = {
		{ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1024 },
		{ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 16 * 1024 },
	};
	VkDescriptorPoolCreateInfo descriptor_info = {};
	descriptor_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
	descriptor_info.poolSizeCount = (uint32)std::size(descriptor_pool_sizes);
	descriptor_info.pPoolSizes = descriptor_pool_sizes;
	descriptor_info.maxSets = 256;
	if (VKFailed(vkCreateDescriptorPool(device, &descriptor_info, nullptr, &mDescriptorPool), outResult))
		return false;

	// Create a command buffer
	VkCommandBufferAllocateInfo alloc_info = {};
	alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
	alloc_info.commandPool = mCommandPool;
	alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
	alloc_info.commandBufferCount = 1;
	if (VKFailed(vkAllocateCommandBuffers(device, &alloc_info, &mCommandBuffer), outResult))
		return false;

	// Create a fence (unsignaled; reset + waited on around every Execute)
	VkFenceCreateInfo fence_info = {};
	fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
	if (VKFailed(vkCreateFence(device, &fence_info, nullptr, &mFence), outResult))
		return false;

	return true;
}
// Start recording into the command buffer if we are not already recording.
// Returns false when vkBeginCommandBuffer fails; called lazily by every command-recording function.
bool ComputeQueueVK::BeginCommandBuffer()
{
	if (!mCommandBufferRecording)
	{
		VkCommandBufferBeginInfo begin_info = {};
		begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
		begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
		if (VKFailed(vkBeginCommandBuffer(mCommandBuffer, &begin_info)))
			return false;
		mCommandBufferRecording = true;
	}

	return true;
}
// Select the shader for subsequent Set*Buffer / Dispatch calls and take a working copy of its
// default buffer infos (subsequent SetBuffer calls overwrite entries by binding index).
void ComputeQueueVK::SetShader(const ComputeShader *inShader)
{
	mShader = static_cast<const ComputeShaderVK *>(inShader);
	mBufferInfos = mShader->GetBufferInfos();
}
// Bind a constant (uniform) buffer to the named shader parameter. Constant buffers live in
// host-visible memory, so the CPU-side buffer is bound directly (no GPU copy).
void ComputeQueueVK::SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
	if (inBuffer == nullptr)
		return;
	JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::ConstantBuffer);

	if (!BeginCommandBuffer())
		return;

	// Make pending host writes visible to the shader
	const ComputeBufferVK *buffer = static_cast<const ComputeBufferVK *>(inBuffer);
	buffer->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_UNIFORM_READ_BIT, false);

	// Patch the buffer handle into the slot reserved for this parameter
	uint index = mShader->NameToBufferInfoIndex(inName);
	JPH_ASSERT(mShader->GetLayoutBindings()[index].descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
	mBufferInfos[index].buffer = buffer->GetBufferCPU();

	// Keep the buffer alive until execution finishes
	mUsedBuffers.insert(buffer);
}
// Record a CPU -> GPU upload for inBuffer if its staging data is stale, and for immutable
// Buffer / RWBuffer types release the staging buffer afterwards (freed once execution completes).
void ComputeQueueVK::SyncCPUToGPU(const ComputeBufferVK *inBuffer)
{
	// Ensure that any CPU writes are visible to the GPU
	if (inBuffer->SyncCPUToGPU(mCommandBuffer)
		&& (inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer))
	{
		// After the first upload, the CPU buffer is no longer needed for Buffer and RWBuffer types.
		// It can't be freed yet (the copy command references it), so freeing is deferred to Wait().
		mDelayedFreedBuffers.push_back(inBuffer->ReleaseBufferCPU());
	}
}
// Bind a read-only storage buffer to the named shader parameter. Uploads pending CPU data
// and transitions the GPU buffer to shader-read before binding.
void ComputeQueueVK::SetBuffer(const char *inName, const ComputeBuffer *inBuffer)
{
	if (inBuffer == nullptr)
		return;
	JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::UploadBuffer || inBuffer->GetType() == ComputeBuffer::EType::Buffer || inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);

	if (!BeginCommandBuffer())
		return;

	const ComputeBufferVK *buffer = static_cast<const ComputeBufferVK *>(inBuffer);
	SyncCPUToGPU(buffer);
	buffer->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT, false);

	// Patch the GPU buffer handle into the slot reserved for this parameter
	uint index = mShader->NameToBufferInfoIndex(inName);
	JPH_ASSERT(mShader->GetLayoutBindings()[index].descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	mBufferInfos[index].buffer = buffer->GetBufferGPU();

	// Keep the buffer alive until execution finishes
	mUsedBuffers.insert(buffer);
}
// Bind a read/write storage buffer to the named shader parameter. When inBarrier is Yes the
// barrier is forced even if the buffer is already in read|write state, ordering this dispatch
// after a previous dispatch that wrote the same buffer.
void ComputeQueueVK::SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier)
{
	if (inBuffer == nullptr)
		return;
	JPH_ASSERT(inBuffer->GetType() == ComputeBuffer::EType::RWBuffer);

	if (!BeginCommandBuffer())
		return;

	const ComputeBufferVK *buffer = static_cast<const ComputeBufferVK *>(inBuffer);
	SyncCPUToGPU(buffer);
	buffer->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VkAccessFlagBits(VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT), inBarrier == EBarrier::Yes);

	// Patch the GPU buffer handle into the slot reserved for this parameter
	uint index = mShader->NameToBufferInfoIndex(inName);
	JPH_ASSERT(mShader->GetLayoutBindings()[index].descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	mBufferInfos[index].buffer = buffer->GetBufferGPU();

	// Keep the buffer alive until execution finishes
	mUsedBuffers.insert(buffer);
}
// Record a GPU -> CPU copy of the full contents of inSrc into readback buffer inDst, including
// the barriers needed so that the copy sees the shader's writes and the host sees the copy.
// The data is available to Map once Execute + Wait have completed.
// Fixes vs. previous revision: asserts that the destination can hold the full source contents
// (the copy always transfers the whole source buffer), and makes the destination cast
// const-consistent with the declared pointer type.
void ComputeQueueVK::ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc)
{
	if (inDst == nullptr || inSrc == nullptr)
		return;
	JPH_ASSERT(inDst->GetType() == ComputeBuffer::EType::ReadbackBuffer);

	if (!BeginCommandBuffer())
		return;

	const ComputeBufferVK *src_vk = static_cast<const ComputeBufferVK *>(inSrc);
	const ComputeBufferVK *dst_vk = static_cast<const ComputeBufferVK *>(inDst);

	// The copy below transfers the entire source buffer, so the destination must be large enough
	JPH_ASSERT(dst_vk->GetSize() * dst_vk->GetStride() >= src_vk->GetSize() * src_vk->GetStride());

	// Barrier to start reading from GPU buffer and writing to CPU buffer
	src_vk->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT, false);
	dst_vk->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, false);

	// Copy
	VkBufferCopy copy = {};
	copy.srcOffset = 0;
	copy.dstOffset = 0;
	copy.size = src_vk->GetSize() * src_vk->GetStride();
	vkCmdCopyBuffer(mCommandBuffer, src_vk->GetBufferGPU(), dst_vk->GetBufferCPU(), 1, &copy);

	// Barrier to indicate that CPU can read from the buffer
	dst_vk->Barrier(mCommandBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT, false);

	// Keep both buffers alive until execution finishes
	mUsedBuffers.insert(src_vk);
	mUsedBuffers.insert(dst_vk);
}
// Record a compute dispatch: bind the current shader's pipeline, allocate and fill a fresh
// descriptor set from the buffers that were bound via Set*Buffer, and issue vkCmdDispatch
// with the given number of thread groups.
void ComputeQueueVK::Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ)
{
	if (!BeginCommandBuffer())
		return;

	vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mShader->GetPipeline());

	VkDevice device = mComputeSystem->GetDevice();
	const Array<VkDescriptorSetLayoutBinding> &ds_bindings = mShader->GetLayoutBindings();
	if (!ds_bindings.empty())
	{
		// Create a descriptor set (pool is reset in Wait, so sets are allocated per dispatch)
		VkDescriptorSetAllocateInfo alloc_info = {};
		alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
		alloc_info.descriptorPool = mDescriptorPool;
		alloc_info.descriptorSetCount = 1;
		VkDescriptorSetLayout ds_layout = mShader->GetDescriptorSetLayout();
		alloc_info.pSetLayouts = &ds_layout;
		VkDescriptorSet descriptor_set;
		if (VKFailed(vkAllocateDescriptorSets(device, &alloc_info, &descriptor_set)))
			return;

		// Write the values to the descriptor set; mBufferInfos[i] parallels ds_bindings[i]
		// (slots never bound keep the shader's default / dummy buffer)
		Array<VkWriteDescriptorSet> writes;
		writes.reserve(ds_bindings.size());
		for (uint32 i = 0; i < (uint32)ds_bindings.size(); ++i)
		{
			VkWriteDescriptorSet w = {};
			w.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
			w.dstSet = descriptor_set;
			w.dstBinding = ds_bindings[i].binding;
			w.dstArrayElement = 0;
			w.descriptorCount = ds_bindings[i].descriptorCount;
			w.descriptorType = ds_bindings[i].descriptorType;
			w.pBufferInfo = &mBufferInfos[i];
			writes.push_back(w);
		}
		vkUpdateDescriptorSets(device, (uint32)writes.size(), writes.data(), 0, nullptr);

		// Bind the descriptor set
		vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mShader->GetPipelineLayout(), 0, 1, &descriptor_set, 0, nullptr);
	}

	vkCmdDispatch(mCommandBuffer, inThreadGroupsX, inThreadGroupsY, inThreadGroupsZ);
}
// End recording and submit the command buffer to the queue, signaling mFence on completion.
// No-op when nothing was recorded. Completion is observed through Wait().
void ComputeQueueVK::Execute()
{
	// End command buffer
	if (!mCommandBufferRecording)
		return;
	if (VKFailed(vkEndCommandBuffer(mCommandBuffer)))
		return;
	mCommandBufferRecording = false;

	// Reset fence so Wait blocks until this submission signals it
	VkDevice device = mComputeSystem->GetDevice();
	if (VKFailed(vkResetFences(device, 1, &mFence)))
		return;

	// Submit
	VkSubmitInfo submit = {};
	submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
	submit.commandBufferCount = 1;
	submit.pCommandBuffers = &mCommandBuffer;
	if (VKFailed(vkQueueSubmit(mQueue, 1, &submit, mFence)))
		return;

	// Clear the current shader
	mShader = nullptr;

	// Mark that we're executing
	mIsExecuting = true;
}
// Block until the submission from Execute has finished on the GPU, then recycle the command
// buffer / descriptor pool and release buffers that were only kept alive for this execution.
void ComputeQueueVK::Wait()
{
	if (!mIsExecuting)
		return;

	// Wait for the work to complete
	VkDevice device = mComputeSystem->GetDevice();
	if (VKFailed(vkWaitForFences(device, 1, &mFence, VK_TRUE, UINT64_MAX)))
		return;

	// Reset command buffer so it can be reused
	if (mCommandBuffer != VK_NULL_HANDLE)
		vkResetCommandBuffer(mCommandBuffer, 0);

	// Allow reusing the descriptors for next run
	vkResetDescriptorPool(device, mDescriptorPool, 0);

	// Buffers can be freed now (drops the references taken in Set*Buffer / ScheduleReadback)
	mUsedBuffers.clear();

	// Free delayed buffers (staging buffers released after their first upload)
	for (BufferVK &buffer : mDelayedFreedBuffers)
		mComputeSystem->FreeBuffer(buffer);
	mDelayedFreedBuffers.clear();

	mIsExecuting = false;
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,66 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeQueue.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeShaderVK.h>
#include <Jolt/Compute/VK/BufferVK.h>
#include <Jolt/Core/UnorderedMap.h>
#include <Jolt/Core/UnorderedSet.h>
JPH_NAMESPACE_BEGIN
class ComputeSystemVK;
class ComputeBufferVK;
/// A command queue for Vulkan for executing compute workloads on the GPU.
/// Usage pattern: SetShader, bind buffers, Dispatch (possibly repeated), Execute, Wait.
class JPH_EXPORT ComputeQueueVK final : public ComputeQueue
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor / Destructor
	explicit ComputeQueueVK(ComputeSystemVK *inComputeSystem) : mComputeSystem(inComputeSystem) { }
	virtual ~ComputeQueueVK() override;

	/// Initialize the queue: creates the command pool/buffer, descriptor pool and fence.
	/// Returns false (with outResult holding the error) when any Vulkan object creation fails.
	bool Initialize(uint32 inComputeQueueIndex, ComputeQueueResult &outResult);

	// See: ComputeQueue
	virtual void SetShader(const ComputeShader *inShader) override;
	virtual void SetConstantBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
	virtual void SetBuffer(const char *inName, const ComputeBuffer *inBuffer) override;
	virtual void SetRWBuffer(const char *inName, ComputeBuffer *inBuffer, EBarrier inBarrier = EBarrier::Yes) override;
	virtual void ScheduleReadback(ComputeBuffer *inDst, const ComputeBuffer *inSrc) override;
	virtual void Dispatch(uint inThreadGroupsX, uint inThreadGroupsY, uint inThreadGroupsZ) override;
	virtual void Execute() override;
	virtual void Wait() override;

private:
	// Lazily begin command buffer recording; returns false when vkBeginCommandBuffer fails
	bool BeginCommandBuffer();

	// Copy the CPU buffer to the GPU buffer if needed
	void SyncCPUToGPU(const ComputeBufferVK *inBuffer);

	ComputeSystemVK * mComputeSystem; ///< System that owns the device
	VkQueue mQueue = VK_NULL_HANDLE; ///< Device queue work is submitted to
	VkCommandPool mCommandPool = VK_NULL_HANDLE; ///< Pool backing mCommandBuffer
	VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE; ///< Pool for the per-dispatch descriptor sets, reset in Wait
	VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE; ///< Single primary command buffer, reused per batch
	bool mCommandBufferRecording = false; ///< If we are currently recording commands into the command buffer
	VkFence mFence = VK_NULL_HANDLE; ///< Signaled when the submitted work completes
	bool mIsExecuting = false; ///< If Execute has been called and we are waiting for it to finish
	RefConst<ComputeShaderVK> mShader; ///< Shader that has been activated
	Array<VkDescriptorBufferInfo> mBufferInfos; ///< List of parameters that will be sent to the current shader
	UnorderedSet<RefConst<ComputeBuffer>> mUsedBuffers; ///< Buffers that are in use by the current execution, these will be retained until execution is finished so that we don't free buffers that are in use
	Array<BufferVK> mDelayedFreedBuffers; ///< Hardware buffers that need to be freed after execution is done
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,232 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeShaderVK.h>
JPH_NAMESPACE_BEGIN
// Destructor: destroy all Vulkan objects created in Initialize.
// Each handle is checked against VK_NULL_HANDLE so a partially initialized shader cleans up safely.
ComputeShaderVK::~ComputeShaderVK()
{
	if (mShaderModule != VK_NULL_HANDLE)
		vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
	if (mDescriptorSetLayout != VK_NULL_HANDLE)
		vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
	if (mPipelineLayout != VK_NULL_HANDLE)
		vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
	if (mPipeline != VK_NULL_HANDLE)
		vkDestroyPipeline(mDevice, mPipeline, nullptr);
}
// Build the shader from SPIR-V byte code: run a minimal SPIR-V parser to recover buffer names,
// binding indices and descriptor types, then create the descriptor set layout, pipeline layout,
// shader module and compute pipeline. inDummyBuffer pre-fills all descriptor buffer infos so that
// never-bound slots are still valid. Returns false (with outResult holding the error) on failure.
bool ComputeShaderVK::Initialize(const Array<uint8> &inSPVCode, VkBuffer inDummyBuffer, ComputeShaderResult &outResult)
{
	// SPIR-V is a stream of 32-bit words
	const uint32 *spv_words = reinterpret_cast<const uint32 *>(inSPVCode.data());
	size_t spv_word_count = inSPVCode.size() / sizeof(uint32);

	// Minimal SPIR-V parser to extract name to binding info
	UnorderedMap<uint32, String> id_to_name;
	UnorderedMap<uint32, uint32> id_to_binding;
	UnorderedMap<uint32, VkDescriptorType> id_to_descriptor_type;
	UnorderedMap<uint32, uint32> pointer_to_pointee;
	UnorderedMap<uint32, uint32> var_to_ptr_type;
	size_t i = 5; // Skip 5 word header
	while (i < spv_word_count)
	{
		// Parse next instruction: low 16 bits = opcode, high 16 bits = word count (incl. this word)
		uint32 word = spv_words[i];
		uint16 opcode = uint16(word & 0xffff);
		uint16 word_count = uint16(word >> 16);
		if (word_count == 0 || i + word_count > spv_word_count)
			break; // Malformed instruction, stop parsing

		switch (opcode)
		{
		case 5: // OpName
			if (word_count >= 2)
			{
				// Associate the (null-terminated, word-aligned) name string with its id
				uint32 target_id = spv_words[i + 1];
				const char* name = reinterpret_cast<const char*>(&spv_words[i + 2]);
				if (*name != 0)
					id_to_name.insert({ target_id, name });
			}
			break;

		case 16: // OpExecutionMode
			if (word_count >= 6)
			{
				uint32 execution_mode = spv_words[i + 2];
				if (execution_mode == 17) // LocalSize
				{
					// Assert that the group size provided matches the one in the shader
					JPH_ASSERT(GetGroupSizeX() == spv_words[i + 3], "Group size X mismatch");
					JPH_ASSERT(GetGroupSizeY() == spv_words[i + 4], "Group size Y mismatch");
					JPH_ASSERT(GetGroupSizeZ() == spv_words[i + 5], "Group size Z mismatch");
				}
			}
			break;

		case 32: // OpTypePointer
			if (word_count >= 4)
			{
				// Remember which type each pointer type points to
				uint32 result_id = spv_words[i + 1];
				uint32 type_id = spv_words[i + 3];
				pointer_to_pointee.insert({ result_id, type_id });
			}
			break;

		case 59: // OpVariable
			if (word_count >= 3)
			{
				// Remember each variable's pointer type
				uint32 ptr_type_id = spv_words[i + 1];
				uint32 result_id = spv_words[i + 2];
				var_to_ptr_type.insert({ result_id, ptr_type_id });
			}
			break;

		case 71: // OpDecorate
			if (word_count >= 3)
			{
				uint32 target_id = spv_words[i + 1];
				uint32 decoration = spv_words[i + 2];
				if (decoration == 2) // Block
				{
					// NOTE(review): in SPIR-V 1.3+ storage buffers can also carry the Block
					// decoration (with StorageBuffer storage class); this classifies by
					// decoration alone — confirm the generated shaders use BufferBlock for SSBOs
					id_to_descriptor_type.insert({ target_id, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER });
				}
				else if (decoration == 3) // BufferBlock
				{
					id_to_descriptor_type.insert({ target_id, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER });
				}
				else if (decoration == 33 && word_count >= 4) // Binding
				{
					uint32 binding = spv_words[i + 3];
					id_to_binding.insert({ target_id, binding });
				}
			}
			break;

		default:
			break;
		}

		// Advance to the next instruction
		i += word_count;
	}

	// Build name to binding map: for each bound variable, follow
	// variable -> pointer type -> pointee type to find its descriptor type
	UnorderedMap<String, std::pair<uint32, VkDescriptorType>> name_to_binding;
	for (const UnorderedMap<uint32, uint32>::value_type &entry : id_to_binding)
	{
		uint32 target_id = entry.first;
		uint32 binding = entry.second;

		// Get the name of the variable
		UnorderedMap<uint32, String>::const_iterator it_name = id_to_name.find(target_id);
		if (it_name != id_to_name.end())
		{
			// Find variable that links to the target
			UnorderedMap<uint32, uint32>::const_iterator it_var_ptr = var_to_ptr_type.find(target_id);
			if (it_var_ptr != var_to_ptr_type.end())
			{
				// Find type pointed at
				uint32 ptr_type = it_var_ptr->second;
				UnorderedMap<uint32, uint32>::const_iterator it_pointee = pointer_to_pointee.find(ptr_type);
				if (it_pointee != pointer_to_pointee.end())
				{
					uint32 pointee_type = it_pointee->second;

					// Find descriptor type; default to storage buffer when the type carries no Block/BufferBlock decoration
					UnorderedMap<uint32, VkDescriptorType>::iterator it_descriptor_type = id_to_descriptor_type.find(pointee_type);
					VkDescriptorType descriptor_type = it_descriptor_type != id_to_descriptor_type.end() ? it_descriptor_type->second : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
					name_to_binding.insert({ it_name->second, { binding, descriptor_type } });
					continue;
				}
			}
		}
	}

	// Create layout bindings and buffer infos (parallel arrays: index i in mLayoutBindings
	// corresponds to index i in mBufferInfos / mBindingNames)
	if (!name_to_binding.empty())
	{
		mLayoutBindings.reserve(name_to_binding.size());
		mBufferInfos.reserve(name_to_binding.size());
		mBindingNames.reserve(name_to_binding.size());
		for (const UnorderedMap<String, std::pair<uint32, VkDescriptorType>>::value_type &b : name_to_binding)
		{
			const String &name = b.first;
			uint binding = b.second.first;
			VkDescriptorType descriptor_type = b.second.second;

			VkDescriptorSetLayoutBinding l = {};
			l.binding = binding;
			l.descriptorCount = 1;
			l.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
			l.descriptorType = descriptor_type;
			mLayoutBindings.push_back(l);

			mBindingNames.push_back(name); // Add all strings to a pool to keep them alive
			mNameToBufferInfoIndex[string_view(mBindingNames.back())] = (uint32)mBufferInfos.size();

			VkDescriptorBufferInfo bi = {};
			bi.offset = 0;
			bi.range = VK_WHOLE_SIZE;
			bi.buffer = inDummyBuffer; // Avoid: The Vulkan spec states: If the nullDescriptor feature is not enabled, buffer must not be VK_NULL_HANDLE
			mBufferInfos.push_back(bi);
		}

		// Create descriptor set layout
		VkDescriptorSetLayoutCreateInfo layout_info = {};
		layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
		layout_info.bindingCount = (uint32)mLayoutBindings.size();
		layout_info.pBindings = mLayoutBindings.data();
		if (VKFailed(vkCreateDescriptorSetLayout(mDevice, &layout_info, nullptr, &mDescriptorSetLayout), outResult))
			return false;
	}

	// Create pipeline layout (no descriptor sets at all when the shader binds no buffers)
	VkPipelineLayoutCreateInfo pl_info = {};
	pl_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
	pl_info.setLayoutCount = mDescriptorSetLayout != VK_NULL_HANDLE ? 1 : 0;
	pl_info.pSetLayouts = mDescriptorSetLayout != VK_NULL_HANDLE ? &mDescriptorSetLayout : nullptr;
	if (VKFailed(vkCreatePipelineLayout(mDevice, &pl_info, nullptr, &mPipelineLayout), outResult))
		return false;

	// Create shader module
	VkShaderModuleCreateInfo create_info = {};
	create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
	create_info.codeSize = inSPVCode.size();
	create_info.pCode = spv_words;
	if (VKFailed(vkCreateShaderModule(mDevice, &create_info, nullptr, &mShaderModule), outResult))
		return false;

	// Create compute pipeline with "main" as the entry point
	VkComputePipelineCreateInfo pipe_info = {};
	pipe_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
	pipe_info.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
	pipe_info.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
	pipe_info.stage.module = mShaderModule;
	pipe_info.stage.pName = "main";
	pipe_info.layout = mPipelineLayout;
	if (VKFailed(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipe_info, nullptr, &mPipeline), outResult))
		return false;

	return true;
}
/// Map a shader binding name to the index of its entry in mBufferInfos.
/// @param inName Binding name as it appeared in the shader's descriptor bindings.
/// @return Index into GetBufferInfos(); asserts (and returns an arbitrary entry in release) if the name is unknown.
uint32 ComputeShaderVK::NameToBufferInfoIndex(const char *inName) const
{
	// The map is keyed on string_view so this lookup does not allocate a temporary String.
	// Use auto for the iterator: the previous explicit type named the mapped type 'uint'
	// while the member is declared UnorderedMap<string_view, uint32>; auto keeps the two in sync.
	auto it = mNameToBufferInfoIndex.find(inName);
	JPH_ASSERT(it != mNameToBufferInfoIndex.end());
	return it->second;
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,53 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeShader.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/IncludeVK.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_NAMESPACE_BEGIN
/// Compute shader handle for Vulkan
class JPH_EXPORT ComputeShaderVK : public ComputeShader
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor / destructor
	ComputeShaderVK(VkDevice inDevice, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) : ComputeShader(inGroupSizeX, inGroupSizeY, inGroupSizeZ), mDevice(inDevice) { }
	virtual ~ComputeShaderVK() override;

	/// Initialize from SPIR-V code
	bool Initialize(const Array<uint8> &inSPVCode, VkBuffer inDummyBuffer, ComputeShaderResult &outResult);

	/// Get index of parameter in buffer infos
	uint32 NameToBufferInfoIndex(const char *inName) const;

	/// Getters
	VkPipeline GetPipeline() const { return mPipeline; }
	VkPipelineLayout GetPipelineLayout() const { return mPipelineLayout; }
	VkDescriptorSetLayout GetDescriptorSetLayout() const { return mDescriptorSetLayout; }
	const Array<VkDescriptorSetLayoutBinding> &GetLayoutBindings() const { return mLayoutBindings; }
	const Array<VkDescriptorBufferInfo> &GetBufferInfos() const { return mBufferInfos; }

private:
	VkDevice mDevice; ///< Device this shader was created on (not owned, must outlive this object)
	VkShaderModule mShaderModule = VK_NULL_HANDLE; ///< Shader module created from the SPIR-V code in Initialize
	VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE; ///< Layout referencing mDescriptorSetLayout (or no sets when the shader has no bindings)
	VkPipeline mPipeline = VK_NULL_HANDLE; ///< The compute pipeline created in Initialize
	VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE; ///< Descriptor set layout built from the shader's reflected bindings
	Array<String> mBindingNames; ///< A list of binding names, mNameToBufferInfoIndex points to these strings
	UnorderedMap<string_view, uint32> mNameToBufferInfoIndex; ///< Binding name to buffer index, using a string_view so we can do find() without an allocation
	Array<VkDescriptorSetLayoutBinding> mLayoutBindings; ///< One entry per shader binding, used when allocating/writing descriptor sets
	Array<VkDescriptorBufferInfo> mBufferInfos; ///< Buffer info per binding, initially pointing at a dummy buffer until real buffers are bound
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,118 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeSystemVK.h>
#include <Jolt/Compute/VK/ComputeShaderVK.h>
#include <Jolt/Compute/VK/ComputeBufferVK.h>
#include <Jolt/Compute/VK/ComputeQueueVK.h>
JPH_NAMESPACE_BEGIN
// RTTI registration: ComputeSystemVK is an abstract subclass of ComputeSystem
JPH_IMPLEMENT_RTTI_ABSTRACT(ComputeSystemVK)
{
	JPH_ADD_BASE_CLASS(ComputeSystemVK, ComputeSystem)
}
// Adopt an already created Vulkan device and prepare the compute system for use.
// inPhysicalDevice / inDevice are not owned by this class; the caller keeps them alive.
// Returns false (with outResult holding an error) if the memory subsystem or dummy buffer could not be created.
bool ComputeSystemVK::Initialize(VkPhysicalDevice inPhysicalDevice, VkDevice inDevice, uint32 inComputeQueueIndex, ComputeSystemResult &outResult)
{
	mPhysicalDevice = inPhysicalDevice;
	mDevice = inDevice;
	mComputeQueueIndex = inComputeQueueIndex;

	// Get function to set a debug name (optional extension entry point; stays nullptr when unavailable)
	mVkSetDebugUtilsObjectNameEXT = reinterpret_cast<PFN_vkSetDebugUtilsObjectNameEXT>(reinterpret_cast<void *>(vkGetDeviceProcAddr(mDevice, "vkSetDebugUtilsObjectNameEXT")));

	// Let the (possibly overridden) memory subsystem set itself up first; buffers below depend on it
	if (!InitializeMemory())
	{
		outResult.SetError("Failed to initialize memory subsystem");
		return false;
	}

	// Create the dummy buffer. This is used to bind to shaders for which we have no buffer. We can't rely on VK_EXT_robustness2 being available to set nullDescriptor = VK_TRUE (it is unavailable on macOS).
	if (!CreateBuffer(1024, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, mDummyBuffer))
	{
		outResult.SetError("Failed to create dummy buffer");
		return false;
	}
	return true;
}
/// Tear down the compute system: waits for the GPU to go idle, releases the dummy
/// buffer and shuts down the memory subsystem. Safe to call when the device was
/// never initialized (mDevice == VK_NULL_HANDLE); the wait is simply skipped.
void ComputeSystemVK::Shutdown()
{
	// Ensure no queue is still referencing our resources before we free them
	const bool have_device = mDevice != VK_NULL_HANDLE;
	if (have_device)
		vkDeviceWaitIdle(mDevice);

	// Release the dummy buffer that was bound in place of missing shader buffers
	FreeBuffer(mDummyBuffer);

	// Let the (possibly overridden) memory subsystem clean up
	ShutdownMemory();
}
// Load "<inName>.spv" through the registered shader loader and build a ComputeShaderVK from it.
// On any failure the returned result carries an error message instead of a shader.
ComputeShaderResult ComputeSystemVK::CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ)
{
	ComputeShaderResult result;

	// Read shader source file (mShaderLoader is supplied by the application)
	Array<uint8> data;
	String file_name = String(inName) + ".spv";
	String error;
	if (!mShaderLoader(file_name.c_str(), data, error))
	{
		result.SetError(error);
		return result;
	}

	// Compile the SPIR-V into a pipeline; the dummy buffer is pre-bound to every descriptor slot
	Ref<ComputeShaderVK> shader = new ComputeShaderVK(mDevice, inGroupSizeX, inGroupSizeY, inGroupSizeZ);
	if (!shader->Initialize(data, mDummyBuffer.mBuffer, result))
		return result; // result already contains the error set by Initialize

	// Name the pipeline so we can easily find it in a profile (only when the debug utils extension is present)
	if (mVkSetDebugUtilsObjectNameEXT != nullptr)
	{
		VkDebugUtilsObjectNameInfoEXT info = {};
		info.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT;
		info.pNext = nullptr;
		info.objectType = VK_OBJECT_TYPE_PIPELINE;
		info.objectHandle = (uint64)shader->GetPipeline();
		info.pObjectName = inName;
		mVkSetDebugUtilsObjectNameEXT(mDevice, &info);
	}

	result.Set(shader.GetPtr());
	return result;
}
/// Create a GPU buffer of the requested type/size/stride, optionally filled with inData.
/// Returns a result holding either the buffer or an error message.
ComputeBufferResult ComputeSystemVK::CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData)
{
	ComputeBufferResult result;
	Ref<ComputeBufferVK> new_buffer = new ComputeBufferVK(this, inType, inSize, inStride);
	if (new_buffer->Initialize(inData))
		result.Set(new_buffer.GetPtr());
	else
		result.SetError("Failed to create compute buffer");
	return result;
}
/// Create a queue on the compute queue family that was selected at Initialize time.
/// On failure the result contains the error set by ComputeQueueVK::Initialize.
ComputeQueueResult ComputeSystemVK::CreateComputeQueue()
{
	ComputeQueueResult result;
	Ref<ComputeQueueVK> queue = new ComputeQueueVK(this);
	if (queue->Initialize(mComputeQueueIndex, result))
		result.Set(queue.GetPtr());
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,57 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Compute/ComputeSystem.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeQueueVK.h>
JPH_NAMESPACE_BEGIN
/// Interface to run a workload on the GPU using Vulkan.
/// Minimal implementation that can integrate with your own Vulkan setup.
class JPH_EXPORT ComputeSystemVK : public ComputeSystem
{
public:
	JPH_DECLARE_RTTI_ABSTRACT(JPH_EXPORT, ComputeSystemVK)

	// Initialize / shutdown the compute system
	bool Initialize(VkPhysicalDevice inPhysicalDevice, VkDevice inDevice, uint32 inComputeQueueIndex, ComputeSystemResult &outResult);
	void Shutdown();

	// See: ComputeSystem
	virtual ComputeShaderResult CreateComputeShader(const char *inName, uint32 inGroupSizeX, uint32 inGroupSizeY, uint32 inGroupSizeZ) override;
	virtual ComputeBufferResult CreateComputeBuffer(ComputeBuffer::EType inType, uint64 inSize, uint inStride, const void *inData = nullptr) override;
	virtual ComputeQueueResult CreateComputeQueue() override;

	/// Access to the Vulkan device
	VkDevice GetDevice() const { return mDevice; }

	/// Allow the application to override buffer creation and memory mapping in case it uses its own allocator
	virtual bool CreateBuffer(VkDeviceSize inSize, VkBufferUsageFlags inUsage, VkMemoryPropertyFlags inProperties, BufferVK &outBuffer) = 0;
	virtual void FreeBuffer(BufferVK &ioBuffer) = 0;
	virtual void * MapBuffer(BufferVK &ioBuffer) = 0;
	virtual void UnmapBuffer(BufferVK &ioBuffer) = 0;

protected:
	/// Initialize / shutdown the memory subsystem
	virtual bool InitializeMemory() = 0;
	virtual void ShutdownMemory() = 0;

	VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE; ///< Physical device, set in Initialize (not owned)
	VkDevice mDevice = VK_NULL_HANDLE; ///< Logical device, set in Initialize (not owned by this class)
	uint32 mComputeQueueIndex = 0; ///< Queue family index used by CreateComputeQueue
	PFN_vkSetDebugUtilsObjectNameEXT mVkSetDebugUtilsObjectNameEXT = nullptr; ///< Optional debug-name entry point, nullptr if the extension is unavailable

private:
	// Buffer that can be bound when we have no buffer
	BufferVK mDummyBuffer;
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,330 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeSystemVKImpl.h>
#include <Jolt/Core/QuickSort.h>
JPH_NAMESPACE_BEGIN
// RTTI registration: ComputeSystemVKImpl is a concrete subclass of ComputeSystemVKWithAllocator
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemVKImpl)
{
	JPH_ADD_BASE_CLASS(ComputeSystemVKImpl, ComputeSystemVKWithAllocator)
}
#ifdef JPH_DEBUG

// Callback installed via VK_EXT_debug_utils in debug builds.
// Traces warnings/errors from the validation layers and asserts on errors so they are hard to miss.
// Always returns VK_FALSE: the triggering Vulkan call should not be aborted.
static VKAPI_ATTR VkBool32 VKAPI_CALL sVulkanDebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT inSeverity, [[maybe_unused]] VkDebugUtilsMessageTypeFlagsEXT inType, const VkDebugUtilsMessengerCallbackDataEXT *inCallbackData, [[maybe_unused]] void *inUserData)
{
	// Only forward warnings and errors; verbose/info messages are dropped
	if (inSeverity & (VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT))
		Trace("VK: %s", inCallbackData->pMessage);
	JPH_ASSERT((inSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) == 0);
	return VK_FALSE;
}

#endif // JPH_DEBUG
// Destructor: tears down in reverse order of creation. Unlike the base class, this
// implementation owns the VkDevice and VkInstance it created in Initialize, so it
// destroys them here (after the base class has waited for idle and freed its buffers).
ComputeSystemVKImpl::~ComputeSystemVKImpl()
{
	// Base-class shutdown first: waits for the device to go idle and frees buffers/memory
	ComputeSystemVK::Shutdown();

	if (mDevice != VK_NULL_HANDLE)
		vkDestroyDevice(mDevice, nullptr);

#ifdef JPH_DEBUG
	// The destroy function must be looked up dynamically since it belongs to VK_EXT_debug_utils
	PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT = (PFN_vkDestroyDebugUtilsMessengerEXT)(void *)vkGetInstanceProcAddr(mInstance, "vkDestroyDebugUtilsMessengerEXT");
	if (mInstance != VK_NULL_HANDLE && mDebugMessenger != VK_NULL_HANDLE && vkDestroyDebugUtilsMessengerEXT != nullptr)
		vkDestroyDebugUtilsMessengerEXT(mInstance, mDebugMessenger, nullptr);
#endif

	if (mInstance != VK_NULL_HANDLE)
		vkDestroyInstance(mInstance, nullptr);
}
// Full Vulkan bring-up: creates an instance (with validation/debug messenger in debug builds),
// scores and selects a physical device, creates a logical device with graphics/present/compute
// queues, then hands the device to ComputeSystemVK::Initialize.
// Returns false with an error in outResult at the first failing step.
bool ComputeSystemVKImpl::Initialize(ComputeSystemResult &outResult)
{
	// Required instance extensions
	Array<const char *> required_instance_extensions;
	required_instance_extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME);
	required_instance_extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
#ifdef JPH_PLATFORM_MACOS
	// MoltenVK requires the portability extensions
	required_instance_extensions.push_back("VK_KHR_portability_enumeration");
	required_instance_extensions.push_back("VK_KHR_get_physical_device_properties2");
#endif
	GetInstanceExtensions(required_instance_extensions);

	// Required device extensions
	Array<const char *> required_device_extensions;
	required_device_extensions.push_back(VK_EXT_SCALAR_BLOCK_LAYOUT_EXTENSION_NAME);
#ifdef JPH_PLATFORM_MACOS
	required_device_extensions.push_back("VK_KHR_portability_subset"); // VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME
#endif
	GetDeviceExtensions(required_device_extensions);

	// Query supported instance extensions
	uint32 instance_extension_count = 0;
	if (VKFailed(vkEnumerateInstanceExtensionProperties(nullptr, &instance_extension_count, nullptr), outResult))
		return false;
	Array<VkExtensionProperties> instance_extensions;
	instance_extensions.resize(instance_extension_count);
	if (VKFailed(vkEnumerateInstanceExtensionProperties(nullptr, &instance_extension_count, instance_extensions.data()), outResult))
		return false;

	// Query supported validation layers
	// NOTE(review): unlike the calls above, these two return values are not routed through VKFailed;
	// if the first call fails, validation_layer_count is left uninitialized — confirm whether this should be checked
	uint32 validation_layer_count;
	vkEnumerateInstanceLayerProperties(&validation_layer_count, nullptr);
	Array<VkLayerProperties> validation_layers(validation_layer_count);
	vkEnumerateInstanceLayerProperties(&validation_layer_count, validation_layers.data());

	VkApplicationInfo app_info = {};
	app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
	app_info.apiVersion = VK_API_VERSION_1_1;

	// Create Vulkan instance
	VkInstanceCreateInfo instance_create_info = {};
	instance_create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
#ifdef JPH_PLATFORM_MACOS
	instance_create_info.flags = VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
#endif
	instance_create_info.pApplicationInfo = &app_info;
#ifdef JPH_DEBUG
	// Enable validation layer if supported
	const char *desired_validation_layers[] = { "VK_LAYER_KHRONOS_validation" };
	for (const VkLayerProperties &p : validation_layers)
		if (strcmp(desired_validation_layers[0], p.layerName) == 0)
		{
			instance_create_info.enabledLayerCount = 1;
			instance_create_info.ppEnabledLayerNames = desired_validation_layers;
			break;
		}

	// Setup debug messenger callback if the extension is supported
	VkDebugUtilsMessengerCreateInfoEXT messenger_create_info = {};
	for (const VkExtensionProperties &ext : instance_extensions)
		if (strcmp(VK_EXT_DEBUG_UTILS_EXTENSION_NAME, ext.extensionName) == 0)
		{
			messenger_create_info.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
			messenger_create_info.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
			messenger_create_info.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
			messenger_create_info.pfnUserCallback = sVulkanDebugCallback;
			instance_create_info.pNext = &messenger_create_info;
			// NOTE(review): VK_EXT_DEBUG_UTILS_EXTENSION_NAME was already pushed unconditionally
			// at the top of this function, so this adds a duplicate entry to ppEnabledExtensionNames
			// (validation layers may warn about this) — confirm which of the two pushes is intended
			required_instance_extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
			break;
		}
#endif
	instance_create_info.enabledExtensionCount = (uint32)required_instance_extensions.size();
	instance_create_info.ppEnabledExtensionNames = required_instance_extensions.data();
	if (VKFailed(vkCreateInstance(&instance_create_info, nullptr, &mInstance), outResult))
		return false;

#ifdef JPH_DEBUG
	// Finalize debug messenger callback
	PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT = (PFN_vkCreateDebugUtilsMessengerEXT)(std::uintptr_t)vkGetInstanceProcAddr(mInstance, "vkCreateDebugUtilsMessengerEXT");
	if (vkCreateDebugUtilsMessengerEXT != nullptr)
		if (VKFailed(vkCreateDebugUtilsMessengerEXT(mInstance, &messenger_create_info, nullptr, &mDebugMessenger), outResult))
			return false;
#endif

	// Notify that instance has been created (hook for derived classes, e.g. to create a surface)
	OnInstanceCreated();

	// Select device
	uint32 device_count = 0;
	if (VKFailed(vkEnumeratePhysicalDevices(mInstance, &device_count, nullptr), outResult))
		return false;
	Array<VkPhysicalDevice> devices;
	devices.resize(device_count);
	if (VKFailed(vkEnumeratePhysicalDevices(mInstance, &device_count, devices.data()), outResult))
		return false;

	// Candidate device description used for scoring below
	struct Device
	{
		VkPhysicalDevice mPhysicalDevice;
		String mName;
		VkSurfaceFormatKHR mFormat;
		uint32 mGraphicsQueueIndex;
		uint32 mPresentQueueIndex;
		uint32 mComputeQueueIndex;
		int mScore;
	};
	Array<Device> available_devices;
	for (VkPhysicalDevice device : devices)
	{
		// Get device properties
		VkPhysicalDeviceProperties properties;
		vkGetPhysicalDeviceProperties(device, &properties);

		// Test if it is an appropriate type (discrete GPU preferred, then integrated, virtual, CPU)
		int score = 0;
		switch (properties.deviceType)
		{
		case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU:
			score = 30;
			break;
		case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU:
			score = 20;
			break;
		case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU:
			score = 10;
			break;
		case VK_PHYSICAL_DEVICE_TYPE_CPU:
			score = 5;
			break;
		case VK_PHYSICAL_DEVICE_TYPE_OTHER:
		case VK_PHYSICAL_DEVICE_TYPE_MAX_ENUM:
			continue;
		}

		// Check if the device supports all our required extensions
		uint32 device_extension_count;
		vkEnumerateDeviceExtensionProperties(device, nullptr, &device_extension_count, nullptr);
		Array<VkExtensionProperties> available_extensions;
		available_extensions.resize(device_extension_count);
		vkEnumerateDeviceExtensionProperties(device, nullptr, &device_extension_count, available_extensions.data());
		int found_extensions = 0;
		for (const char *required_device_extension : required_device_extensions)
			for (const VkExtensionProperties &ext : available_extensions)
				if (strcmp(required_device_extension, ext.extensionName) == 0)
				{
					found_extensions++;
					break;
				}
		if (found_extensions != int(required_device_extensions.size()))
			continue;

		// Find the right queues (~uint32(0) marks "not found")
		uint32 queue_family_count = 0;
		vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count, nullptr);
		Array<VkQueueFamilyProperties> queue_families;
		queue_families.resize(queue_family_count);
		vkGetPhysicalDeviceQueueFamilyProperties(device, &queue_family_count, queue_families.data());
		uint32 graphics_queue = ~uint32(0);
		uint32 present_queue = ~uint32(0);
		uint32 compute_queue = ~uint32(0);
		for (uint32 i = 0; i < uint32(queue_families.size()); ++i)
		{
			if (queue_families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)
			{
				graphics_queue = i;
				// Prefer a compute queue in the same family as the graphics queue
				if (queue_families[i].queueFlags & VK_QUEUE_COMPUTE_BIT)
					compute_queue = i;
			}
			if (HasPresentSupport(device, i))
				present_queue = i;
			if (graphics_queue != ~uint32(0) && present_queue != ~uint32(0) && compute_queue != ~uint32(0))
				break;
		}
		if (graphics_queue == ~uint32(0) || present_queue == ~uint32(0) || compute_queue == ~uint32(0))
			continue;

		// Select surface format
		VkSurfaceFormatKHR selected_format = SelectFormat(device);
		if (selected_format.format == VK_FORMAT_UNDEFINED)
			continue;

		// Add the device
		available_devices.push_back({ device, properties.deviceName, selected_format, graphics_queue, present_queue, compute_queue, score });
	}
	if (available_devices.empty())
	{
		outResult.SetError("No suitable Vulkan device found");
		return false;
	}

	// Sort the devices by score (highest first) and pick the best one
	QuickSort(available_devices.begin(), available_devices.end(), [](const Device &inLHS, const Device &inRHS) {
		return inLHS.mScore > inRHS.mScore;
	});
	const Device &selected_device = available_devices[0];

	// Create device: up to 3 queue create infos (graphics / present / compute), deduplicated below
	float queue_priority = 1.0f;
	VkDeviceQueueCreateInfo queue_create_info[3] = {};
	for (VkDeviceQueueCreateInfo &q : queue_create_info)
	{
		q.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
		q.queueCount = 1;
		q.pQueuePriorities = &queue_priority;
	}
	uint32 num_queues = 0;
	queue_create_info[num_queues++].queueFamilyIndex = selected_device.mGraphicsQueueIndex;
	// Only add the present queue family if it differs from what we already have
	bool found = false;
	for (uint32 i = 0; i < num_queues; ++i)
		if (queue_create_info[i].queueFamilyIndex == selected_device.mPresentQueueIndex)
		{
			found = true;
			break;
		}
	if (!found)
		queue_create_info[num_queues++].queueFamilyIndex = selected_device.mPresentQueueIndex;
	// Same for the compute queue family
	found = false;
	for (uint32 i = 0; i < num_queues; ++i)
		if (queue_create_info[i].queueFamilyIndex == selected_device.mComputeQueueIndex)
		{
			found = true;
			break;
		}
	if (!found)
		queue_create_info[num_queues++].queueFamilyIndex = selected_device.mComputeQueueIndex;

	// Enable scalar block layout (needed for the compute shader buffer layouts) and chain in
	// any features the derived class requests via GetEnabledFeatures
	VkPhysicalDeviceScalarBlockLayoutFeatures enable_scalar_block = {};
	enable_scalar_block.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES;
	enable_scalar_block.scalarBlockLayout = VK_TRUE;
	VkPhysicalDeviceFeatures2 enabled_features2 = {};
	enabled_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
	GetEnabledFeatures(enabled_features2);
	enable_scalar_block.pNext = enabled_features2.pNext;
	enabled_features2.pNext = &enable_scalar_block;

	VkDeviceCreateInfo device_create_info = {};
	device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
	device_create_info.queueCreateInfoCount = num_queues;
	device_create_info.pQueueCreateInfos = queue_create_info;
	device_create_info.enabledLayerCount = instance_create_info.enabledLayerCount;
	device_create_info.ppEnabledLayerNames = instance_create_info.ppEnabledLayerNames;
	device_create_info.enabledExtensionCount = uint32(required_device_extensions.size());
	device_create_info.ppEnabledExtensionNames = required_device_extensions.data();
	device_create_info.pNext = &enabled_features2;
	device_create_info.pEnabledFeatures = nullptr; // Features are supplied through pNext (VkPhysicalDeviceFeatures2)
	VkDevice device = VK_NULL_HANDLE;
	if (VKFailed(vkCreateDevice(selected_device.mPhysicalDevice, &device_create_info, nullptr, &device), outResult))
		return false;

	// Get the queues
	mGraphicsQueueIndex = selected_device.mGraphicsQueueIndex;
	mPresentQueueIndex = selected_device.mPresentQueueIndex;
	vkGetDeviceQueue(device, mGraphicsQueueIndex, 0, &mGraphicsQueue);
	vkGetDeviceQueue(device, mPresentQueueIndex, 0, &mPresentQueue);

	// Store selected format
	mSelectedFormat = selected_device.mFormat;

	// Initialize the compute system (base class takes over from here)
	return ComputeSystemVK::Initialize(selected_device.mPhysicalDevice, device, selected_device.mComputeQueueIndex, outResult);
}
/// Factory: creates and fully initializes a self-contained Vulkan compute system.
/// On failure the result carries the error produced during initialization.
ComputeSystemResult CreateComputeSystemVK()
{
	ComputeSystemResult result;
	Ref<ComputeSystemVKImpl> system = new ComputeSystemVKImpl;
	if (system->Initialize(result))
		result.Set(system.GetPtr());
	return result;
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,57 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeSystemVKWithAllocator.h>
JPH_NAMESPACE_BEGIN
/// Implementation of ComputeSystemVK that fully initializes Vulkan
class JPH_EXPORT ComputeSystemVKImpl : public ComputeSystemVKWithAllocator
{
public:
	JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemVKImpl)

	/// Destructor
	virtual ~ComputeSystemVKImpl() override;

	/// Initialize the compute system
	bool Initialize(ComputeSystemResult &outResult);

protected:
	/// Override to perform actions once the instance has been created
	virtual void OnInstanceCreated() { /* Do nothing */ }

	/// Override to add platform specific instance extensions
	virtual void GetInstanceExtensions(Array<const char *> &outExtensions) { /* Add nothing */ }

	/// Override to add platform specific device extensions
	virtual void GetDeviceExtensions(Array<const char *> &outExtensions) { /* Add nothing */ }

	/// Override to enable specific features
	virtual void GetEnabledFeatures(VkPhysicalDeviceFeatures2 &ioFeatures) { /* Add nothing */ }

	/// Override to check for present support on a given device and queue family
	virtual bool HasPresentSupport(VkPhysicalDevice inDevice, uint32 inQueueFamilyIndex) { return true; }

	/// Override to select the surface format
	virtual VkSurfaceFormatKHR SelectFormat(VkPhysicalDevice inDevice) { return { VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR }; }

	VkInstance mInstance = VK_NULL_HANDLE; ///< Vulkan instance, owned by this class (destroyed in the destructor)
#ifdef JPH_DEBUG
	VkDebugUtilsMessengerEXT mDebugMessenger = VK_NULL_HANDLE; ///< Validation-message callback, only created in debug builds
#endif
	uint32 mGraphicsQueueIndex = 0; ///< Queue family index of the graphics queue selected in Initialize
	uint32 mPresentQueueIndex = 0; ///< Queue family index of the present queue selected in Initialize
	VkQueue mGraphicsQueue = VK_NULL_HANDLE; ///< Graphics queue fetched from the created device
	VkQueue mPresentQueue = VK_NULL_HANDLE; ///< Present queue fetched from the created device
	VkSurfaceFormatKHR mSelectedFormat; ///< Surface format chosen via SelectFormat for the selected device
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,172 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeSystemVKWithAllocator.h>
#include <Jolt/Compute/VK/ComputeShaderVK.h>
#include <Jolt/Compute/VK/ComputeBufferVK.h>
#include <Jolt/Compute/VK/ComputeQueueVK.h>
JPH_NAMESPACE_BEGIN
// RTTI registration: ComputeSystemVKWithAllocator extends ComputeSystemVK with a block-based allocator
JPH_IMPLEMENT_RTTI_VIRTUAL(ComputeSystemVKWithAllocator)
{
	JPH_ADD_BASE_CLASS(ComputeSystemVKWithAllocator, ComputeSystemVK)
}
/// Set up the allocator: caches the physical device's memory heap/type table so that
/// FindMemoryType can do lookups without re-querying Vulkan. Always succeeds.
bool ComputeSystemVKWithAllocator::InitializeMemory()
{
	vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mMemoryProperties);
	return true;
}
/// Release all cached device memory. Large blocks were carved into sub blocks that share one
/// MemoryVK; only the sub block at offset 0 triggers the actual vkFreeMemory so each
/// underlying allocation is freed exactly once.
void ComputeSystemVKWithAllocator::ShutdownMemory()
{
	for (const MemoryCache::value_type &pool : mMemoryCache)
		for (const Memory &sub_block : pool.second)
		{
			const bool owns_allocation = sub_block.mOffset == 0;
			if (owns_allocation)
				FreeMemory(*sub_block.mMemory);
		}
	mMemoryCache.clear();
}
/// Find the index of a memory type that is both allowed by inTypeFilter (a bitmask from
/// VkMemoryRequirements::memoryTypeBits) and has all the property flags in inProperties.
/// Asserts when nothing matches and falls back to type 0 in release builds.
uint32 ComputeSystemVKWithAllocator::FindMemoryType(uint32 inTypeFilter, VkMemoryPropertyFlags inProperties) const
{
	for (uint32 type_index = 0; type_index < mMemoryProperties.memoryTypeCount; type_index++)
	{
		bool allowed_by_filter = (inTypeFilter & (1 << type_index)) != 0;
		bool has_all_properties = (mMemoryProperties.memoryTypes[type_index].propertyFlags & inProperties) == inProperties;
		if (allowed_by_filter && has_all_properties)
			return type_index;
	}

	JPH_ASSERT(false, "Failed to find memory type!");
	return 0;
}
// Allocate a fresh block of device memory of inSize bytes from a type matching
// inMemoryTypeBits/inProperties and store the handle plus bookkeeping in ioMemory.
// NOTE(review): the vkAllocateMemory result is not checked; on failure ioMemory.mMemory
// stays VK_NULL_HANDLE and later vkBindBufferMemory will fail — confirm whether this should propagate an error.
void ComputeSystemVKWithAllocator::AllocateMemory(VkDeviceSize inSize, uint32 inMemoryTypeBits, VkMemoryPropertyFlags inProperties, MemoryVK &ioMemory)
{
	JPH_ASSERT(ioMemory.mMemory == VK_NULL_HANDLE); // Must not already hold an allocation

	ioMemory.mSize = inSize;
	ioMemory.mProperties = inProperties;

	VkMemoryAllocateInfo alloc_info = {};
	alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
	alloc_info.allocationSize = inSize;
	alloc_info.memoryTypeIndex = FindMemoryType(inMemoryTypeBits, inProperties);
	vkAllocateMemory(mDevice, &alloc_info, nullptr, &ioMemory.mMemory);
}
// Return a block of device memory to Vulkan and reset the handle so the MemoryVK
// can be detected as freed (and could be reused by AllocateMemory).
void ComputeSystemVKWithAllocator::FreeMemory(MemoryVK &ioMemory)
{
	vkFreeMemory(mDevice, ioMemory.mMemory, nullptr);
	ioMemory.mMemory = VK_NULL_HANDLE;
}
// Create a VkBuffer and back it with memory. Allocations larger than cMaxAllocSize get a
// dedicated VkDeviceMemory; smaller ones are rounded up to a power of 2 and carved out of
// cBlockSize-sized pooled blocks (see mMemoryCache) to stay under Vulkan's allocation-count limits.
// Returns false if the VkBuffer itself could not be created.
bool ComputeSystemVKWithAllocator::CreateBuffer(VkDeviceSize inSize, VkBufferUsageFlags inUsage, VkMemoryPropertyFlags inProperties, BufferVK &outBuffer)
{
	// Create a new buffer
	outBuffer.mSize = inSize;
	VkBufferCreateInfo create_info = {};
	create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
	create_info.size = inSize;
	create_info.usage = inUsage;
	create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; // Only one queue family accesses the buffer at a time
	if (VKFailed(vkCreateBuffer(mDevice, &create_info, nullptr, &outBuffer.mBuffer)))
	{
		outBuffer.mBuffer = VK_NULL_HANDLE;
		return false;
	}

	// The driver may require more memory/alignment than the requested size
	VkMemoryRequirements mem_requirements;
	vkGetBufferMemoryRequirements(mDevice, outBuffer.mBuffer, &mem_requirements);
	if (mem_requirements.size > cMaxAllocSize)
	{
		// Allocate block directly (too big for the pooled allocator)
		Ref<MemoryVK> memory_vk = new MemoryVK();
		memory_vk->mBufferSize = mem_requirements.size; // > cMaxAllocSize, so FreeBuffer will free it directly
		AllocateMemory(mem_requirements.size, mem_requirements.memoryTypeBits, inProperties, *memory_vk);
		outBuffer.mMemory = memory_vk;
		outBuffer.mOffset = 0;
	}
	else
	{
		// Round allocation to the next power of 2 so that we can use a simple block based allocator
		// (safe to truncate to uint32 here: mem_requirements.size <= cMaxAllocSize)
		VkDeviceSize buffer_size = max(VkDeviceSize(GetNextPowerOf2(uint32(mem_requirements.size))), cMinAllocSize);

		// Ensure that we have memory available from the right pool
		Array<Memory> &mem_array = mMemoryCache[{ buffer_size, inProperties }];
		if (mem_array.empty())
		{
			// Allocate a bigger block and share its MemoryVK between all sub blocks
			Ref<MemoryVK> memory_vk = new MemoryVK();
			memory_vk->mBufferSize = buffer_size; // Sub block size, used by FreeBuffer to find the right pool
			AllocateMemory(cBlockSize, mem_requirements.memoryTypeBits, inProperties, *memory_vk);

			// Divide into sub blocks
			for (VkDeviceSize offset = 0; offset < cBlockSize; offset += buffer_size)
				mem_array.push_back({ memory_vk, offset });
		}

		// Claim memory from the pool
		Memory &memory = mem_array.back();
		outBuffer.mMemory = memory.mMemory;
		outBuffer.mOffset = memory.mOffset;
		mem_array.pop_back();
	}

	// Bind the memory to the buffer
	vkBindBufferMemory(mDevice, outBuffer.mBuffer, outBuffer.mMemory->mMemory, outBuffer.mOffset);
	return true;
}
// Destroy a buffer created by CreateBuffer and recycle its memory: dedicated (large)
// allocations are freed immediately, pooled sub blocks are returned to mMemoryCache for
// reuse. No-op when the buffer was never created / already freed.
void ComputeSystemVKWithAllocator::FreeBuffer(BufferVK &ioBuffer)
{
	if (ioBuffer.mBuffer != VK_NULL_HANDLE)
	{
		// Destroy the buffer
		vkDestroyBuffer(mDevice, ioBuffer.mBuffer, nullptr);
		ioBuffer.mBuffer = VK_NULL_HANDLE;

		// Hand the memory back to the cache (mBufferSize was set by CreateBuffer to the sub block
		// size for pooled memory, or to the full size for dedicated allocations)
		VkDeviceSize buffer_size = ioBuffer.mMemory->mBufferSize;
		if (buffer_size > cMaxAllocSize)
			FreeMemory(*ioBuffer.mMemory);
		else
			mMemoryCache[{ buffer_size, ioBuffer.mMemory->mProperties }].push_back({ ioBuffer.mMemory, ioBuffer.mOffset });

		ioBuffer = BufferVK(); // Reset to default so a double free is harmless
	}
}
// Map a buffer for CPU access. Mappings are reference counted per underlying memory block
// because Vulkan forbids mapping the same VkDeviceMemory twice: only the first caller
// actually maps, later callers reuse the cached pointer. Returns nullptr if mapping fails.
void *ComputeSystemVKWithAllocator::MapBuffer(BufferVK& ioBuffer)
{
	// Only call vkMapMemory when this is the first outstanding mapping (note the short circuit:
	// the map call is skipped entirely when the count was already > 0)
	if (++ioBuffer.mMemory->mMappedCount == 1
		&& VKFailed(vkMapMemory(mDevice, ioBuffer.mMemory->mMemory, 0, VK_WHOLE_SIZE, 0, &ioBuffer.mMemory->mMappedPtr)))
	{
		ioBuffer.mMemory->mMappedCount = 0; // Roll back the increment so a later map attempt starts fresh
		return nullptr;
	}

	// The whole block is mapped; offset to this buffer's sub block
	return static_cast<uint8 *>(ioBuffer.mMemory->mMappedPtr) + ioBuffer.mOffset;
}
/// Undo one MapBuffer call. The underlying memory block is only unmapped when the last
/// outstanding mapping on it is released (mappings are reference counted per block).
void ComputeSystemVKWithAllocator::UnmapBuffer(BufferVK& ioBuffer)
{
	MemoryVK &memory = *ioBuffer.mMemory;
	JPH_ASSERT(memory.mMappedCount > 0); // Unbalanced MapBuffer/UnmapBuffer
	if (--memory.mMappedCount == 0)
	{
		vkUnmapMemory(mDevice, memory.mMemory);
		memory.mMappedPtr = nullptr;
	}
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,70 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_VK
#include <Jolt/Compute/VK/ComputeSystemVK.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_NAMESPACE_BEGIN
/// This extends ComputeSystemVK to provide a default implementation for memory allocation and mapping.
/// It uses a simple block based allocator to reduce the number of allocations done to Vulkan.
class JPH_EXPORT ComputeSystemVKWithAllocator : public ComputeSystemVK
{
public:
JPH_DECLARE_RTTI_VIRTUAL(JPH_EXPORT, ComputeSystemVKWithAllocator)
/// Allow the application to override buffer creation and memory mapping in case it uses its own allocator
virtual bool CreateBuffer(VkDeviceSize inSize, VkBufferUsageFlags inUsage, VkMemoryPropertyFlags inProperties, BufferVK &outBuffer) override;
virtual void FreeBuffer(BufferVK &ioBuffer) override;
virtual void * MapBuffer(BufferVK &ioBuffer) override;
virtual void UnmapBuffer(BufferVK &ioBuffer) override;
protected:
virtual bool InitializeMemory() override;
virtual void ShutdownMemory() override;
uint32 FindMemoryType(uint32 inTypeFilter, VkMemoryPropertyFlags inProperties) const;
void AllocateMemory(VkDeviceSize inSize, uint32 inMemoryTypeBits, VkMemoryPropertyFlags inProperties, MemoryVK &ioMemory);
void FreeMemory(MemoryVK &ioMemory);
VkPhysicalDeviceMemoryProperties mMemoryProperties;
private:
// Smaller allocations (from cMinAllocSize to cMaxAllocSize) will be done in blocks of cBlockSize bytes.
// We do this because there is a limit to the number of allocations that we can make in Vulkan.
static constexpr VkDeviceSize cMinAllocSize = 512;
static constexpr VkDeviceSize cMaxAllocSize = 65536;
static constexpr VkDeviceSize cBlockSize = 524288;
struct MemoryKey
{
bool operator == (const MemoryKey &inRHS) const
{
return mSize == inRHS.mSize && mProperties == inRHS.mProperties;
}
VkDeviceSize mSize;
VkMemoryPropertyFlags mProperties;
};
JPH_MAKE_HASH_STRUCT(MemoryKey, MemoryKeyHasher, t.mProperties, t.mSize)
struct Memory
{
Ref<MemoryVK> mMemory;
VkDeviceSize mOffset;
};
using MemoryCache = UnorderedMap<MemoryKey, Array<Memory>, MemoryKeyHasher>;
MemoryCache mMemoryCache;
};
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,44 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/StringTools.h>
#ifdef JPH_USE_VK
JPH_SUPPRESS_WARNINGS_STD_BEGIN
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic")
#include <vulkan/vulkan.h>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
/// Check a Vulkan return code. Returns true (after tracing and asserting) when the call
/// failed, false when it returned VK_SUCCESS — intended for use as `if (VKFailed(...)) return;`.
inline bool VKFailed(VkResult inResult)
{
	if (inResult != VK_SUCCESS)
	{
		// Log the numeric error code and trip an assert so failures are noticed in debug builds
		Trace("Vulkan call failed with error code: %d", (int)inResult);
		JPH_ASSERT(false);
		return true;
	}
	return false;
}
/// Overload of VKFailed that additionally records a human readable error message in
/// outResult (any type with a SetError(String) member) so callers can propagate it.
template <class Result>
inline bool VKFailed(VkResult inResult, Result &outResult)
{
	if (inResult != VK_SUCCESS)
	{
		outResult.SetError(StringFormat("Vulkan call failed with error code: %d", (int)inResult));
		JPH_ASSERT(false);
		return true;
	}
	return false;
}
JPH_NAMESPACE_END
#endif // JPH_USE_VK

View File

@ -0,0 +1,112 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2023 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Construct a string that lists the most important configuration settings
inline const char *GetConfigurationString()
{
	// The fragments selected by the preprocessor below are adjacent string literals,
	// so the compiler concatenates them into a single static string at compile time.
	return JPH_IF_SINGLE_PRECISION_ELSE("Single", "Double") " precision "
	// CPU architecture
#if defined(JPH_CPU_X86)
		"x86 "
#elif defined(JPH_CPU_ARM)
		"ARM "
#elif defined(JPH_CPU_RISCV)
		"RISC-V "
#elif defined(JPH_CPU_PPC)
		"PowerPC "
	#ifdef JPH_CPU_BIG_ENDIAN
		"(Big Endian) "
	#else
		"(Little Endian) "
	#endif
#elif defined(JPH_CPU_LOONGARCH)
		"LoongArch "
#elif defined(JPH_CPU_E2K)
		"E2K "
#elif defined(JPH_CPU_WASM)
		"WASM "
#else
	#error Unknown CPU architecture
#endif
	// Pointer width
#if JPH_CPU_ARCH_BITS == 64
		"64-bit "
#elif JPH_CPU_ARCH_BITS == 32
		"32-bit "
#endif
	// Enabled instruction sets
		"with instructions: "
#ifdef JPH_USE_NEON
		"NEON "
#endif
#ifdef JPH_USE_SSE
		"SSE2 "
#endif
#ifdef JPH_USE_SSE4_1
		"SSE4.1 "
#endif
#ifdef JPH_USE_SSE4_2
		"SSE4.2 "
#endif
#ifdef JPH_USE_AVX
		"AVX "
#endif
#ifdef JPH_USE_AVX2
		"AVX2 "
#endif
#ifdef JPH_USE_AVX512
		"AVX512 "
#endif
#ifdef JPH_USE_F16C
		"F16C "
#endif
#ifdef JPH_USE_LZCNT
		"LZCNT "
#endif
#ifdef JPH_USE_TZCNT
		"TZCNT "
#endif
#ifdef JPH_USE_FMADD
		"FMADD "
#endif
	// Build configuration flags
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
		"(Cross Platform Deterministic) "
#endif
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
		"(FP Exceptions) "
#endif
#ifdef JPH_DEBUG_RENDERER
		"(Debug Renderer) "
#endif
#ifdef JPH_PROFILE_ENABLED
		"(Profile) "
#endif
#ifdef JPH_EXTERNAL_PROFILE
		"(External Profile) "
#endif
#if defined(JPH_OBJECT_LAYER_BITS) && JPH_OBJECT_LAYER_BITS == 32
		"(32-bit ObjectLayer) "
#else
		"(16-bit ObjectLayer) "
#endif
#ifdef JPH_ENABLE_ASSERTS
		"(Assertions) "
#endif
#ifdef JPH_OBJECT_STREAM
		"(ObjectStream) "
#endif
#ifdef JPH_DEBUG
		"(Debug) "
#endif
	// Compiler language features
#if defined(__cpp_rtti) && __cpp_rtti
		"(C++ RTTI) "
#endif
#if defined(__cpp_exceptions) && __cpp_exceptions
		"(C++ Exceptions) "
#endif
	;
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,94 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#ifdef JPH_USE_NEON
// Constructing NEON values
// MSVC's NEON vector types are unions over 64-bit lanes, so initializer lists must
// pack the components into two 64-bit integers; other compilers take the lanes directly.
#ifdef JPH_COMPILER_MSVC
#define JPH_NEON_INT32x4(v1, v2, v3, v4) { int64_t(v1) + (int64_t(v2) << 32), int64_t(v3) + (int64_t(v4) << 32) }
#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { uint64_t(v1) + (uint64_t(v2) << 32), uint64_t(v3) + (uint64_t(v4) << 32) }
#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { int64_t(v1) + (int64_t(v2) << 8) + (int64_t(v3) << 16) + (int64_t(v4) << 24) + (int64_t(v5) << 32) + (int64_t(v6) << 40) + (int64_t(v7) << 48) + (int64_t(v8) << 56), int64_t(v9) + (int64_t(v10) << 8) + (int64_t(v11) << 16) + (int64_t(v12) << 24) + (int64_t(v13) << 32) + (int64_t(v14) << 40) + (int64_t(v15) << 48) + (int64_t(v16) << 56) }
#define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { uint64_t(v1) + (uint64_t(v2) << 8) + (uint64_t(v3) << 16) + (uint64_t(v4) << 24) + (uint64_t(v5) << 32) + (uint64_t(v6) << 40) + (uint64_t(v7) << 48) + (uint64_t(v8) << 56), uint64_t(v9) + (uint64_t(v10) << 8) + (uint64_t(v11) << 16) + (uint64_t(v12) << 24) + (uint64_t(v13) << 32) + (uint64_t(v14) << 40) + (uint64_t(v15) << 48) + (uint64_t(v16) << 56) }
#else
#define JPH_NEON_INT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
#define JPH_NEON_UINT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
#endif
// MSVC and GCC prior to version 12 don't define __builtin_shufflevector
#if defined(JPH_COMPILER_MSVC) || (defined(JPH_COMPILER_GCC) && __GNUC__ < 12)
JPH_NAMESPACE_BEGIN
// Generic shuffle vector template
// Index semantics follow __builtin_shufflevector: lane indices 0..3 select from inV1,
// 4..7 select from inV2 (the `& 0b11` maps an index back to a lane within the chosen source).
template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
JPH_INLINE float32x4_t NeonShuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
{
// Build the result one lane at a time; specializations below replace common
// patterns with cheaper dedicated instruction sequences
float32x4_t ret;
ret = vmovq_n_f32(vgetq_lane_f32(I1 >= 4? inV2 : inV1, I1 & 0b11));
ret = vsetq_lane_f32(vgetq_lane_f32(I2 >= 4? inV2 : inV1, I2 & 0b11), ret, 1);
ret = vsetq_lane_f32(vgetq_lane_f32(I3 >= 4? inV2 : inV1, I3 & 0b11), ret, 2);
ret = vsetq_lane_f32(vgetq_lane_f32(I4 >= 4? inV2 : inV1, I4 & 0b11), ret, 3);
return ret;
}
// Specializations
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
{
return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 0));
}
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
{
return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 1));
}
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, float32x4_t inV2)
{
// Identity shuffle: just pass the input through
return inV1;
}
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
{
return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vrev64_f32(vget_high_f32(inV1)));
}
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
{
return vcombine_f32(vdup_lane_f32(vget_high_f32(inV1), 0), vrev64_f32(vget_low_f32(inV1)));
}
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, float32x4_t inV2)
{
return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
}
// Used extensively by cross product
template <>
JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, float32x4_t inV2)
{
// Byte-level table lookup: each table entry names the source byte for that output byte
static uint8x16_t table = JPH_NEON_UINT8x16(0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(inV1), table));
}
// Shuffle a vector
#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) NeonShuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) vreinterpretq_u32_f32((NeonShuffleFloat32x4<index1, index2, index3, index4>(vreinterpretq_f32_u32(vec1), vreinterpretq_f32_u32(vec2))))
JPH_NAMESPACE_END
#else
// Shuffle a vector
#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
#define JPH_NEON_SHUFFLE_U32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
#endif
#endif // JPH_USE_NEON

View File

@ -0,0 +1,713 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/STLAllocator.h>
#include <Jolt/Core/HashCombine.h>
#ifdef JPH_USE_STD_VECTOR
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <vector>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// In this configuration Array is simply std::vector with Jolt's STL allocator plugged in
template <class T, class Allocator = STLAllocator<T>> using Array = std::vector<T, Allocator>;
JPH_NAMESPACE_END
#else
JPH_NAMESPACE_BEGIN
/// Simple replacement for std::vector
///
/// Major differences:
/// - Memory is not initialized to zero (this was causing a lot of page faults when deserializing large MeshShapes / HeightFieldShapes)
/// - Iterators are simple pointers (for now)
/// - No exception safety
/// - No specialization like std::vector<bool> has
/// - Not all functions have been implemented
template <class T, class Allocator = STLAllocator<T>>
class [[nodiscard]] Array : private Allocator
{
public:
// Standard container type aliases (iterators are plain pointers)
using value_type = T;
using allocator_type = Allocator;
using size_type = size_t;
using difference_type = typename Allocator::difference_type;
using pointer = T *;
using const_pointer = const T *;
using reference = T &;
using const_reference = const T &;
using const_iterator = const T *;
using iterator = T *;
/// An iterator that traverses the array in reverse order
class rev_it
{
public:
/// Constructor
rev_it() = default;
explicit rev_it(T *inValue) : mValue(inValue) { }
/// Copying
rev_it(const rev_it &) = default;
rev_it & operator = (const rev_it &) = default;
/// Comparison
bool operator == (const rev_it &inRHS) const { return mValue == inRHS.mValue; }
bool operator != (const rev_it &inRHS) const { return mValue != inRHS.mValue; }
/// Arithmetics
// ++/+ move toward the front of the array (pointer decreases), mirroring std::reverse_iterator
rev_it & operator ++ () { --mValue; return *this; }
rev_it operator ++ (int) { return rev_it(mValue--); }
rev_it & operator -- () { ++mValue; return *this; }
rev_it operator -- (int) { return rev_it(mValue++); }
rev_it operator + (int inValue) const { return rev_it(mValue - inValue); }
rev_it operator - (int inValue) const { return rev_it(mValue + inValue); }
rev_it & operator += (int inValue) { mValue -= inValue; return *this; }
rev_it & operator -= (int inValue) { mValue += inValue; return *this; }
/// Access
T & operator * () const { return *mValue; }
// NOTE(review): operator -> returns T & rather than T *; using `it->member` on a plain
// struct would not compile if instantiated — confirm against upstream Jolt
T & operator -> () const { return *mValue; }
private:
T * mValue;
};
/// A const iterator that traverses the array in reverse order
class crev_it
{
public:
/// Constructor
crev_it() = default;
explicit crev_it(const T *inValue) : mValue(inValue) { }
/// Copying
crev_it(const crev_it &) = default;
// NOTE(review): reads rev_it's private mValue without being a friend; this member is only
// instantiated when used, so it may never have been exercised — verify
explicit crev_it(const rev_it &inValue) : mValue(inValue.mValue) { }
crev_it & operator = (const crev_it &) = default;
crev_it & operator = (const rev_it &inRHS) { mValue = inRHS.mValue; return *this; }
/// Comparison
bool operator == (const crev_it &inRHS) const { return mValue == inRHS.mValue; }
bool operator != (const crev_it &inRHS) const { return mValue != inRHS.mValue; }
/// Arithmetics
// ++/+ move toward the front of the array (pointer decreases)
crev_it & operator ++ () { --mValue; return *this; }
crev_it operator ++ (int) { return crev_it(mValue--); }
crev_it & operator -- () { ++mValue; return *this; }
crev_it operator -- (int) { return crev_it(mValue++); }
crev_it operator + (int inValue) { return crev_it(mValue - inValue); }
crev_it operator - (int inValue) { return crev_it(mValue + inValue); }
crev_it & operator += (int inValue) { mValue -= inValue; return *this; }
crev_it & operator -= (int inValue) { mValue += inValue; return *this; }
/// Access
const T & operator * () const { return *mValue; }
// NOTE(review): same as rev_it — operator -> returns a reference, not a pointer
const T & operator -> () const { return *mValue; }
private:
const T * mValue;
};
using reverse_iterator = rev_it;
using const_reverse_iterator = crev_it;
private:
/// Move elements from one location to another
/// Trivially copyable types use a single memmove; otherwise elements are moved one by one,
/// walking front-to-back or back-to-front depending on direction so overlapping ranges stay correct.
/// Each source element is destructed after being moved from.
inline void move(pointer inDestination, pointer inSource, size_type inCount)
{
if constexpr (std::is_trivially_copyable<T>())
memmove(inDestination, inSource, inCount * sizeof(T));
else
{
if (inDestination < inSource)
{
for (T *destination_end = inDestination + inCount; inDestination < destination_end; ++inDestination, ++inSource)
{
new (inDestination) T(std::move(*inSource));
inSource->~T();
}
}
else
{
for (T *destination = inDestination + inCount - 1, *source = inSource + inCount - 1; destination >= inDestination; --destination, --source)
{
new (destination) T(std::move(*source));
source->~T();
}
}
}
}
/// Reallocate the data block to inNewCapacity
/// Uses the allocator's in-place reallocate extension when available, otherwise
/// allocates a fresh block and moves the elements over.
inline void reallocate(size_type inNewCapacity)
{
JPH_ASSERT(inNewCapacity > 0 && inNewCapacity >= mSize);
pointer ptr;
if constexpr (AllocatorHasReallocate<Allocator>::sValue)
{
// Reallocate data block
ptr = get_allocator().reallocate(mElements, mCapacity, inNewCapacity);
}
else
{
// Copy data to a new location
ptr = get_allocator().allocate(inNewCapacity);
if (mElements != nullptr)
{
move(ptr, mElements, mSize);
get_allocator().deallocate(mElements, mCapacity);
}
}
mElements = ptr;
mCapacity = inNewCapacity;
}
/// Destruct elements [inStart, inEnd - 1]
/// No-op for trivially destructible types or when the range is empty.
inline void destruct(size_type inStart, size_type inEnd)
{
if constexpr (!std::is_trivially_destructible<T>())
if (inStart < inEnd)
for (T *element = mElements + inStart, *element_end = mElements + inEnd; element < element_end; ++element)
element->~T();
}
public:
/// Reserve array space
inline void reserve(size_type inNewSize)
{
if (mCapacity < inNewSize)
reallocate(inNewSize);
}
/// Resize array to new length
/// Note: when growing, trivially constructible types are left uninitialized (this differs
/// from std::vector, which value-initializes — a deliberate design choice of this class).
inline void resize(size_type inNewSize)
{
destruct(inNewSize, mSize);
reserve(inNewSize);
if constexpr (!std::is_trivially_constructible<T>())
for (T *element = mElements + mSize, *element_end = mElements + inNewSize; element < element_end; ++element)
new (element) T;
mSize = inNewSize;
}
/// Resize array to new length and initialize all elements with inValue
inline void resize(size_type inNewSize, const T &inValue)
{
JPH_ASSERT(&inValue < mElements || &inValue >= mElements + mSize, "Can't pass an element from the array to resize");
destruct(inNewSize, mSize);
reserve(inNewSize);
for (T *element = mElements + mSize, *element_end = mElements + inNewSize; element < element_end; ++element)
new (element) T(inValue);
mSize = inNewSize;
}
/// Destruct all elements and set length to zero
/// Capacity (and the allocation) is kept.
inline void clear()
{
destruct(0, mSize);
mSize = 0;
}
private:
/// Grow the array by at least inAmount elements
/// Capacity at least doubles to keep push_back amortized O(1).
inline void grow(size_type inAmount = 1)
{
size_type min_size = mSize + inAmount;
if (min_size > mCapacity)
{
size_type new_capacity = max(min_size, mCapacity * 2);
reserve(new_capacity);
}
}
/// Free memory
inline void deallocate()
{
get_allocator().deallocate(mElements, mCapacity);
mElements = nullptr;
mCapacity = 0;
}
/// Destroy all elements and free memory
inline void destroy()
{
if (mElements != nullptr)
{
clear();
deallocate();
}
}
public:
/// Replace the contents of this array with inBegin .. inEnd
template <class Iterator>
inline void assign(Iterator inBegin, Iterator inEnd)
{
clear();
reserve(size_type(std::distance(inBegin, inEnd)));
for (Iterator element = inBegin; element != inEnd; ++element)
new (&mElements[mSize++]) T(*element);
}
/// Replace the contents of this array with inList
inline void assign(std::initializer_list<T> inList)
{
clear();
reserve(size_type(inList.size()));
for (const T &v : inList)
new (&mElements[mSize++]) T(v);
}
/// Default constructor
Array() = default;
/// Constructor with allocator
explicit inline Array(const Allocator &inAllocator) :
Allocator(inAllocator)
{
}
/// Constructor with length
explicit inline Array(size_type inLength, const Allocator &inAllocator = { }) :
Allocator(inAllocator)
{
resize(inLength);
}
/// Constructor with length and value
inline Array(size_type inLength, const T &inValue, const Allocator &inAllocator = { }) :
Allocator(inAllocator)
{
resize(inLength, inValue);
}
/// Constructor from initializer list
inline Array(std::initializer_list<T> inList, const Allocator &inAllocator = { }) :
Allocator(inAllocator)
{
assign(inList);
}
/// Constructor from iterator
inline Array(const_iterator inBegin, const_iterator inEnd, const Allocator &inAllocator = { }) :
Allocator(inAllocator)
{
assign(inBegin, inEnd);
}
/// Copy constructor
inline Array(const Array<T, Allocator> &inRHS) :
Allocator(inRHS.get_allocator())
{
assign(inRHS.begin(), inRHS.end());
}
/// Move constructor
/// Steals the buffer and leaves inRHS empty.
inline Array(Array<T, Allocator> &&inRHS) noexcept :
Allocator(std::move(inRHS.get_allocator())),
mSize(inRHS.mSize),
mCapacity(inRHS.mCapacity),
mElements(inRHS.mElements)
{
inRHS.mSize = 0;
inRHS.mCapacity = 0;
inRHS.mElements = nullptr;
}
/// Destruct all elements
inline ~Array()
{
destroy();
}
/// Get the allocator
// The allocator is a private base class (empty base optimization), so *this is the allocator
inline Allocator & get_allocator()
{
return *this;
}
inline const Allocator &get_allocator() const
{
return *this;
}
/// Add element to the back of the array
inline void push_back(const T &inValue)
{
JPH_ASSERT(&inValue < mElements || &inValue >= mElements + mSize, "Can't pass an element from the array to push_back");
grow();
T *element = mElements + mSize++;
new (element) T(inValue);
}
inline void push_back(T &&inValue)
{
grow();
T *element = mElements + mSize++;
new (element) T(std::move(inValue));
}
/// Construct element at the back of the array
template <class... A>
inline T & emplace_back(A &&... inValue)
{
grow();
T *element = mElements + mSize++;
new (element) T(std::forward<A>(inValue)...);
return *element;
}
/// Remove element from the back of the array
inline void pop_back()
{
JPH_ASSERT(mSize > 0);
mElements[--mSize].~T();
}
/// Returns true if there are no elements in the array
inline bool empty() const
{
return mSize == 0;
}
/// Returns amount of elements in the array
inline size_type size() const
{
return mSize;
}
/// Returns maximum amount of elements the array can hold
inline size_type capacity() const
{
return mCapacity;
}
/// Reduce the capacity of the array to match its size
void shrink_to_fit()
{
if (mElements != nullptr)
{
if (mSize == 0)
deallocate();
else if (mCapacity > mSize)
reallocate(mSize);
}
}
/// Swap the contents of two arrays
void swap(Array<T, Allocator> &inRHS) noexcept
{
std::swap(get_allocator(), inRHS.get_allocator());
std::swap(mSize, inRHS.mSize);
std::swap(mCapacity, inRHS.mCapacity);
std::swap(mElements, inRHS.mElements);
}
/// Insert a range of elements before inPos
// NOTE(review): unlike the single-element overload below there is no assert that
// [inBegin, inEnd) does not alias this array; grow() may invalidate such iterators — confirm callers
template <class Iterator>
void insert(const_iterator inPos, Iterator inBegin, Iterator inEnd)
{
size_type num_elements = size_type(std::distance(inBegin, inEnd));
if (num_elements > 0)
{
// After grow() inPos may be invalid
size_type first_element = inPos - mElements;
grow(num_elements);
T *element_begin = mElements + first_element;
T *element_end = element_begin + num_elements;
move(element_end, element_begin, mSize - first_element);
for (T *element = element_begin; element < element_end; ++element, ++inBegin)
new (element) T(*inBegin);
mSize += num_elements;
}
}
/// Insert one element before inPos
void insert(const_iterator inPos, const T &inValue)
{
JPH_ASSERT(&inValue < mElements || &inValue >= mElements + mSize, "Can't pass an element from the array to insert");
// After grow() inPos may be invalid
size_type first_element = inPos - mElements;
grow();
T *element = mElements + first_element;
move(element + 1, element, mSize - first_element);
new (element) T(inValue);
mSize++;
}
/// Remove one element from the array
iterator erase(const_iterator inIter)
{
size_type p = size_type(inIter - begin());
JPH_ASSERT(p < mSize);
mElements[p].~T();
if (p + 1 < mSize)
move(mElements + p, mElements + p + 1, mSize - p - 1);
--mSize;
return const_cast<iterator>(inIter);
}
/// Remove multiple element from the array
iterator erase(const_iterator inBegin, const_iterator inEnd)
{
size_type p = size_type(inBegin - begin());
size_type n = size_type(inEnd - inBegin);
JPH_ASSERT(inEnd <= end());
destruct(p, p + n);
if (p + n < mSize)
move(mElements + p, mElements + p + n, mSize - p - n);
mSize -= n;
return const_cast<iterator>(inBegin);
}
/// Iterators
inline const_iterator begin() const
{
return mElements;
}
inline const_iterator end() const
{
return mElements + mSize;
}
// NOTE(review): rbegin/rend form pointers mElements + mSize - 1 / mElements - 1; for an
// empty or null array this is technically out-of-range pointer arithmetic — confirm intent
inline crev_it rbegin() const
{
return crev_it(mElements + mSize - 1);
}
inline crev_it rend() const
{
return crev_it(mElements - 1);
}
inline const_iterator cbegin() const
{
return begin();
}
inline const_iterator cend() const
{
return end();
}
inline crev_it crbegin() const
{
return rbegin();
}
inline crev_it crend() const
{
return rend();
}
inline iterator begin()
{
return mElements;
}
inline iterator end()
{
return mElements + mSize;
}
inline rev_it rbegin()
{
return rev_it(mElements + mSize - 1);
}
inline rev_it rend()
{
return rev_it(mElements - 1);
}
inline const T * data() const
{
return mElements;
}
inline T * data()
{
return mElements;
}
/// Access element
inline T & operator [] (size_type inIdx)
{
JPH_ASSERT(inIdx < mSize);
return mElements[inIdx];
}
inline const T & operator [] (size_type inIdx) const
{
JPH_ASSERT(inIdx < mSize);
return mElements[inIdx];
}
/// Access element
// Note: unlike std::vector::at this does not throw; out-of-range access only asserts
inline T & at(size_type inIdx)
{
JPH_ASSERT(inIdx < mSize);
return mElements[inIdx];
}
inline const T & at(size_type inIdx) const
{
JPH_ASSERT(inIdx < mSize);
return mElements[inIdx];
}
/// First element in the array
inline const T & front() const
{
JPH_ASSERT(mSize > 0);
return mElements[0];
}
inline T & front()
{
JPH_ASSERT(mSize > 0);
return mElements[0];
}
/// Last element in the array
inline const T & back() const
{
JPH_ASSERT(mSize > 0);
return mElements[mSize - 1];
}
inline T & back()
{
JPH_ASSERT(mSize > 0);
return mElements[mSize - 1];
}
/// Assignment operator
Array<T, Allocator> & operator = (const Array<T, Allocator> &inRHS)
{
if (static_cast<const void *>(this) != static_cast<const void *>(&inRHS))
assign(inRHS.begin(), inRHS.end());
return *this;
}
/// Assignment move operator
Array<T, Allocator> & operator = (Array<T, Allocator> &&inRHS) noexcept
{
if (static_cast<const void *>(this) != static_cast<const void *>(&inRHS))
{
destroy();
get_allocator() = std::move(inRHS.get_allocator());
mSize = inRHS.mSize;
mCapacity = inRHS.mCapacity;
mElements = inRHS.mElements;
inRHS.mSize = 0;
inRHS.mCapacity = 0;
inRHS.mElements = nullptr;
}
return *this;
}
/// Assignment operator
Array<T, Allocator> & operator = (std::initializer_list<T> inRHS)
{
assign(inRHS);
return *this;
}
/// Comparing arrays
// Element-wise comparison; only operator == is required of T (== is negated for !=)
bool operator == (const Array<T, Allocator> &inRHS) const
{
if (mSize != inRHS.mSize)
return false;
for (size_type i = 0; i < mSize; ++i)
if (!(mElements[i] == inRHS.mElements[i]))
return false;
return true;
}
bool operator != (const Array<T, Allocator> &inRHS) const
{
if (mSize != inRHS.mSize)
return true;
for (size_type i = 0; i < mSize; ++i)
if (mElements[i] != inRHS.mElements[i])
return true;
return false;
}
/// Get hash for this array
uint64 GetHash() const
{
// Hash length first
uint64 ret = Hash<uint32> { } (uint32(size()));
// Then hash elements
for (const T *element = mElements, *element_end = mElements + mSize; element < element_end; ++element)
HashCombine(ret, *element);
return ret;
}
private:
size_type mSize = 0;
size_type mCapacity = 0;
T * mElements = nullptr;
};
JPH_NAMESPACE_END
JPH_SUPPRESS_WARNING_PUSH
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat")
namespace std
{
/// Declare std::hash for Array
/// Allows Array to be used as a key in std::unordered_map / std::unordered_set.
template <class T, class Allocator>
struct hash<JPH::Array<T, Allocator>>
{
size_t operator () (const JPH::Array<T, Allocator> &inRHS) const
{
// Delegate to the array's own GetHash (which hashes the length, then each element)
return std::size_t(inRHS.GetHash());
}
};
}
JPH_SUPPRESS_WARNING_POP
#endif // JPH_USE_STD_VECTOR

View File

@ -0,0 +1,44 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <atomic>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// Things we're using from STL
using std::atomic;
using std::memory_order;
using std::memory_order_relaxed;
using std::memory_order_acquire;
using std::memory_order_release;
using std::memory_order_acq_rel;
using std::memory_order_seq_cst;
/// Atomically compute the min(ioAtomic, inValue) and store it in ioAtomic, returns true if value was updated
template <class T>
bool AtomicMin(atomic<T> &ioAtomic, const T inValue, const memory_order inMemoryOrder = memory_order_seq_cst)
{
	// Snapshot the stored value; compare_exchange_weak refreshes it whenever the exchange fails
	T observed = ioAtomic.load(memory_order_relaxed);
	for (;;)
	{
		// The stored value is already <= inValue, nothing to write
		if (observed <= inValue)
			return false;

		// Try to publish the smaller value; on spurious failure or contention just re-test
		if (ioAtomic.compare_exchange_weak(observed, inValue, inMemoryOrder))
			return true;
	}
}
/// Atomically compute the max(ioAtomic, inValue) and store it in ioAtomic, returns true if value was updated
template <class T>
bool AtomicMax(atomic<T> &ioAtomic, const T inValue, const memory_order inMemoryOrder = memory_order_seq_cst)
{
	// Snapshot the stored value; compare_exchange_weak refreshes it whenever the exchange fails
	T observed = ioAtomic.load(memory_order_relaxed);
	for (;;)
	{
		// The stored value is already >= inValue, nothing to write
		if (observed >= inValue)
			return false;

		// Try to publish the larger value; on spurious failure or contention just re-test
		if (ioAtomic.compare_exchange_weak(observed, inValue, inMemoryOrder))
			return true;
	}
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,96 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Push a new element into a binary max-heap.
/// [inBegin, inEnd - 1) must be a valid heap. Element inEnd - 1 will be inserted into the heap. The heap will be [inBegin, inEnd) after this call.
/// inPred is a function that returns true if the first element is less or equal than the second element.
/// See: https://en.wikipedia.org/wiki/Binary_heap
template <typename Iterator, typename Pred>
void BinaryHeapPush(Iterator inBegin, Iterator inEnd, Pred inPred)
{
	using diff_t = typename std::iterator_traits<Iterator>::difference_type;

	// The freshly appended element occupies the last slot of the heap
	diff_t child = std::distance(inBegin, inEnd) - 1;

	// Sift the element up until its parent is no smaller than it
	while (child > 0)
	{
		diff_t parent = (child - 1) / 2;

		// Heap property already satisfied: parent is not smaller than child
		if (!inPred(*(inBegin + parent), *(inBegin + child)))
			return;

		// Parent is smaller, bubble the new element one level up
		std::swap(*(inBegin + parent), *(inBegin + child));
		child = parent;
	}
}
/// Pop an element from a binary max-heap.
/// [inBegin, inEnd) must be a valid heap. The largest element will be removed from the heap. The heap will be [inBegin, inEnd - 1) after this call.
/// inPred is a function that returns true if the first element is less or equal than the second element.
/// See: https://en.wikipedia.org/wiki/Binary_heap
template <typename Iterator, typename Pred>
void BinaryHeapPop(Iterator inBegin, Iterator inEnd, Pred inPred)
{
	using diff_t = typename std::iterator_traits<Iterator>::difference_type;

	// Move the root (the largest element) into the popped slot at the end
	std::swap(*(inEnd - 1), *inBegin);

	// The heap is now one element smaller
	diff_t count = std::distance(inBegin, inEnd) - 1;

	// Sift the element that ended up at the root down to its proper place
	diff_t parent = 0;
	for (;;)
	{
		// First child; if it is past the end of the heap so is the second child
		diff_t child = 2 * parent + 1;
		if (child >= count)
			break;

		// Select the largest of parent and its (up to two) children
		diff_t best = parent;
		if (inPred(*(inBegin + best), *(inBegin + child)))
			best = child;
		++child;
		if (child < count && inPred(*(inBegin + best), *(inBegin + child)))
			best = child;

		// Parent already largest: heap property restored
		if (best == parent)
			break;

		// Push the element one level down and continue from there
		std::swap(*(inBegin + parent), *(inBegin + best));
		parent = best;
	}
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,74 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/STLAlignedAllocator.h>
JPH_NAMESPACE_BEGIN
/// Underlying data type for ByteBuffer
using ByteBufferVector = Array<uint8, STLAlignedAllocator<uint8, JPH_CACHE_LINE_SIZE>>;
/// Simple byte buffer, aligned to a cache line
class ByteBuffer : public ByteBufferVector
{
public:
	/// Align the size to a multiple of inSize, returns the length after alignment
	/// @param inSize Alignment in bytes, must be a power of 2
	size_t Align(size_t inSize)
	{
		// Assert power of 2
		JPH_ASSERT(IsPowerOf2(inSize));

		// Calculate new size and resize buffer (padding bytes are zero initialized)
		size_t s = AlignUp(size(), inSize);
		resize(s, 0);

		return s;
	}

	/// Allocate block of data of inSize elements and return the pointer
	/// Elements are default constructed in place. Returns nullptr when inSize is 0.
	template <class Type>
	Type * Allocate(size_t inSize = 1)
	{
		// Nothing requested: don't grow the buffer; there is no valid element whose address we could return
		// (previously &at(size()) would have been taken, which is out of range)
		if (inSize == 0)
			return nullptr;

		// Reserve space
		size_t s = size();
		resize(s + inSize * sizeof(Type));

		// Get data pointer
		Type *data = reinterpret_cast<Type *>(&at(s));

		// Construct elements
		for (Type *d = data, *d_end = data + inSize; d < d_end; ++d)
			new (d) Type;

		// Return pointer
		return data;
	}

	/// Append the contents of inData to the buffer as raw bytes
	template <class Type>
	void AppendVector(const Array<Type> &inData)
	{
		// An empty array appends nothing; this also avoids taking the address of
		// element 0 of an empty container, which is undefined behavior
		if (inData.empty())
			return;

		size_t num_bytes = inData.size() * sizeof(Type);
		uint8 *data = Allocate<uint8>(num_bytes);
		memcpy(data, inData.data(), num_bytes);
	}

	/// Get object at inPosition (an offset in bytes)
	template <class Type>
	const Type * Get(size_t inPosition) const
	{
		return reinterpret_cast<const Type *>(&at(inPosition));
	}

	/// Get object at inPosition (an offset in bytes)
	template <class Type>
	Type * Get(size_t inPosition)
	{
		return reinterpret_cast<Type *>(&at(inPosition));
	}
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,38 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/Color.h>
JPH_NAMESPACE_BEGIN
// Predefined colors
// Out-of-line definitions for the static color constants declared in the Color class
const Color Color::sBlack(0, 0, 0);
const Color Color::sDarkRed(128, 0, 0);
const Color Color::sRed(255, 0, 0);
const Color Color::sDarkGreen(0, 128, 0);
const Color Color::sGreen(0, 255, 0);
const Color Color::sDarkBlue(0, 0, 128);
const Color Color::sBlue(0, 0, 255);
const Color Color::sYellow(255, 255, 0);
const Color Color::sPurple(255, 0, 255);
const Color Color::sCyan(0, 255, 255);
const Color Color::sOrange(255, 128, 0);
const Color Color::sDarkOrange(128, 64, 0);
const Color Color::sGrey(128, 128, 128);
const Color Color::sLightGrey(192, 192, 192);
const Color Color::sWhite(255, 255, 255);
// Generated by: http://phrogz.net/css/distinct-colors.html (this algo: https://en.wikipedia.org/wiki/Color_difference#CMC_l:c_.281984.29)
static constexpr Color sColors[] = { Color(255, 0, 0), Color(204, 143, 102), Color(226, 242, 0), Color(41, 166, 124), Color(0, 170, 255), Color(69, 38, 153), Color(153, 38, 130), Color(229, 57, 80), Color(204, 0, 0), Color(255, 170, 0), Color(85, 128, 0), Color(64, 255, 217), Color(0, 75, 140), Color(161, 115, 230), Color(242, 61, 157), Color(178, 101, 89), Color(140, 94, 0), Color(181, 217, 108), Color(64, 242, 255), Color(77, 117, 153), Color(157, 61, 242), Color(140, 0, 56), Color(127, 57, 32), Color(204, 173, 51), Color(64, 255, 64), Color(38, 145, 153), Color(0, 102, 255), Color(242, 0, 226), Color(153, 77, 107), Color(229, 92, 0), Color(140, 126, 70), Color(0, 179, 71), Color(0, 194, 242), Color(27, 0, 204), Color(230, 115, 222), Color(127, 0, 17) };

/// Returns one of a fixed palette of visually distinct colors, cycling through the palette when inIndex exceeds its size
/// @param inIndex Non-negative index into the palette
Color Color::sGetDistinctColor(int inIndex)
{
	JPH_ASSERT(inIndex >= 0);

	// Derive the element count from the array element itself instead of assuming
	// sizeof(Color) == sizeof(uint32); the old divisor silently depended on that equality
	return sColors[inIndex % (sizeof(sColors) / sizeof(sColors[0]))];
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,98 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
class Color;

/// Type to use for passing arguments to a function
using ColorArg = Color;

/// Class that holds an RGBA color with 8-bits per component
class JPH_EXPORT_GCC_BUG_WORKAROUND [[nodiscard]] Color
{
public:
/// Constructors
Color() = default; ///< Intentionally not initialized for performance reasons
Color(const Color &inRHS) = default;
Color & operator = (const Color &inRHS) = default;
explicit constexpr Color(uint32 inColor) : mU32(inColor) { } ///< Construct from the combined 32-bit value
constexpr Color(uint8 inRed, uint8 inGreen, uint8 inBlue, uint8 inAlpha = 255) : r(inRed), g(inGreen), b(inBlue), a(inAlpha) { }
constexpr Color(ColorArg inRHS, uint8 inAlpha) : r(inRHS.r), g(inRHS.g), b(inRHS.b), a(inAlpha) { } ///< Copy RGB from inRHS, replacing the alpha
/// Comparison
inline bool operator == (ColorArg inRHS) const { return mU32 == inRHS.mU32; }
inline bool operator != (ColorArg inRHS) const { return mU32 != inRHS.mU32; }
/// Convert to uint32
uint32 GetUInt32() const { return mU32; }
/// Element access, 0 = red, 1 = green, 2 = blue, 3 = alpha
inline uint8 operator () (uint inIdx) const { JPH_ASSERT(inIdx < 4); return (&r)[inIdx]; }
inline uint8 & operator () (uint inIdx) { JPH_ASSERT(inIdx < 4); return (&r)[inIdx]; }
/// Multiply two colors
// Per-channel product scaled by >> 8 (divide by 256, not 255): a cheap approximation,
// so multiplying two full-intensity channels yields 254 rather than 255
inline Color operator * (const Color &inRHS) const { return Color(uint8((uint32(r) * inRHS.r) >> 8), uint8((uint32(g) * inRHS.g) >> 8), uint8((uint32(b) * inRHS.b) >> 8), uint8((uint32(a) * inRHS.a) >> 8)); }
/// Multiply color with intensity in the range [0, 1]
// Alpha is deliberately left unscaled
inline Color operator * (float inIntensity) const { return Color(uint8(r * inIntensity), uint8(g * inIntensity), uint8(b * inIntensity), a); }
/// Convert to Vec4 with range [0, 1]
inline Vec4 ToVec4() const { return Vec4(r, g, b, a) / 255.0f; }
/// Get grayscale intensity of color
// Weighted sum with weights (54, 183, 19) / 256 — green weighted heaviest; presumably chosen
// to approximate standard luma coefficients — TODO confirm against a luma reference
inline uint8 GetIntensity() const { return uint8((uint32(r) * 54 + g * 183 + b * 19) >> 8); }
/// Get a visually distinct color
static Color sGetDistinctColor(int inIndex);
/// Get a color value on the gradient from green through yellow to red
/// @param inValue Value in the range [0, 1], 0 = green, 0.5 = yellow, 1 = red
/// Values outside [0, 1] clamp to the respective end of the gradient
static Color sGreenRedGradient(float inValue)
{
if (inValue < 0.0f)
return Color::sGreen;
else if (inValue < 0.5f)
return Color(uint8(510.0f * inValue), 255, 0);
else if (inValue < 1.0f)
return Color(255, uint8(510.0f * (1.0f - inValue)), 0);
else
return Color::sRed;
}
/// Predefined colors
static const Color sBlack;
static const Color sDarkRed;
static const Color sRed;
static const Color sDarkGreen;
static const Color sGreen;
static const Color sDarkBlue;
static const Color sBlue;
static const Color sYellow;
static const Color sPurple;
static const Color sCyan;
static const Color sOrange;
static const Color sDarkOrange;
static const Color sGrey;
static const Color sLightGrey;
static const Color sWhite;
// NOTE(review): mU32 and the r/g/b/a struct alias the same storage (union type punning);
// the byte order of the channels within mU32 therefore depends on platform endianness
union
{
uint32 mU32; ///< Combined value for red, green, blue and alpha
struct
{
uint8 r; ///< Red channel
uint8 g; ///< Green channel
uint8 b; ///< Blue channel
uint8 a; ///< Alpha channel
};
};
};
static_assert(std::is_trivial<Color>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END

View File

@ -0,0 +1,662 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
// Jolt library version
#define JPH_VERSION_MAJOR 5
#define JPH_VERSION_MINOR 5
#define JPH_VERSION_PATCH 1
// Determine which features the library was compiled with
#ifdef JPH_DOUBLE_PRECISION
#define JPH_VERSION_FEATURE_BIT_1 1
#else
#define JPH_VERSION_FEATURE_BIT_1 0
#endif
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
#define JPH_VERSION_FEATURE_BIT_2 1
#else
#define JPH_VERSION_FEATURE_BIT_2 0
#endif
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#define JPH_VERSION_FEATURE_BIT_3 1
#else
#define JPH_VERSION_FEATURE_BIT_3 0
#endif
#ifdef JPH_PROFILE_ENABLED
#define JPH_VERSION_FEATURE_BIT_4 1
#else
#define JPH_VERSION_FEATURE_BIT_4 0
#endif
#ifdef JPH_EXTERNAL_PROFILE
#define JPH_VERSION_FEATURE_BIT_5 1
#else
#define JPH_VERSION_FEATURE_BIT_5 0
#endif
#ifdef JPH_DEBUG_RENDERER
#define JPH_VERSION_FEATURE_BIT_6 1
#else
#define JPH_VERSION_FEATURE_BIT_6 0
#endif
#ifdef JPH_DISABLE_TEMP_ALLOCATOR
#define JPH_VERSION_FEATURE_BIT_7 1
#else
#define JPH_VERSION_FEATURE_BIT_7 0
#endif
#ifdef JPH_DISABLE_CUSTOM_ALLOCATOR
#define JPH_VERSION_FEATURE_BIT_8 1
#else
#define JPH_VERSION_FEATURE_BIT_8 0
#endif
#if defined(JPH_OBJECT_LAYER_BITS) && JPH_OBJECT_LAYER_BITS == 32
#define JPH_VERSION_FEATURE_BIT_9 1
#else
#define JPH_VERSION_FEATURE_BIT_9 0
#endif
#ifdef JPH_ENABLE_ASSERTS
#define JPH_VERSION_FEATURE_BIT_10 1
#else
#define JPH_VERSION_FEATURE_BIT_10 0
#endif
#ifdef JPH_OBJECT_STREAM
#define JPH_VERSION_FEATURE_BIT_11 1
#else
#define JPH_VERSION_FEATURE_BIT_11 0
#endif
// Pack all feature flags into a single bit mask; bit (N - 1) corresponds to JPH_VERSION_FEATURE_BIT_N
#define JPH_VERSION_FEATURES (uint64(JPH_VERSION_FEATURE_BIT_1) | (JPH_VERSION_FEATURE_BIT_2 << 1) | (JPH_VERSION_FEATURE_BIT_3 << 2) | (JPH_VERSION_FEATURE_BIT_4 << 3) | (JPH_VERSION_FEATURE_BIT_5 << 4) | (JPH_VERSION_FEATURE_BIT_6 << 5) | (JPH_VERSION_FEATURE_BIT_7 << 6) | (JPH_VERSION_FEATURE_BIT_8 << 7) | (JPH_VERSION_FEATURE_BIT_9 << 8) | (JPH_VERSION_FEATURE_BIT_10 << 9) | (JPH_VERSION_FEATURE_BIT_11 << 10))
// Combine the version and features in a single ID: feature bits occupy bits 24 and up, major/minor/patch occupy bits 16-23 / 8-15 / 0-7
#define JPH_VERSION_ID ((JPH_VERSION_FEATURES << 24) | (JPH_VERSION_MAJOR << 16) | (JPH_VERSION_MINOR << 8) | JPH_VERSION_PATCH)
// Determine platform
#if defined(JPH_PLATFORM_BLUE)
// Correct define already defined, this overrides everything else
#elif defined(_WIN32) || defined(_WIN64)
#include <winapifamily.h>
#if WINAPI_FAMILY == WINAPI_FAMILY_APP
#define JPH_PLATFORM_WINDOWS_UWP // Building for Universal Windows Platform
#endif
#define JPH_PLATFORM_WINDOWS
#elif defined(__ANDROID__) // Android is linux too, so that's why we check it first
#define JPH_PLATFORM_ANDROID
#elif defined(__linux__)
#define JPH_PLATFORM_LINUX
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define JPH_PLATFORM_BSD
#elif defined(__APPLE__)
#include <TargetConditionals.h>
#if defined(TARGET_OS_IPHONE) && !TARGET_OS_IPHONE
#define JPH_PLATFORM_MACOS
#else
#define JPH_PLATFORM_IOS
#endif
#elif defined(__EMSCRIPTEN__)
#define JPH_PLATFORM_WASM
#endif
// Platform helper macros
#ifdef JPH_PLATFORM_ANDROID
#define JPH_IF_NOT_ANDROID(x)
#else
#define JPH_IF_NOT_ANDROID(x) x
#endif
// Determine compiler
#if defined(__clang__)
#define JPH_COMPILER_CLANG
#elif defined(__GNUC__)
#define JPH_COMPILER_GCC
#elif defined(_MSC_VER)
#define JPH_COMPILER_MSVC
#endif
#if defined(__MINGW64__) || defined (__MINGW32__)
#define JPH_COMPILER_MINGW
#endif
// Detect CPU architecture
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64EC)
// ARM CPU architecture
#define JPH_CPU_ARM
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define JPH_CPU_ARCH_BITS 64
#define JPH_USE_NEON
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 32
#else
#define JPH_CPU_ARCH_BITS 32
#define JPH_VECTOR_ALIGNMENT 8 // 32-bit ARM does not support aligning on the stack on 16 byte boundaries
#define JPH_DVECTOR_ALIGNMENT 8
#endif
#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
// X86 CPU architecture
#define JPH_CPU_X86
#if defined(__x86_64__) || defined(_M_X64)
#define JPH_CPU_ARCH_BITS 64
#else
#define JPH_CPU_ARCH_BITS 32
#endif
#define JPH_USE_SSE
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 32
// Detect enabled instruction sets
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && !defined(JPH_USE_AVX512)
#define JPH_USE_AVX512
#endif
#if (defined(__AVX2__) || defined(JPH_USE_AVX512)) && !defined(JPH_USE_AVX2)
#define JPH_USE_AVX2
#endif
#if (defined(__AVX__) || defined(JPH_USE_AVX2)) && !defined(JPH_USE_AVX)
#define JPH_USE_AVX
#endif
#if (defined(__SSE4_2__) || defined(JPH_USE_AVX)) && !defined(JPH_USE_SSE4_2)
#define JPH_USE_SSE4_2
#endif
#if (defined(__SSE4_1__) || defined(JPH_USE_SSE4_2)) && !defined(JPH_USE_SSE4_1)
#define JPH_USE_SSE4_1
#endif
#if (defined(__F16C__) || defined(JPH_USE_AVX2)) && !defined(JPH_USE_F16C)
#define JPH_USE_F16C
#endif
#if (defined(__LZCNT__) || defined(JPH_USE_AVX2)) && !defined(JPH_USE_LZCNT)
#define JPH_USE_LZCNT
#endif
#if (defined(__BMI__) || defined(JPH_USE_AVX2)) && !defined(JPH_USE_TZCNT)
#define JPH_USE_TZCNT
#endif
#ifndef JPH_CROSS_PLATFORM_DETERMINISTIC // FMA is not compatible with cross platform determinism
#if defined(JPH_COMPILER_CLANG) || defined(JPH_COMPILER_GCC)
#if defined(__FMA__) && !defined(JPH_USE_FMADD)
#define JPH_USE_FMADD
#endif
#elif defined(JPH_COMPILER_MSVC)
#if defined(__AVX2__) && !defined(JPH_USE_FMADD) // AVX2 also enables fused multiply add
#define JPH_USE_FMADD
#endif
#else
#error Undefined compiler
#endif
#endif
#elif defined(__riscv)
// RISC-V CPU architecture
#define JPH_CPU_RISCV
#if __riscv_xlen == 64
#define JPH_CPU_ARCH_BITS 64
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 32
#else
#define JPH_CPU_ARCH_BITS 32
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 8
#endif
#elif defined(JPH_PLATFORM_WASM)
// WebAssembly CPU architecture
#define JPH_CPU_WASM
#if defined(__wasm64__)
#define JPH_CPU_ARCH_BITS 64
#else
#define JPH_CPU_ARCH_BITS 32
#endif
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 32
#ifdef __wasm_simd128__
#define JPH_USE_SSE
#define JPH_USE_SSE4_1
#define JPH_USE_SSE4_2
#endif
#elif defined(__powerpc__) || defined(__powerpc64__)
// PowerPC CPU architecture
#define JPH_CPU_PPC
#if defined(__powerpc64__)
#define JPH_CPU_ARCH_BITS 64
#else
#define JPH_CPU_ARCH_BITS 32
#endif
#ifdef _BIG_ENDIAN
#define JPH_CPU_BIG_ENDIAN
#endif
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 8
#elif defined(__loongarch__)
// LoongArch CPU architecture
#define JPH_CPU_LOONGARCH
#if defined(__loongarch64)
#define JPH_CPU_ARCH_BITS 64
#else
#define JPH_CPU_ARCH_BITS 32
#endif
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 8
#elif defined(__e2k__)
// E2K CPU architecture (MCST Elbrus 2000)
#define JPH_CPU_E2K
#define JPH_CPU_ARCH_BITS 64
#define JPH_VECTOR_ALIGNMENT 16
#define JPH_DVECTOR_ALIGNMENT 32
// Compiler flags on e2k arch determine CPU features
#if defined(__SSE__) && !defined(JPH_USE_SSE)
#define JPH_USE_SSE
#endif
#else
#error Unsupported CPU architecture
#endif
// If this define is set, Jolt is compiled as a shared library
#ifdef JPH_SHARED_LIBRARY
#ifdef JPH_BUILD_SHARED_LIBRARY
// While building the shared library, we must export these symbols
#if defined(JPH_PLATFORM_WINDOWS) && !defined(JPH_COMPILER_MINGW)
#define JPH_EXPORT __declspec(dllexport)
#else
#define JPH_EXPORT __attribute__ ((visibility ("default")))
#if defined(JPH_COMPILER_GCC)
// Prevents an issue with GCC attribute parsing (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69585)
#define JPH_EXPORT_GCC_BUG_WORKAROUND [[gnu::visibility("default")]]
#endif
#endif
#else
// When linking against Jolt, we must import these symbols
#if defined(JPH_PLATFORM_WINDOWS) && !defined(JPH_COMPILER_MINGW)
#define JPH_EXPORT __declspec(dllimport)
#else
#define JPH_EXPORT __attribute__ ((visibility ("default")))
#if defined(JPH_COMPILER_GCC)
// Prevents an issue with GCC attribute parsing (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69585)
#define JPH_EXPORT_GCC_BUG_WORKAROUND [[gnu::visibility("default")]]
#endif
#endif
#endif
#else
// If the define is not set, we use static linking and symbols don't need to be imported or exported
#define JPH_EXPORT
#endif
#ifndef JPH_EXPORT_GCC_BUG_WORKAROUND
#define JPH_EXPORT_GCC_BUG_WORKAROUND JPH_EXPORT
#endif
// Macro used by the RTTI macros to not export a function
#define JPH_NO_EXPORT
// Pragmas to store / restore the warning state and to disable individual warnings
#ifdef JPH_COMPILER_CLANG
#define JPH_PRAGMA(x) _Pragma(#x)
#define JPH_SUPPRESS_WARNING_PUSH JPH_PRAGMA(clang diagnostic push)
#define JPH_SUPPRESS_WARNING_POP JPH_PRAGMA(clang diagnostic pop)
#define JPH_CLANG_SUPPRESS_WARNING(w) JPH_PRAGMA(clang diagnostic ignored w)
#if __clang_major__ >= 13
#define JPH_CLANG_13_PLUS_SUPPRESS_WARNING(w) JPH_CLANG_SUPPRESS_WARNING(w)
#else
#define JPH_CLANG_13_PLUS_SUPPRESS_WARNING(w)
#endif
#if __clang_major__ >= 16
#define JPH_CLANG_16_PLUS_SUPPRESS_WARNING(w) JPH_CLANG_SUPPRESS_WARNING(w)
#else
#define JPH_CLANG_16_PLUS_SUPPRESS_WARNING(w)
#endif
#else
#define JPH_CLANG_SUPPRESS_WARNING(w)
#define JPH_CLANG_13_PLUS_SUPPRESS_WARNING(w)
#define JPH_CLANG_16_PLUS_SUPPRESS_WARNING(w)
#endif
#ifdef JPH_COMPILER_GCC
#define JPH_PRAGMA(x) _Pragma(#x)
#define JPH_SUPPRESS_WARNING_PUSH JPH_PRAGMA(GCC diagnostic push)
#define JPH_SUPPRESS_WARNING_POP JPH_PRAGMA(GCC diagnostic pop)
#define JPH_GCC_SUPPRESS_WARNING(w) JPH_PRAGMA(GCC diagnostic ignored w)
#else
#define JPH_GCC_SUPPRESS_WARNING(w)
#endif
#ifdef JPH_COMPILER_MSVC
#define JPH_PRAGMA(x) __pragma(x)
#define JPH_SUPPRESS_WARNING_PUSH JPH_PRAGMA(warning (push))
#define JPH_SUPPRESS_WARNING_POP JPH_PRAGMA(warning (pop))
#define JPH_MSVC_SUPPRESS_WARNING(w) JPH_PRAGMA(warning (disable : w))
#if _MSC_VER >= 1920 && _MSC_VER < 1930
#define JPH_MSVC2019_SUPPRESS_WARNING(w) JPH_MSVC_SUPPRESS_WARNING(w)
#else
#define JPH_MSVC2019_SUPPRESS_WARNING(w)
#endif
#if _MSC_VER >= 1950
#define JPH_MSVC2026_PLUS_SUPPRESS_WARNING(w) JPH_MSVC_SUPPRESS_WARNING(w)
#else
#define JPH_MSVC2026_PLUS_SUPPRESS_WARNING(w)
#endif
#else
#define JPH_MSVC_SUPPRESS_WARNING(w)
#define JPH_MSVC2019_SUPPRESS_WARNING(w)
#define JPH_MSVC2026_PLUS_SUPPRESS_WARNING(w)
#endif
// Disable common warnings triggered by Jolt when compiling with -Wall
#define JPH_SUPPRESS_WARNINGS \
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat") \
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic") \
JPH_CLANG_SUPPRESS_WARNING("-Wfloat-equal") \
JPH_CLANG_SUPPRESS_WARNING("-Wsign-conversion") \
JPH_CLANG_SUPPRESS_WARNING("-Wold-style-cast") \
JPH_CLANG_SUPPRESS_WARNING("-Wgnu-anonymous-struct") \
JPH_CLANG_SUPPRESS_WARNING("-Wnested-anon-types") \
JPH_CLANG_SUPPRESS_WARNING("-Wglobal-constructors") \
JPH_CLANG_SUPPRESS_WARNING("-Wexit-time-destructors") \
JPH_CLANG_SUPPRESS_WARNING("-Wnonportable-system-include-path") \
JPH_CLANG_SUPPRESS_WARNING("-Wlanguage-extension-token") \
JPH_CLANG_SUPPRESS_WARNING("-Wunused-parameter") \
JPH_CLANG_SUPPRESS_WARNING("-Wformat-nonliteral") \
JPH_CLANG_SUPPRESS_WARNING("-Wcovered-switch-default") \
JPH_CLANG_SUPPRESS_WARNING("-Wcast-align") \
JPH_CLANG_SUPPRESS_WARNING("-Winvalid-offsetof") \
JPH_CLANG_SUPPRESS_WARNING("-Wgnu-zero-variadic-macro-arguments") \
JPH_CLANG_SUPPRESS_WARNING("-Wdocumentation-unknown-command") \
JPH_CLANG_SUPPRESS_WARNING("-Wctad-maybe-unsupported") \
JPH_CLANG_SUPPRESS_WARNING("-Wswitch-default") \
JPH_CLANG_13_PLUS_SUPPRESS_WARNING("-Wdeprecated-copy") \
JPH_CLANG_13_PLUS_SUPPRESS_WARNING("-Wdeprecated-copy-with-dtor") \
JPH_CLANG_16_PLUS_SUPPRESS_WARNING("-Wunsafe-buffer-usage") \
JPH_IF_NOT_ANDROID(JPH_CLANG_SUPPRESS_WARNING("-Wimplicit-int-float-conversion")) \
\
JPH_GCC_SUPPRESS_WARNING("-Wcomment") \
JPH_GCC_SUPPRESS_WARNING("-Winvalid-offsetof") \
JPH_GCC_SUPPRESS_WARNING("-Wclass-memaccess") \
JPH_GCC_SUPPRESS_WARNING("-Wpedantic") \
JPH_GCC_SUPPRESS_WARNING("-Wunused-parameter") \
JPH_GCC_SUPPRESS_WARNING("-Wmaybe-uninitialized") \
\
JPH_MSVC_SUPPRESS_WARNING(4619) /* #pragma warning: there is no warning number 'XXXX' */ \
JPH_MSVC_SUPPRESS_WARNING(4514) /* 'X' : unreferenced inline function has been removed */ \
JPH_MSVC_SUPPRESS_WARNING(4710) /* 'X' : function not inlined */ \
JPH_MSVC_SUPPRESS_WARNING(4711) /* function 'X' selected for automatic inline expansion */ \
JPH_MSVC_SUPPRESS_WARNING(4714) /* function 'X' marked as __forceinline not inlined */ \
JPH_MSVC_SUPPRESS_WARNING(4820) /* 'X': 'Y' bytes padding added after data member 'Z' */ \
JPH_MSVC_SUPPRESS_WARNING(4100) /* 'X' : unreferenced formal parameter */ \
JPH_MSVC_SUPPRESS_WARNING(4626) /* 'X' : assignment operator was implicitly defined as deleted because a base class assignment operator is inaccessible or deleted */ \
JPH_MSVC_SUPPRESS_WARNING(5027) /* 'X' : move assignment operator was implicitly defined as deleted because a base class move assignment operator is inaccessible or deleted */ \
JPH_MSVC_SUPPRESS_WARNING(4365) /* 'argument' : conversion from 'X' to 'Y', signed / unsigned mismatch */ \
JPH_MSVC_SUPPRESS_WARNING(4324) /* 'X' : structure was padded due to alignment specifier */ \
JPH_MSVC_SUPPRESS_WARNING(4625) /* 'X' : copy constructor was implicitly defined as deleted because a base class copy constructor is inaccessible or deleted */ \
JPH_MSVC_SUPPRESS_WARNING(5026) /* 'X': move constructor was implicitly defined as deleted because a base class move constructor is inaccessible or deleted */ \
JPH_MSVC_SUPPRESS_WARNING(4623) /* 'X' : default constructor was implicitly defined as deleted */ \
JPH_MSVC_SUPPRESS_WARNING(4201) /* nonstandard extension used: nameless struct/union */ \
JPH_MSVC_SUPPRESS_WARNING(4371) /* 'X': layout of class may have changed from a previous version of the compiler due to better packing of member 'Y' */ \
JPH_MSVC_SUPPRESS_WARNING(5045) /* Compiler will insert Spectre mitigation for memory load if /Qspectre switch specified */ \
JPH_MSVC_SUPPRESS_WARNING(4583) /* 'X': destructor is not implicitly called */ \
JPH_MSVC_SUPPRESS_WARNING(4582) /* 'X': constructor is not implicitly called */ \
JPH_MSVC_SUPPRESS_WARNING(5219) /* implicit conversion from 'X' to 'Y', possible loss of data */ \
JPH_MSVC_SUPPRESS_WARNING(4826) /* Conversion from 'X *' to 'JPH::uint64' is sign-extended. This may cause unexpected runtime behavior. (32-bit) */ \
JPH_MSVC_SUPPRESS_WARNING(5264) /* 'X': 'const' variable is not used */ \
JPH_MSVC_SUPPRESS_WARNING(4251) /* class 'X' needs to have DLL-interface to be used by clients of class 'Y' */ \
JPH_MSVC_SUPPRESS_WARNING(4738) /* storing 32-bit float result in memory, possible loss of performance */ \
JPH_MSVC2019_SUPPRESS_WARNING(5246) /* the initialization of a subobject should be wrapped in braces */
// OS-specific includes
#if defined(JPH_PLATFORM_WINDOWS)
#define JPH_BREAKPOINT __debugbreak()
#elif defined(JPH_PLATFORM_BLUE)
// Configuration for a popular game console.
// This file is not distributed because it would violate an NDA.
// Creating one should only be a couple of minutes of work if you have the documentation for the platform
// (you only need to define JPH_BREAKPOINT, JPH_PLATFORM_BLUE_GET_TICKS, JPH_PLATFORM_BLUE_MUTEX*, JPH_PLATFORM_BLUE_RWLOCK*, JPH_PLATFORM_BLUE_SEMAPHORE* and include the right header).
#include <Jolt/Core/PlatformBlue.h>
#elif defined(JPH_PLATFORM_LINUX) || defined(JPH_PLATFORM_ANDROID) || defined(JPH_PLATFORM_MACOS) || defined(JPH_PLATFORM_IOS) || defined(JPH_PLATFORM_BSD)
#if defined(JPH_CPU_X86)
#define JPH_BREAKPOINT __asm volatile ("int $0x3")
#elif defined(JPH_CPU_ARM) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_E2K) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
#define JPH_BREAKPOINT __builtin_trap()
#else
#error Unknown CPU architecture
#endif
#elif defined(JPH_PLATFORM_WASM)
#define JPH_BREAKPOINT do { } while (false) // Not supported
#else
#error Unknown platform
#endif
// Begin the JPH namespace
#define JPH_NAMESPACE_BEGIN \
JPH_SUPPRESS_WARNING_PUSH \
JPH_SUPPRESS_WARNINGS \
namespace JPH {
// End the JPH namespace
#define JPH_NAMESPACE_END \
} \
JPH_SUPPRESS_WARNING_POP
// Suppress warnings generated by the standard template library
#define JPH_SUPPRESS_WARNINGS_STD_BEGIN \
JPH_SUPPRESS_WARNING_PUSH \
JPH_MSVC_SUPPRESS_WARNING(4365) \
JPH_MSVC_SUPPRESS_WARNING(4619) \
JPH_MSVC_SUPPRESS_WARNING(4710) \
JPH_MSVC_SUPPRESS_WARNING(4711) \
JPH_MSVC_SUPPRESS_WARNING(4820) \
JPH_MSVC_SUPPRESS_WARNING(4514) \
JPH_MSVC_SUPPRESS_WARNING(5262) \
JPH_MSVC_SUPPRESS_WARNING(5264) \
JPH_MSVC_SUPPRESS_WARNING(4738) \
JPH_MSVC_SUPPRESS_WARNING(5045)
#define JPH_SUPPRESS_WARNINGS_STD_END \
JPH_SUPPRESS_WARNING_POP
// MSVC STL requires _HAS_EXCEPTIONS=0 if exceptions are turned off
#if defined(JPH_COMPILER_MSVC) && (!defined(__cpp_exceptions) || !__cpp_exceptions) && !defined(_HAS_EXCEPTIONS)
#define _HAS_EXCEPTIONS 0
#endif
// Standard C++ includes
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <float.h>
#include <limits.h>
#include <string.h>
#include <new>
#include <utility>
#include <cmath>
#include <sstream>
#include <functional>
#include <algorithm>
#include <cstdint>
#include <type_traits>
#if defined(JPH_COMPILER_MSVC) || (defined(JPH_COMPILER_CLANG) && defined(_MSC_VER)) // MSVC or clang-cl
#include <malloc.h> // for alloca
#endif
#if defined(JPH_USE_SSE)
#include <immintrin.h>
#elif defined(JPH_USE_NEON)
#ifdef JPH_COMPILER_MSVC
#include <intrin.h>
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// Commonly used STL types
using std::min;
using std::max;
using std::abs;
using std::sqrt;
using std::ceil;
using std::floor;
using std::trunc;
using std::round;
using std::fmod;
using std::string_view;
using std::function;
using std::numeric_limits;
using std::isfinite;
using std::isnan;
using std::ostream;
using std::istream;
// Standard types
using uint = unsigned int;
using uint8 = std::uint8_t;
using uint16 = std::uint16_t;
using uint32 = std::uint32_t;
using uint64 = std::uint64_t;
// Assert sizes of types
static_assert(sizeof(uint) >= 4, "Invalid size of uint");
static_assert(sizeof(uint8) == 1, "Invalid size of uint8");
static_assert(sizeof(uint16) == 2, "Invalid size of uint16");
static_assert(sizeof(uint32) == 4, "Invalid size of uint32");
static_assert(sizeof(uint64) == 8, "Invalid size of uint64");
// Determine if we want extra debugging code to be active
#if !defined(NDEBUG) && !defined(JPH_NO_DEBUG)
#define JPH_DEBUG
#endif
// Define inline macro
#if defined(JPH_NO_FORCE_INLINE)
#define JPH_INLINE inline
#elif defined(JPH_COMPILER_CLANG)
#define JPH_INLINE __inline__ __attribute__((always_inline))
#elif defined(JPH_COMPILER_GCC)
// On gcc 14 using always_inline in debug mode causes error: "inlining failed in call to 'always_inline' 'XXX': function not considered for inlining"
// See: https://github.com/jrouwe/JoltPhysics/issues/1096
#if __GNUC__ >= 14 && defined(JPH_DEBUG)
#define JPH_INLINE inline
#else
#define JPH_INLINE __inline__ __attribute__((always_inline))
#endif
#elif defined(JPH_COMPILER_MSVC)
#define JPH_INLINE __forceinline
#else
#error Undefined
#endif
// Default memory allocation alignment.
// This define can be overridden in case the user provides an Allocate function that has a different alignment than the platform default.
#ifndef JPH_DEFAULT_ALLOCATE_ALIGNMENT
#define JPH_DEFAULT_ALLOCATE_ALIGNMENT __STDCPP_DEFAULT_NEW_ALIGNMENT__
#endif
// Cache line size (used for aligning to cache line)
#ifndef JPH_CACHE_LINE_SIZE
#define JPH_CACHE_LINE_SIZE 64
#endif
// Define macro to get current function name
#if defined(JPH_COMPILER_CLANG) || defined(JPH_COMPILER_GCC)
#define JPH_FUNCTION_NAME __PRETTY_FUNCTION__
#elif defined(JPH_COMPILER_MSVC)
#define JPH_FUNCTION_NAME __FUNCTION__
#else
#error Undefined
#endif
// Stack allocation
#define JPH_STACK_ALLOC(n) alloca(n)
// Shorthand for #ifdef JPH_DEBUG / #endif
#ifdef JPH_DEBUG
#define JPH_IF_DEBUG(...) __VA_ARGS__
#define JPH_IF_NOT_DEBUG(...)
#else
#define JPH_IF_DEBUG(...)
#define JPH_IF_NOT_DEBUG(...) __VA_ARGS__
#endif
// Shorthand for #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED / #endif
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#define JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(...) __VA_ARGS__
#else
#define JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(...)
#endif
// Helper macros to detect if we're running in single or double precision mode
#ifdef JPH_DOUBLE_PRECISION
#define JPH_IF_SINGLE_PRECISION(...)
#define JPH_IF_SINGLE_PRECISION_ELSE(s, d) d
#define JPH_IF_DOUBLE_PRECISION(...) __VA_ARGS__
#else
#define JPH_IF_SINGLE_PRECISION(...) __VA_ARGS__
#define JPH_IF_SINGLE_PRECISION_ELSE(s, d) s
#define JPH_IF_DOUBLE_PRECISION(...)
#endif
// Helper macro to detect if the debug renderer is active
#ifdef JPH_DEBUG_RENDERER
#define JPH_IF_DEBUG_RENDERER(...) __VA_ARGS__
#define JPH_IF_NOT_DEBUG_RENDERER(...)
#else
#define JPH_IF_DEBUG_RENDERER(...)
#define JPH_IF_NOT_DEBUG_RENDERER(...) __VA_ARGS__
#endif
// Macro to indicate that a parameter / variable is unused
#define JPH_UNUSED(x) (void)x
// Macro to enable floating point precise mode and to disable fused multiply add instructions
#if defined(JPH_COMPILER_GCC) || defined(JPH_CROSS_PLATFORM_DETERMINISTIC)
// We compile without -ffast-math and -ffp-contract=fast, so we don't need to disable anything
#define JPH_PRECISE_MATH_ON
#define JPH_PRECISE_MATH_OFF
#elif defined(JPH_COMPILER_CLANG)
// We compile without -ffast-math because pragma float_control(precise, on) doesn't seem to actually negate all of the -ffast-math effects and causes the unit tests to fail (even if the pragma is added to all files)
// On clang 14 and later we can turn off float contraction through a pragma (before it was buggy), so if FMA is on we can disable it through this macro
#if (defined(JPH_CPU_ARM) && !defined(JPH_PLATFORM_ANDROID) && __clang_major__ >= 16) || (defined(JPH_CPU_X86) && __clang_major__ >= 14)
#define JPH_PRECISE_MATH_ON \
_Pragma("float_control(precise, on, push)") \
_Pragma("clang fp contract(off)")
#define JPH_PRECISE_MATH_OFF \
_Pragma("float_control(pop)")
#elif __clang_major__ >= 14 && (defined(JPH_USE_FMADD) || defined(FP_FAST_FMA))
#define JPH_PRECISE_MATH_ON \
_Pragma("clang fp contract(off)")
#define JPH_PRECISE_MATH_OFF \
_Pragma("clang fp contract(on)")
#else
#define JPH_PRECISE_MATH_ON
#define JPH_PRECISE_MATH_OFF
#endif
#elif defined(JPH_COMPILER_MSVC)
// Unfortunately there is no way to push the state of fp_contract, so we have to assume it was turned on before JPH_PRECISE_MATH_ON
#define JPH_PRECISE_MATH_ON \
__pragma(float_control(precise, on, push)) \
__pragma(fp_contract(off))
#define JPH_PRECISE_MATH_OFF \
__pragma(fp_contract(on)) \
__pragma(float_control(pop))
#else
#error Undefined
#endif
// Check if Thread Sanitizer is enabled
#ifdef __has_feature
#if __has_feature(thread_sanitizer)
#define JPH_TSAN_ENABLED
#endif
#else
#ifdef __SANITIZE_THREAD__
#define JPH_TSAN_ENABLED
#endif
#endif
// Attribute to disable Thread Sanitizer for a particular function
#ifdef JPH_TSAN_ENABLED
#define JPH_TSAN_NO_SANITIZE __attribute__((no_sanitize("thread")))
#else
#define JPH_TSAN_NO_SANITIZE
#endif
// DirectX 12 is only supported on Windows
#if defined(JPH_USE_DX12) && !defined(JPH_PLATFORM_WINDOWS)
#undef JPH_USE_DX12
#endif // JPH_PLATFORM_WINDOWS
// Metal is only supported on Apple platforms
#if defined(JPH_USE_METAL) && !defined(JPH_PLATFORM_MACOS) && !defined(JPH_PLATFORM_IOS)
#undef JPH_USE_METAL
#endif // !JPH_PLATFORM_MACOS && !JPH_PLATFORM_IOS
JPH_NAMESPACE_END

View File

@ -0,0 +1,143 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/NonCopyable.h>
JPH_NAMESPACE_BEGIN
#if defined(JPH_CPU_WASM)
// Not supported
#elif defined(JPH_USE_SSE)
/// Helper class that needs to be put on the stack to update the state of the floating point control word.
/// This state is kept per thread.
///
/// RAII wrapper around the SSE MXCSR control/status register: the constructor overwrites the bits
/// selected by Mask with Value, the destructor restores those bits to what they were at construction.
template <uint Value, uint Mask>
class FPControlWord : public NonCopyable
{
public:
FPControlWord()
{
// Save the current MXCSR and replace the bits selected by Mask with Value
mPrevState = _mm_getcsr();
_mm_setcsr((mPrevState & ~Mask) | Value);
}
~FPControlWord()
{
// Restore only the bits selected by Mask; bits outside the mask keep any changes made since construction
_mm_setcsr((_mm_getcsr() & ~Mask) | (mPrevState & Mask));
}
private:
uint mPrevState; ///< MXCSR value captured at construction time
};
#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
/// Helper class that needs to be put on the stack to update the state of the floating point control word.
/// This state is kept per thread.
///
/// RAII wrapper around the MSVC CRT floating point control word (_controlfp_s): the constructor
/// overwrites the bits selected by Mask with Value, the destructor restores those bits on scope exit.
template <unsigned int Value, unsigned int Mask>
class FPControlWord : public NonCopyable
{
public:
FPControlWord()
{
// Read state before change (a zero mask means _controlfp_s only reads the current control word)
_controlfp_s(&mPrevState, 0, 0);
// Update the state: apply Value to the bits selected by Mask
unsigned int dummy;
_controlfp_s(&dummy, Value, Mask);
}
~FPControlWord()
{
// Restore state: write the saved bits back, again touching only the bits selected by Mask
unsigned int dummy;
_controlfp_s(&dummy, mPrevState, Mask);
}
private:
unsigned int mPrevState; ///< Control word captured at construction time
};
#elif defined(JPH_CPU_ARM) && defined(JPH_USE_NEON)
/// Helper class that needs to be put on the stack to update the state of the floating point control word.
/// This state is kept per thread.
///
/// RAII wrapper around the AArch64 FPCR (Floating-point Control Register), accessed through
/// mrs/msr inline assembly: the constructor overwrites the bits selected by Mask with Value,
/// the destructor restores those bits on scope exit.
template <uint64 Value, uint64 Mask>
class FPControlWord : public NonCopyable
{
public:
FPControlWord()
{
// Read the current FPCR and remember it, then apply Value to the bits selected by Mask
uint64 val;
asm volatile("mrs %0, fpcr" : "=r" (val));
mPrevState = val;
val &= ~Mask;
val |= Value;
asm volatile("msr fpcr, %0" : /* no output */ : "r" (val));
}
~FPControlWord()
{
// Read-modify-write again so that only the bits selected by Mask are restored;
// other bits keep any changes made since construction
uint64 val;
asm volatile("mrs %0, fpcr" : "=r" (val));
val &= ~Mask;
val |= mPrevState & Mask;
asm volatile("msr fpcr, %0" : /* no output */ : "r" (val));
}
private:
uint64 mPrevState; ///< FPCR value captured at construction time
};
#elif defined(JPH_CPU_ARM)
/// Helper class that needs to be put on the stack to update the state of the floating point control word.
/// This state is kept per thread.
///
/// RAII wrapper around the 32-bit ARM FPSCR (Floating-point Status and Control Register),
/// accessed through vmrs/vmsr inline assembly: the constructor overwrites the bits selected
/// by Mask with Value, the destructor restores those bits on scope exit.
template <uint32 Value, uint32 Mask>
class FPControlWord : public NonCopyable
{
public:
FPControlWord()
{
// Read the current FPSCR and remember it, then apply Value to the bits selected by Mask
uint32 val;
asm volatile("vmrs %0, fpscr" : "=r" (val));
mPrevState = val;
val &= ~Mask;
val |= Value;
asm volatile("vmsr fpscr, %0" : /* no output */ : "r" (val));
}
~FPControlWord()
{
// Read-modify-write again so that only the bits selected by Mask are restored;
// other bits keep any changes made since construction
uint32 val;
asm volatile("vmrs %0, fpscr" : "=r" (val));
val &= ~Mask;
val |= mPrevState & Mask;
asm volatile("vmsr fpscr, %0" : /* no output */ : "r" (val));
}
private:
uint32 mPrevState; ///< FPSCR value captured at construction time
};
#elif defined(JPH_CPU_RISCV)
// RISC-V only implements manually checking if exceptions occurred by reading the fcsr register. It doesn't generate exceptions.
#elif defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
// Not implemented right now
#else
#error Unsupported CPU architecture
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,96 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/FPControlWord.h>
JPH_NAMESPACE_BEGIN
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#if defined(JPH_CPU_WASM)
// Not supported
class FPExceptionsEnable { };
class FPExceptionDisableInvalid { };
class FPExceptionDisableDivByZero { };
class FPExceptionDisableOverflow { };
#elif defined(JPH_USE_SSE)
/// Enable floating point divide by zero exception, overflow exceptions and exceptions on invalid numbers
class FPExceptionsEnable : public FPControlWord<0, _MM_MASK_DIV_ZERO | _MM_MASK_INVALID | _MM_MASK_OVERFLOW> { };
/// Disable invalid floating point value exceptions
class FPExceptionDisableInvalid : public FPControlWord<_MM_MASK_INVALID, _MM_MASK_INVALID> { };
/// Disable division by zero floating point exceptions
class FPExceptionDisableDivByZero : public FPControlWord<_MM_MASK_DIV_ZERO, _MM_MASK_DIV_ZERO> { };
/// Disable floating point overflow exceptions
class FPExceptionDisableOverflow : public FPControlWord<_MM_MASK_OVERFLOW, _MM_MASK_OVERFLOW> { };
#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
/// Enable floating point divide by zero exception, overflow exceptions and exceptions on invalid numbers
class FPExceptionsEnable : public FPControlWord<0, _EM_INVALID | _EM_ZERODIVIDE | _EM_OVERFLOW> { };
/// Disable invalid floating point value exceptions
class FPExceptionDisableInvalid : public FPControlWord<_EM_INVALID, _EM_INVALID> { };
/// Disable division by zero floating point exceptions
class FPExceptionDisableDivByZero : public FPControlWord<_EM_ZERODIVIDE, _EM_ZERODIVIDE> { };
/// Disable floating point overflow exceptions
class FPExceptionDisableOverflow : public FPControlWord<_EM_OVERFLOW, _EM_OVERFLOW> { };
#elif defined(JPH_CPU_ARM)
/// Invalid operation exception bit
static constexpr uint64 FP_IOE = 1 << 8;
/// Enable divide by zero exception bit
static constexpr uint64 FP_DZE = 1 << 9;
/// Enable floating point overflow bit
static constexpr uint64 FP_OFE = 1 << 10;
/// Enable floating point divide by zero exception, overflow exceptions and exceptions on invalid numbers
class FPExceptionsEnable : public FPControlWord<FP_IOE | FP_DZE | FP_OFE, FP_IOE | FP_DZE | FP_OFE> { };
/// Disable invalid floating point value exceptions
class FPExceptionDisableInvalid : public FPControlWord<0, FP_IOE> { };
/// Disable division by zero floating point exceptions
class FPExceptionDisableDivByZero : public FPControlWord<0, FP_DZE> { };
/// Disable floating point overflow exceptions
class FPExceptionDisableOverflow : public FPControlWord<0, FP_OFE> { };
#elif defined(JPH_CPU_RISCV)
#error "RISC-V only implements manually checking if exceptions occurred by reading the fcsr register. It doesn't generate exceptions. JPH_FLOATING_POINT_EXCEPTIONS_ENABLED must be disabled."
#elif defined(JPH_CPU_PPC)
#error PowerPC floating point exception handling to be implemented. JPH_FLOATING_POINT_EXCEPTIONS_ENABLED must be disabled.
#else
#error Unsupported CPU architecture
#endif
#else
/// Dummy implementations
class FPExceptionsEnable { };
class FPExceptionDisableInvalid { };
class FPExceptionDisableDivByZero { };
class FPExceptionDisableOverflow { };
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,43 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/FPControlWord.h>
JPH_NAMESPACE_BEGIN
#if defined(JPH_CPU_WASM) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
// Not supported
class FPFlushDenormals { };
#elif defined(JPH_USE_SSE)
/// Helper class that needs to be put on the stack to enable flushing denormals to zero
/// This can make floating point operations much faster when working with very small numbers
class FPFlushDenormals : public FPControlWord<_MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_MASK> { };
#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
/// Helper class that needs to be put on the stack to enable flushing denormals to zero
/// This can make floating point operations much faster when working with very small numbers
class FPFlushDenormals : public FPControlWord<_DN_FLUSH, _MCW_DN> { };
#elif defined(JPH_CPU_ARM)
/// Flush denormals to zero bit
static constexpr uint64 FP_FZ = 1 << 24;
/// Helper class that needs to be put on the stack to enable flushing denormals to zero
/// This can make floating point operations much faster when working with very small numbers
class FPFlushDenormals : public FPControlWord<FP_FZ, FP_FZ> { };
#else
#error Unsupported CPU architecture
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,92 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/Factory.h>
JPH_NAMESPACE_BEGIN
Factory *Factory::sInstance = nullptr;
void *Factory::CreateObject(const char *inName)
{
const RTTI *ci = Find(inName);
return ci != nullptr? ci->CreateObject() : nullptr;
}
// Look up type info by class name; returns null for unregistered names
const RTTI *Factory::Find(const char *inName)
{
	auto entry = mClassNameMap.find(inName);
	if (entry == mClassNameMap.end())
		return nullptr;
	return entry->second;
}
// Look up type info by class hash; returns null for unregistered hashes
const RTTI *Factory::Find(uint32 inHash)
{
	auto entry = mClassHashMap.find(inHash);
	if (entry == mClassHashMap.end())
		return nullptr;
	return entry->second;
}
// Register a single type with the factory, recursively registering its base classes
// (and, when object streaming is compiled in, the types of its attributes).
// Returns false when a hash collision or a recursive registration fails.
// NOTE(review): on failure the name/hash maps may be left partially populated — confirm callers treat a false return as fatal.
bool Factory::Register(const RTTI *inRTTI)
{
// Check if we already know the type
if (Find(inRTTI->GetName()) != nullptr)
return true;
// Insert this class by name
mClassNameMap.try_emplace(inRTTI->GetName(), inRTTI);
// Insert this class by hash; try_emplace fails when another type already registered the same hash
if (!mClassHashMap.try_emplace(inRTTI->GetHash(), inRTTI).second)
{
JPH_ASSERT(false, "Hash collision registering type!");
return false;
}
// Register base classes (recursive; the Find check above terminates the recursion for already-known types)
for (int i = 0; i < inRTTI->GetBaseClassCount(); ++i)
if (!Register(inRTTI->GetBaseClass(i)))
return false;
#ifdef JPH_OBJECT_STREAM
// Register attribute classes so deserialization can construct member types too
for (int i = 0; i < inRTTI->GetAttributeCount(); ++i)
{
const RTTI *rtti = inRTTI->GetAttribute(i).GetMemberPrimitiveType();
if (rtti != nullptr && !Register(rtti))
return false;
}
#endif // JPH_OBJECT_STREAM
return true;
}
bool Factory::Register(const RTTI **inRTTIs, uint inNumber)
{
mClassHashMap.reserve(mClassHashMap.size() + inNumber);
mClassNameMap.reserve(mClassNameMap.size() + inNumber);
for (const RTTI **rtti = inRTTIs; rtti < inRTTIs + inNumber; ++rtti)
if (!Register(*rtti))
return false;
return true;
}
/// Forget every registered type (both lookup tables are emptied)
void Factory::Clear()
{
	mClassHashMap.clear();
	mClassNameMap.clear();
}
/// Collect the type info pointers of every registered class
Array<const RTTI *> Factory::GetAllClasses() const
{
	Array<const RTTI *> result;
	result.reserve(mClassNameMap.size());
	for (const auto &entry : mClassNameMap)
		result.push_back(entry.second);
	return result;
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,54 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/RTTI.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_NAMESPACE_BEGIN
/// This class is responsible for creating instances of classes based on their name or hash and is mainly used for deserialization of saved data.
class JPH_EXPORT Factory
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Create an object. Returns null when the named type has not been registered.
	void * CreateObject(const char *inName);

	/// Find type info for a specific class by name. Returns null when not registered.
	const RTTI * Find(const char *inName);

	/// Find type info for a specific class by hash. Returns null when not registered.
	const RTTI * Find(uint32 inHash);

	/// Register an object with the factory. Returns false on failure.
	bool Register(const RTTI *inRTTI);

	/// Register a list of objects with the factory. Returns false on failure.
	bool Register(const RTTI **inRTTIs, uint inNumber);

	/// Unregisters all types
	void Clear();

	/// Get all registered classes
	Array<const RTTI *> GetAllClasses() const;

	/// Singleton factory instance
	static Factory * sInstance;

private:
	using ClassNameMap = UnorderedMap<string_view, const RTTI *>;
	using ClassHashMap = UnorderedMap<uint32, const RTTI *>;

	/// Map of class names to type info
	ClassNameMap mClassNameMap;

	/// Map of class hash to type info
	ClassHashMap mClassHashMap;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,122 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/Mutex.h>
#include <Jolt/Core/Atomics.h>
JPH_NAMESPACE_BEGIN
/// Class that allows lock free creation / destruction of objects (unless a new page of objects needs to be allocated)
/// It contains a fixed pool of objects and also allows batching up a lot of objects to be destroyed
/// and doing the actual free in a single atomic operation
template <typename Object>
class FixedSizeFreeList : public NonCopyable
{
private:
	/// Storage type for an Object
	struct ObjectStorage
	{
		/// The object we're storing
		Object mObject;

		/// When the object is freed (or in the process of being freed as a batch) this will contain the next free object
		/// When an object is in use it will contain the object's index in the free list
		atomic<uint32> mNextFreeObject;
	};

	static_assert(alignof(ObjectStorage) == alignof(Object), "Object not properly aligned");

	/// Access the object storage given the object index (high bits select the page, low bits the slot within the page)
	const ObjectStorage & GetStorage(uint32 inObjectIndex) const { return mPages[inObjectIndex >> mPageShift][inObjectIndex & mObjectMask]; }
	ObjectStorage & GetStorage(uint32 inObjectIndex) { return mPages[inObjectIndex >> mPageShift][inObjectIndex & mObjectMask]; }

	/// Size (in objects) of a single page
	uint32 mPageSize;

	/// Number of bits to shift an object index to the right to get the page number
	uint32 mPageShift;

	/// Mask to and an object index with to get the index within the page
	uint32 mObjectMask;

	/// Total number of pages that are usable
	uint32 mNumPages;

	/// Total number of objects that have been allocated
	uint32 mNumObjectsAllocated;

	/// Array of pages of objects
	ObjectStorage ** mPages = nullptr;

	/// Mutex that is used to allocate a new page if the storage runs out
	/// This variable is aligned to the cache line to prevent false sharing with
	/// the constants used to index into the list via `Get()`.
	alignas(JPH_CACHE_LINE_SIZE) Mutex mPageMutex;

	/// Number of objects that we currently have in the free list / new pages (tracked only for assertions)
#ifdef JPH_ENABLE_ASSERTS
	atomic<uint32> mNumFreeObjects;
#endif // JPH_ENABLE_ASSERTS

	/// Simple counter that makes the first free object pointer update with every CAS so that we don't suffer from the ABA problem
	atomic<uint32> mAllocationTag;

	/// Index of first free object, the first 32 bits of an object are used to point to the next free object
	atomic<uint64> mFirstFreeObjectAndTag;

	/// The first free object to use when the free list is empty (may need to allocate a new page)
	atomic<uint32> mFirstFreeObjectInNewPage;

public:
	/// Invalid index
	static const uint32 cInvalidObjectIndex = 0xffffffff;

	/// Size of an object + bookkeeping for the freelist
	static const int ObjectStorageSize = sizeof(ObjectStorage);

	/// Destructor
	inline ~FixedSizeFreeList();

	/// Initialize the free list, up to inMaxObjects can be allocated
	inline void Init(uint inMaxObjects, uint inPageSize);

	/// Lockless construct a new object, inParameters are passed on to the constructor
	template <typename... Parameters>
	inline uint32 ConstructObject(Parameters &&... inParameters);

	/// Lockless destruct an object and return it to the free pool
	inline void DestructObject(uint32 inObjectIndex);

	/// Lockless destruct an object and return it to the free pool
	inline void DestructObject(Object *inObject);

	/// A batch of objects that can be destructed
	struct Batch
	{
		uint32 mFirstObjectIndex = cInvalidObjectIndex;
		uint32 mLastObjectIndex = cInvalidObjectIndex;
		uint32 mNumObjects = 0;
	};

	/// Add a object to an existing batch to be destructed.
	/// Adding objects to a batch does not destroy or modify the objects, this will merely link them
	/// so that the entire batch can be returned to the free list in a single atomic operation
	inline void AddObjectToBatch(Batch &ioBatch, uint32 inObjectIndex);

	/// Lockless destruct batch of objects
	inline void DestructObjectBatch(Batch &ioBatch);

	/// Access an object by index.
	inline Object & Get(uint32 inObjectIndex) { return GetStorage(inObjectIndex).mObject; }

	/// Access an object by index.
	inline const Object & Get(uint32 inObjectIndex) const { return GetStorage(inObjectIndex).mObject; }
};
JPH_NAMESPACE_END
#include "FixedSizeFreeList.inl"

View File

@ -0,0 +1,215 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
template <typename Object>
FixedSizeFreeList<Object>::~FixedSizeFreeList()
{
	// Check if we got our Init call
	if (mPages != nullptr)
	{
		// Ensure everything is freed before the freelist is destructed
		JPH_ASSERT(mNumFreeObjects.load(memory_order_relaxed) == mNumPages * mPageSize);

		// Free memory for pages; only the pages actually allocated so far exist
		// (pages are created lazily in ConstructObject)
		uint32 num_pages = mNumObjectsAllocated / mPageSize;
		for (uint32 page = 0; page < num_pages; ++page)
			AlignedFree(mPages[page]);

		// Free the page table itself (allocated with Allocate in Init, hence Free not AlignedFree)
		Free(mPages);
	}
}
template <typename Object>
void FixedSizeFreeList<Object>::Init(uint inMaxObjects, uint inPageSize)
{
	// Check sanity: page size must be a power of 2 so shift/mask indexing in GetStorage works
	JPH_ASSERT(inPageSize > 0 && IsPowerOf2(inPageSize));
	JPH_ASSERT(mPages == nullptr);

	// Store configuration parameters
	mNumPages = (inMaxObjects + inPageSize - 1) / inPageSize; // Round up so inMaxObjects fit
	mPageSize = inPageSize;
	mPageShift = CountTrailingZeros(inPageSize); // log2(page size)
	mObjectMask = inPageSize - 1;
	JPH_IF_ENABLE_ASSERTS(mNumFreeObjects = mNumPages * inPageSize;)

	// Allocate page table (pages themselves are allocated lazily in ConstructObject)
	mPages = reinterpret_cast<ObjectStorage **>(Allocate(mNumPages * sizeof(ObjectStorage *)));

	// We didn't yet use any objects of any page
	mNumObjectsAllocated = 0;
	mFirstFreeObjectInNewPage = 0;

	// Start with 1 as the first tag
	mAllocationTag = 1;

	// Set first free object (with tag 0)
	mFirstFreeObjectAndTag = cInvalidObjectIndex;
}
template <typename Object>
template <typename... Parameters>
uint32 FixedSizeFreeList<Object>::ConstructObject(Parameters &&... inParameters)
{
	// Lock-free allocation: retry the CAS loop until we either claim a free-list entry,
	// claim a never-used slot, or run out of space
	for (;;)
	{
		// Get first object from the linked list
		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
		uint32 first_free = uint32(first_free_object_and_tag);
		if (first_free == cInvalidObjectIndex)
		{
			// The free list is empty, we take an object from the page that has never been used before
			first_free = mFirstFreeObjectInNewPage.fetch_add(1, memory_order_relaxed);
			if (first_free >= mNumObjectsAllocated)
			{
				// Allocate new page (the only code path that takes a lock)
				lock_guard lock(mPageMutex);
				while (first_free >= mNumObjectsAllocated)
				{
					uint32 next_page = mNumObjectsAllocated / mPageSize;
					if (next_page == mNumPages)
						return cInvalidObjectIndex; // Out of space!
					mPages[next_page] = reinterpret_cast<ObjectStorage *>(AlignedAllocate(mPageSize * sizeof(ObjectStorage), max<size_t>(alignof(ObjectStorage), JPH_CACHE_LINE_SIZE)));
					mNumObjectsAllocated += mPageSize;
				}
			}

			// Allocation successful
			JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_sub(1, memory_order_relaxed);)
			ObjectStorage &storage = GetStorage(first_free);
			new (&storage.mObject) Object(std::forward<Parameters>(inParameters)...);
			// While in use, mNextFreeObject stores the object's own index (used by DestructObject(Object *))
			storage.mNextFreeObject.store(first_free, memory_order_release);
			return first_free;
		}
		else
		{
			// Load next pointer
			uint32 new_first_free = GetStorage(first_free).mNextFreeObject.load(memory_order_acquire);

			// Construct a new first free object tag (the tag in the upper 32 bits guards against ABA)
			uint64 new_first_free_object_and_tag = uint64(new_first_free) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);

			// Compare and swap
			if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
			{
				// Allocation successful
				JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_sub(1, memory_order_relaxed);)
				ObjectStorage &storage = GetStorage(first_free);
				new (&storage.mObject) Object(std::forward<Parameters>(inParameters)...);
				// While in use, mNextFreeObject stores the object's own index (used by DestructObject(Object *))
				storage.mNextFreeObject.store(first_free, memory_order_release);
				return first_free;
			}
		}
	}
}
template <typename Object>
void FixedSizeFreeList<Object>::AddObjectToBatch(Batch &ioBatch, uint32 inObjectIndex)
{
	JPH_ASSERT(ioBatch.mNumObjects != uint32(-1), "Trying to reuse a batch that has already been freed");

	// Reset next index. A live object stores its own index here (set by ConstructObject),
	// which is what the second assert below checks.
	atomic<uint32> &next_free_object = GetStorage(inObjectIndex).mNextFreeObject;
	JPH_ASSERT(next_free_object.load(memory_order_relaxed) == inObjectIndex, "Trying to add a object to the batch that is already in a free list");
	next_free_object.store(cInvalidObjectIndex, memory_order_release);

	// Link object in batch to free: append at the tail of the batch's singly linked list
	if (ioBatch.mFirstObjectIndex == cInvalidObjectIndex)
		ioBatch.mFirstObjectIndex = inObjectIndex;
	else
		GetStorage(ioBatch.mLastObjectIndex).mNextFreeObject.store(inObjectIndex, memory_order_release);
	ioBatch.mLastObjectIndex = inObjectIndex;
	ioBatch.mNumObjects++;
}
template <typename Object>
void FixedSizeFreeList<Object>::DestructObjectBatch(Batch &ioBatch)
{
	if (ioBatch.mFirstObjectIndex != cInvalidObjectIndex)
	{
		// Call destructors (skipped entirely when Object is trivially destructible)
		if constexpr (!std::is_trivially_destructible<Object>())
		{
			uint32 object_idx = ioBatch.mFirstObjectIndex;
			do
			{
				ObjectStorage &storage = GetStorage(object_idx);
				storage.mObject.~Object();
				object_idx = storage.mNextFreeObject.load(memory_order_relaxed);
			}
			while (object_idx != cInvalidObjectIndex);
		}

		// Add to objects free list: splice the whole batch in with a single CAS
		ObjectStorage &storage = GetStorage(ioBatch.mLastObjectIndex);
		for (;;)
		{
			// Get first object from the list
			uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
			uint32 first_free = uint32(first_free_object_and_tag);

			// Make it the next pointer of the last object in the batch that is to be freed
			storage.mNextFreeObject.store(first_free, memory_order_release);

			// Construct a new first free object tag (upper 32 bits change every CAS to avoid ABA)
			uint64 new_first_free_object_and_tag = uint64(ioBatch.mFirstObjectIndex) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);

			// Compare and swap
			if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
			{
				// Free successful
				JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_add(ioBatch.mNumObjects, memory_order_relaxed);)

				// Mark the batch as freed so reuse trips the assert in AddObjectToBatch
#ifdef JPH_ENABLE_ASSERTS
				ioBatch.mNumObjects = uint32(-1);
#endif
				return;
			}
		}
	}
}
template <typename Object>
void FixedSizeFreeList<Object>::DestructObject(uint32 inObjectIndex)
{
	JPH_ASSERT(inObjectIndex != cInvalidObjectIndex);

	// Call destructor
	ObjectStorage &storage = GetStorage(inObjectIndex);
	storage.mObject.~Object();

	// Add to object free list: lock-free push onto the head of the free list
	for (;;)
	{
		// Get first object from the list
		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
		uint32 first_free = uint32(first_free_object_and_tag);

		// Make it the next pointer of the last object in the batch that is to be freed
		storage.mNextFreeObject.store(first_free, memory_order_release);

		// Construct a new first free object tag (upper 32 bits change every CAS to avoid ABA)
		uint64 new_first_free_object_and_tag = uint64(inObjectIndex) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);

		// Compare and swap
		if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
		{
			// Free successful
			JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_add(1, memory_order_relaxed);)
			return;
		}
	}
}
template<typename Object>
inline void FixedSizeFreeList<Object>::DestructObject(Object *inObject)
{
	// While an object is live, its mNextFreeObject slot holds the object's own index
	// (set by ConstructObject), so we can recover the index from the enclosing storage
	uint32 index = reinterpret_cast<ObjectStorage *>(inObject)->mNextFreeObject.load(memory_order_relaxed);
	JPH_ASSERT(index < mNumObjectsAllocated);
	DestructObject(index);
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,234 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Implements the FNV-1a hash algorithm
/// @see https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
/// @param inData Data block of bytes
/// @param inSize Number of bytes
/// @param inSeed Seed of the hash (can be used to pass in the hash of a previous operation, otherwise leave default)
/// @return Hash
inline uint64 HashBytes(const void *inData, uint inSize, uint64 inSeed = 0xcbf29ce484222325UL)
{
	const uint8 *bytes = reinterpret_cast<const uint8 *>(inData);
	uint64 hash = inSeed;
	for (uint i = 0; i < inSize; ++i)
	{
		// XOR in the next byte, then multiply by the 64-bit FNV prime
		hash ^= uint64(bytes[i]);
		hash *= 0x100000001b3UL;
	}
	return hash;
}
/// Calculate the FNV-1a hash of inString.
/// @see https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
constexpr uint64 HashString(const char *inString, uint64 inSeed = 0xcbf29ce484222325UL)
{
	uint64 hash = inSeed;
	while (*inString != 0)
	{
		// XOR in the next character, then multiply by the 64-bit FNV prime
		hash ^= uint64(*inString);
		hash *= 0x100000001b3UL;
		++inString;
	}
	return hash;
}
/// A 64 bit hash function by Thomas Wang, Jan 1997
/// See: http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
/// @param inValue Value to hash
/// @return Hash
inline uint64 Hash64(uint64 inValue)
{
	// Fixed sequence of invertible shift/add/xor mixing steps; each step depends on the previous
	uint64 hash = inValue;
	hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1;
	hash = hash ^ (hash >> 24);
	hash = (hash + (hash << 3)) + (hash << 8); // hash * 265
	hash = hash ^ (hash >> 14);
	hash = (hash + (hash << 2)) + (hash << 4); // hash * 21
	hash = hash ^ (hash >> 28);
	hash = hash + (hash << 31);
	return hash;
}
/// Fallback hash function that calls T::GetHash()
/// (used for any type without a more specific Hash specialization below)
template <class T>
struct Hash
{
	uint64 operator () (const T &inValue) const
	{
		return inValue.GetHash();
	}
};
/// A hash function for floats
template <>
struct Hash<float>
{
	uint64 operator () (float inValue) const
	{
		// Canonicalize -0.0f to +0.0f so both representations (which compare equal) hash the same
		float canonical = inValue;
		if (canonical == 0.0f)
			canonical = 0.0f;
		return HashBytes(&canonical, sizeof(canonical));
	}
};
/// A hash function for doubles
template <>
struct Hash<double>
{
	uint64 operator () (double inValue) const
	{
		// Canonicalize -0.0 to +0.0 so both representations (which compare equal) hash the same
		double canonical = inValue;
		if (canonical == 0.0)
			canonical = 0.0;
		return HashBytes(&canonical, sizeof(canonical));
	}
};
/// A hash function for character pointers
/// Hashes the string contents (via HashString), not the pointer value
template <>
struct Hash<const char *>
{
	uint64 operator () (const char *inValue) const
	{
		return HashString(inValue);
	}
};
/// A hash function for std::string_view
/// Hashes the viewed character data with FNV-1a (HashBytes)
template <>
struct Hash<std::string_view>
{
	uint64 operator () (const std::string_view &inValue) const
	{
		return HashBytes(inValue.data(), uint(inValue.size()));
	}
};
/// A hash function for String
/// Hashes the character data with FNV-1a (HashBytes), same as the string_view specialization
template <>
struct Hash<String>
{
	uint64 operator () (const String &inValue) const
	{
		return HashBytes(inValue.data(), uint(inValue.size()));
	}
};
/// A fallback function for generic pointers
/// Note: hashes the pointer value (the address) itself, not the pointee
template <class T>
struct Hash<T *>
{
	uint64 operator () (T *inValue) const
	{
		return HashBytes(&inValue, sizeof(inValue));
	}
};
/// Helper macro to define a hash function for trivial types
/// (specializes Hash<type> to hash the raw bytes of the value with HashBytes;
/// comments cannot appear inside the macro body because of the line continuations)
#define JPH_DEFINE_TRIVIAL_HASH(type) \
template <> \
struct Hash<type> \
{ \
	uint64 operator () (const type &inValue) const \
	{ \
		return HashBytes(&inValue, sizeof(inValue)); \
	} \
};

/// Commonly used types
JPH_DEFINE_TRIVIAL_HASH(char)
JPH_DEFINE_TRIVIAL_HASH(int)
JPH_DEFINE_TRIVIAL_HASH(uint32)
JPH_DEFINE_TRIVIAL_HASH(uint64)
/// Helper function that hashes a single value into ioSeed
/// Based on https://github.com/jonmaiga/mx3 by Jon Maiga
template <typename T>
inline void HashCombine(uint64 &ioSeed, const T &inValue)
{
	// Multiplication constant from the mx3 mixer; the exact constant and step order define the hash
	constexpr uint64 c = 0xbea225f9eb34556dUL;
	uint64 h = ioSeed;
	uint64 x = Hash<T> { } (inValue); // Hash the value with its Hash specialization first

	// See: https://github.com/jonmaiga/mx3/blob/master/mx3.h
	// mix_stream(h, x)
	x *= c;
	x ^= x >> 39;
	h += x * c;
	h *= c;

	// mix(h)
	h ^= h >> 32;
	h *= c;
	h ^= h >> 29;
	h *= c;
	h ^= h >> 32;
	h *= c;
	h ^= h >> 29;
	ioSeed = h;
}
/// Hash combiner to use a custom struct in an unordered map or set
///
/// Usage:
///
///		struct SomeHashKey
///		{
///			std::string key1;
///			std::string key2;
///			bool key3;
///		};
///
///		JPH_MAKE_HASHABLE(SomeHashKey, t.key1, t.key2, t.key3)
template <typename FirstValue, typename... Values>
inline uint64 HashCombineArgs(const FirstValue &inFirstValue, Values... inValues)
{
	// Prime the seed by hashing the first value
	uint64 seed = Hash<FirstValue> { } (inFirstValue);

	// Hash all remaining values together using a fold expression
	(HashCombine(seed, inValues), ...);
	return seed;
}
/// Defines a hash functor `name` for `type` that combines the listed member expressions
/// (each written in terms of `t`) via HashCombineArgs
#define JPH_MAKE_HASH_STRUCT(type, name, ...) \
struct [[nodiscard]] name \
{ \
	::JPH::uint64 operator()(const type &t) const \
	{ \
		return ::JPH::HashCombineArgs(__VA_ARGS__); \
	} \
};

/// Specializes std::hash for `type` by forwarding to the JPH::Hash specialization
#define JPH_MAKE_STD_HASH(type) \
JPH_SUPPRESS_WARNING_PUSH \
JPH_SUPPRESS_WARNINGS \
namespace std \
{ \
	template<> \
	struct [[nodiscard]] hash<type> \
	{ \
		size_t operator()(const type &t) const \
		{ \
			return size_t(::JPH::Hash<type>{ }(t)); \
		} \
	}; \
} \
JPH_SUPPRESS_WARNING_POP

/// Makes `type` usable as a key in both JPH and std unordered containers:
/// specializes JPH::Hash<type> over the listed members and std::hash<type> on top of it
#define JPH_MAKE_HASHABLE(type, ...) \
JPH_SUPPRESS_WARNING_PUSH \
JPH_SUPPRESS_WARNINGS \
namespace JPH \
{ \
	template<> \
	JPH_MAKE_HASH_STRUCT(type, Hash<type>, __VA_ARGS__) \
} \
JPH_SUPPRESS_WARNING_POP \
JPH_MAKE_STD_HASH(type)
JPH_NAMESPACE_END

View File

@ -0,0 +1,876 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/BVec16.h>
JPH_NAMESPACE_BEGIN
/// Helper class for implementing an UnorderedSet or UnorderedMap
/// Based on CppCon 2017: Matt Kulukundis "Designing a Fast, Efficient, Cache-friendly Hash Table, Step by Step"
/// See: https://www.youtube.com/watch?v=ncHmEUmJZf4
template <class Key, class KeyValue, class HashTableDetail, class Hash, class KeyEqual>
class HashTable
{
public:
/// Properties
using value_type = KeyValue;
using size_type = uint32;
using difference_type = ptrdiff_t;
private:
/// Base class for iterators
/// Walks the bucket array, visiting only buckets whose control byte has cBucketUsed set
template <class Table, class Iterator>
class IteratorBase
{
public:
	/// Properties
	using difference_type = typename Table::difference_type;
	using value_type = typename Table::value_type;
	using iterator_category = std::forward_iterator_tag;

	/// Copy constructor
	IteratorBase(const IteratorBase &inRHS) = default;

	/// Assignment operator
	IteratorBase & operator = (const IteratorBase &inRHS) = default;

	/// Iterator at start of table
	explicit IteratorBase(Table *inTable) :
		mTable(inTable),
		mIndex(0)
	{
		// Skip leading unused buckets so we start on the first live element (or end)
		while (mIndex < mTable->mMaxSize && (mTable->mControl[mIndex] & cBucketUsed) == 0)
			++mIndex;
	}

	/// Iterator at specific index
	IteratorBase(Table *inTable, size_type inIndex) :
		mTable(inTable),
		mIndex(inIndex)
	{
	}

	/// Prefix increment
	Iterator & operator ++ ()
	{
		JPH_ASSERT(IsValid());
		do
		{
			++mIndex;
		}
		while (mIndex < mTable->mMaxSize && (mTable->mControl[mIndex] & cBucketUsed) == 0);
		return static_cast<Iterator &>(*this);
	}

	/// Postfix increment
	Iterator operator ++ (int)
	{
		Iterator result(mTable, mIndex);
		++(*this);
		return result;
	}

	/// Access to key value pair
	const KeyValue & operator * () const
	{
		JPH_ASSERT(IsValid());
		return mTable->mData[mIndex];
	}

	/// Access to key value pair
	const KeyValue * operator -> () const
	{
		JPH_ASSERT(IsValid());
		return mTable->mData + mIndex;
	}

	/// Equality operator
	bool operator == (const Iterator &inRHS) const
	{
		return mIndex == inRHS.mIndex && mTable == inRHS.mTable;
	}

	/// Inequality operator
	bool operator != (const Iterator &inRHS) const
	{
		return !(*this == inRHS);
	}

	/// Check that the iterator is valid (in range and pointing at a used bucket)
	bool IsValid() const
	{
		return mIndex < mTable->mMaxSize
			&& (mTable->mControl[mIndex] & cBucketUsed) != 0;
	}

	Table * mTable;
	size_type mIndex;
};
/// Get the maximum number of elements that we can support given a number of buckets
/// (applies the load factor cMaxLoadFactorNumerator / cMaxLoadFactorDenominator)
static constexpr size_type sGetMaxLoad(size_type inBucketCount)
{
	return uint32((cMaxLoadFactorNumerator * inBucketCount) / cMaxLoadFactorDenominator);
}
/// Update the control value for a bucket
/// The control array carries 15 mirrored bytes past mMaxSize so that 16-byte probes
/// starting near the end of the table can wrap without a second read.
JPH_INLINE void SetControlValue(size_type inIndex, uint8 inValue)
{
	JPH_ASSERT(inIndex < mMaxSize);
	mControl[inIndex] = inValue;

	// Mirror the first 15 bytes to the 15 bytes beyond mMaxSize
	// Note that this is equivalent to:
	//
	// if (inIndex < 15)
	//		mControl[inIndex + mMaxSize] = inValue
	// else
	//		mControl[inIndex] = inValue
	//
	// Which performs a needless write if inIndex >= 15 but at least it is branch-less
	mControl[((inIndex - 15) & (mMaxSize - 1)) + 15] = inValue;
}
/// Get the index and control value for a particular key
/// The low 7 bits of the hash become the control byte (with cBucketUsed set);
/// the remaining bits select the starting bucket.
JPH_INLINE void GetIndexAndControlValue(const Key &inKey, size_type &outIndex, uint8 &outControl) const
{
	// Calculate hash
	uint64 hash_value = Hash { } (inKey);

	// Split hash into index and control value
	outIndex = size_type(hash_value >> 7) & (mMaxSize - 1); // mMaxSize is a power of 2, so this wraps the index
	outControl = cBucketUsed | uint8(hash_value);
}
/// Allocate space for the hash table
/// One allocation holds both the KeyValue array and the control byte array (mControl points just past mData).
void AllocateTable(size_type inMaxSize)
{
	JPH_ASSERT(mData == nullptr);

	mMaxSize = inMaxSize;
	mLoadLeft = sGetMaxLoad(inMaxSize);
	size_t required_size = size_t(mMaxSize) * (sizeof(KeyValue) + 1) + 15; // Add 15 bytes to mirror the first 15 bytes of the control values
	if constexpr (cNeedsAlignedAllocate)
		mData = reinterpret_cast<KeyValue *>(AlignedAllocate(required_size, alignof(KeyValue)));
	else
		mData = reinterpret_cast<KeyValue *>(Allocate(required_size));
	mControl = reinterpret_cast<uint8 *>(mData + mMaxSize);
}
/// Copy the contents of another hash table
/// Assumes this table is empty/unallocated (AllocateTable asserts mData == nullptr).
void CopyTable(const HashTable &inRHS)
{
	if (inRHS.empty())
		return;

	AllocateTable(inRHS.mMaxSize);

	// Copy control bytes (including the 15 mirrored bytes past the end)
	memcpy(mControl, inRHS.mControl, mMaxSize + 15);

	// Copy elements: only buckets marked used contain constructed KeyValues
	uint index = 0;
	for (const uint8 *control = mControl, *control_end = mControl + mMaxSize; control != control_end; ++control, ++index)
		if (*control & cBucketUsed)
			new (mData + index) KeyValue(inRHS.mData[index]);
	mSize = inRHS.mSize;
}
/// Grow the table to a new size
/// Re-inserts every element into a freshly allocated bucket array (rehash), moving the KeyValues.
void GrowTable(size_type inNewMaxSize)
{
	// Move the old table to a temporary structure
	size_type old_max_size = mMaxSize;
	KeyValue *old_data = mData;
	const uint8 *old_control = mControl;
	mData = nullptr;
	mControl = nullptr;
	mSize = 0;
	mMaxSize = 0;
	mLoadLeft = 0;

	// Allocate new table
	AllocateTable(inNewMaxSize);

	// Reset all control bytes
	memset(mControl, cBucketEmpty, mMaxSize + 15);

	if (old_data != nullptr)
	{
		// Copy all elements from the old table
		for (size_type i = 0; i < old_max_size; ++i)
			if (old_control[i] & cBucketUsed)
			{
				size_type index;
				KeyValue *element = old_data + i;
				// InsertAfterGrow = true: keys are known unique, skip the equality scan
				JPH_IF_ENABLE_ASSERTS(bool inserted =) InsertKey</* InsertAfterGrow= */ true>(HashTableDetail::sGetKey(*element), index);
				JPH_ASSERT(inserted);
				new (mData + index) KeyValue(std::move(*element));
				element->~KeyValue();
			}

		// Free memory
		if constexpr (cNeedsAlignedAllocate)
			AlignedFree(old_data);
		else
			Free(old_data);
	}
}
protected:
/// Get an element by index
/// NOTE(review): returns a non-const reference from a const member function (legal since
/// mData is a pointer member) — confirm callers rely on this const-laundering intentionally
KeyValue & GetElement(size_type inIndex) const
{
	return mData[inIndex];
}
/// Insert a key into the map, returns true if the element was inserted, false if it already existed.
/// outIndex is the index at which the element should be constructed / where it is located.
/// Probes the control bytes 16 at a time using BVec16 SIMD compares.
template <bool InsertAfterGrow = false>
bool InsertKey(const Key &inKey, size_type &outIndex)
{
	// Ensure we have enough space
	if (mLoadLeft == 0)
	{
		// Should not be growing if we're already growing!
		if constexpr (InsertAfterGrow)
			JPH_ASSERT(false);

		// Decide if we need to clean up all tombstones or if we need to grow the map
		size_type num_deleted = sGetMaxLoad(mMaxSize) - mSize;
		if (num_deleted * cMaxDeletedElementsDenominator > mMaxSize * cMaxDeletedElementsNumerator)
			rehash(0); // Enough tombstones: rebuild at the same size instead of growing
		else
		{
			// Grow by a power of 2
			size_type new_max_size = max<size_type>(mMaxSize << 1, 16);
			if (new_max_size < mMaxSize)
			{
				JPH_ASSERT(false, "Overflow in hash table size, can't grow!");
				return false;
			}
			GrowTable(new_max_size);
		}
	}

	// Split hash into index and control value
	size_type index;
	uint8 control;
	GetIndexAndControlValue(inKey, index, control);

	// Keeps track of the index of the first deleted bucket we found
	constexpr size_type cNoDeleted = ~size_type(0);
	size_type first_deleted_index = cNoDeleted;

	// Linear probing
	KeyEqual equal;
	size_type bucket_mask = mMaxSize - 1;
	BVec16 control16 = BVec16::sReplicate(control);
	BVec16 bucket_empty = BVec16::sZero();
	BVec16 bucket_deleted = BVec16::sReplicate(cBucketDeleted);
	for (;;)
	{
		// Read 16 control values (note that we added 15 bytes at the end of the control values that mirror the first 15 bytes)
		BVec16 control_bytes = BVec16::sLoadByte16(mControl + index);

		// Check if we must find the element before we can insert
		if constexpr (!InsertAfterGrow)
		{
			// Check for the control value we're looking for
			// Note that when deleting we can create empty buckets instead of deleted buckets.
			// This means we must unconditionally check all buckets in this batch for equality
			// (also beyond the first empty bucket).
			uint32 control_equal = uint32(BVec16::sEquals(control_bytes, control16).GetTrues());

			// Index within the 16 buckets
			size_type local_index = index;

			// Loop while there's still buckets to process
			while (control_equal != 0)
			{
				// Get the first equal bucket
				uint first_equal = CountTrailingZeros(control_equal);

				// Skip to the bucket
				local_index += first_equal;

				// Make sure that our index is not beyond the end of the table
				local_index &= bucket_mask;

				// We found a bucket with same control value; the full key compare resolves 7-bit control collisions
				if (equal(HashTableDetail::sGetKey(mData[local_index]), inKey))
				{
					// Element already exists
					outIndex = local_index;
					return false;
				}

				// Skip past this bucket
				control_equal >>= first_equal + 1;
				local_index++;
			}

			// Check if we're still scanning for deleted buckets
			if (first_deleted_index == cNoDeleted)
			{
				// Check if any buckets have been deleted, if so store the first one
				uint32 control_deleted = uint32(BVec16::sEquals(control_bytes, bucket_deleted).GetTrues());
				if (control_deleted != 0)
					first_deleted_index = index + CountTrailingZeros(control_deleted);
			}
		}

		// Check for empty buckets
		uint32 control_empty = uint32(BVec16::sEquals(control_bytes, bucket_empty).GetTrues());
		if (control_empty != 0)
		{
			// If we found a deleted bucket, use it.
			// It doesn't matter if it is before or after the first empty bucket we found
			// since we will always be scanning in batches of 16 buckets.
			if (first_deleted_index == cNoDeleted || InsertAfterGrow)
			{
				index += CountTrailingZeros(control_empty);
				--mLoadLeft; // Using an empty bucket decreases the load left
			}
			else
			{
				index = first_deleted_index; // Reusing a tombstone does not change mLoadLeft
			}

			// Make sure that our index is not beyond the end of the table
			index &= bucket_mask;

			// Update control byte
			SetControlValue(index, control);
			++mSize;

			// Return index to newly allocated bucket
			outIndex = index;
			return true;
		}

		// Move to next batch of 16 buckets
		index = (index + 16) & bucket_mask;
	}
}
public:
/// Non-const iterator
/// Adds mutable element access on top of IteratorBase's const accessors
class iterator : public IteratorBase<HashTable, iterator>
{
	using Base = IteratorBase<HashTable, iterator>;

public:
	using IteratorBase<HashTable, iterator>::operator ==;

	/// Properties
	using reference = typename Base::value_type &;
	using pointer = typename Base::value_type *;

	/// Constructors
	explicit iterator(HashTable *inTable) : Base(inTable) { }
	iterator(HashTable *inTable, size_type inIndex) : Base(inTable, inIndex) { }
	iterator(const iterator &inIterator) : Base(inIterator) { }

	/// Assignment
	iterator & operator = (const iterator &inRHS) { Base::operator = (inRHS); return *this; }

	using Base::operator *;

	/// Non-const access to key value pair
	KeyValue & operator * ()
	{
		JPH_ASSERT(this->IsValid());
		return this->mTable->mData[this->mIndex];
	}

	using Base::operator ->;

	/// Non-const access to key value pair
	KeyValue * operator -> ()
	{
		JPH_ASSERT(this->IsValid());
		return this->mTable->mData + this->mIndex;
	}
};
/// Const iterator
/// Also converts implicitly from the non-const iterator
class const_iterator : public IteratorBase<const HashTable, const_iterator>
{
	using Base = IteratorBase<const HashTable, const_iterator>;

public:
	using IteratorBase<const HashTable, const_iterator>::operator ==;

	/// Properties
	using reference = const typename Base::value_type &;
	using pointer = const typename Base::value_type *;

	/// Constructors
	explicit const_iterator(const HashTable *inTable) : Base(inTable) { }
	const_iterator(const HashTable *inTable, size_type inIndex) : Base(inTable, inIndex) { }
	const_iterator(const const_iterator &inRHS) : Base(inRHS) { }
	const_iterator(const iterator &inIterator) : Base(inIterator.mTable, inIterator.mIndex) { }

	/// Assignment
	const_iterator & operator = (const iterator &inRHS) { this->mTable = inRHS.mTable; this->mIndex = inRHS.mIndex; return *this; }
	const_iterator & operator = (const const_iterator &inRHS) { Base::operator = (inRHS); return *this; }
};
/// Default constructor (empty table, no allocation until first insert/reserve)
HashTable() = default;

/// Copy constructor
HashTable(const HashTable &inRHS)
{
	CopyTable(inRHS);
}

/// Move constructor: steals the allocation and leaves inRHS empty
HashTable(HashTable &&ioRHS) noexcept :
	mData(ioRHS.mData),
	mControl(ioRHS.mControl),
	mSize(ioRHS.mSize),
	mMaxSize(ioRHS.mMaxSize),
	mLoadLeft(ioRHS.mLoadLeft)
{
	ioRHS.mData = nullptr;
	ioRHS.mControl = nullptr;
	ioRHS.mSize = 0;
	ioRHS.mMaxSize = 0;
	ioRHS.mLoadLeft = 0;
}

/// Assignment operator
HashTable & operator = (const HashTable &inRHS)
{
	if (this != &inRHS)
	{
		clear(); // Release our own storage first; CopyTable requires an unallocated table
		CopyTable(inRHS);
	}
	return *this;
}

/// Move assignment operator
HashTable & operator = (HashTable &&ioRHS) noexcept
{
	if (this != &ioRHS)
	{
		clear();

		mData = ioRHS.mData;
		mControl = ioRHS.mControl;
		mSize = ioRHS.mSize;
		mMaxSize = ioRHS.mMaxSize;
		mLoadLeft = ioRHS.mLoadLeft;

		ioRHS.mData = nullptr;
		ioRHS.mControl = nullptr;
		ioRHS.mSize = 0;
		ioRHS.mMaxSize = 0;
		ioRHS.mLoadLeft = 0;
	}
	return *this;
}

/// Destructor
~HashTable()
{
	clear();
}
/// Reserve memory for a certain number of elements
/// Never shrinks; does nothing when the current capacity already suffices.
void reserve(size_type inMaxSize)
{
	// Calculate max size based on load factor (rounded up to a power of 2, minimum 16 buckets)
	size_type max_size = GetNextPowerOf2(max<uint32>((cMaxLoadFactorDenominator * inMaxSize) / cMaxLoadFactorNumerator, 16));
	if (max_size <= mMaxSize)
		return;

	GrowTable(max_size);
}
/// Destroy the entire hash table (destructs all elements and releases the allocation)
void clear()
{
	// Delete all elements (skipped entirely for trivially destructible KeyValues)
	if constexpr (!std::is_trivially_destructible<KeyValue>())
		if (!empty())
			for (size_type i = 0; i < mMaxSize; ++i)
				if (mControl[i] & cBucketUsed)
					mData[i].~KeyValue();

	if (mData != nullptr)
	{
		// Free memory
		if constexpr (cNeedsAlignedAllocate)
			AlignedFree(mData);
		else
			Free(mData);

		// Reset members
		mData = nullptr;
		mControl = nullptr;
		mSize = 0;
		mMaxSize = 0;
		mLoadLeft = 0;
	}
}
/// Destroy the entire hash table but keeps the memory allocated
void ClearAndKeepMemory()
{
// Destruct elements (skipped entirely at compile time when the destructor is trivial)
if constexpr (!std::is_trivially_destructible<KeyValue>())
if (!empty())
for (size_type i = 0; i < mMaxSize; ++i)
if (mControl[i] & cBucketUsed)
mData[i].~KeyValue();
mSize = 0;
// If there are elements that are not marked cBucketEmpty, we reset them
size_type max_load = sGetMaxLoad(mMaxSize);
if (mLoadLeft != max_load)
{
// Reset all control bytes, including the 15 mirror bytes at the end (see find())
memset(mControl, cBucketEmpty, mMaxSize + 15);
mLoadLeft = max_load;
}
}
/// Iterator to first element
iterator begin()
{
return iterator(this);
}
/// Iterator to one beyond last element (bucket index mMaxSize is the end sentinel)
iterator end()
{
return iterator(this, mMaxSize);
}
/// Iterator to first element
const_iterator begin() const
{
return const_iterator(this);
}
/// Iterator to one beyond last element
const_iterator end() const
{
return const_iterator(this, mMaxSize);
}
/// Iterator to first element
const_iterator cbegin() const
{
return const_iterator(this);
}
/// Iterator to one beyond last element
const_iterator cend() const
{
return const_iterator(this, mMaxSize);
}
/// Number of buckets in the table
size_type bucket_count() const
{
return mMaxSize;
}
/// Max number of buckets that the table can have (highest power of 2 representable in size_type)
constexpr size_type max_bucket_count() const
{
return size_type(1) << (sizeof(size_type) * 8 - 1);
}
/// Check if there are no elements in the table
bool empty() const
{
return mSize == 0;
}
/// Number of elements in the table
size_type size() const
{
return mSize;
}
/// Max number of elements that the table can hold (max bucket count scaled by the max load factor)
constexpr size_type max_size() const
{
return size_type((uint64(max_bucket_count()) * cMaxLoadFactorNumerator) / cMaxLoadFactorDenominator);
}
/// Get the max load factor for this table (max number of elements / number of buckets)
constexpr float max_load_factor() const
{
return float(cMaxLoadFactorNumerator) / float(cMaxLoadFactorDenominator);
}
/// Insert a new element, returns iterator and if the element was inserted
std::pair<iterator, bool> insert(const value_type &inValue)
{
size_type index;
// Find or reserve a bucket for the key (presumably InsertKey returns false when the key already exists — confirm against InsertKey)
bool inserted = InsertKey(HashTableDetail::sGetKey(inValue), index);
if (inserted)
// Placement-construct a copy of the value in the reserved bucket
new (mData + index) KeyValue(inValue);
return std::make_pair(iterator(this, index), inserted);
}
/// Find an element, returns iterator to element or end() if not found
const_iterator find(const Key &inKey) const
{
// Check if we have any data
if (empty())
return cend();
// Split hash into index and control value
size_type index;
uint8 control;
GetIndexAndControlValue(inKey, index, control);
// Linear probing, 16 buckets at a time using SIMD compares on the control bytes
KeyEqual equal;
size_type bucket_mask = mMaxSize - 1; // mMaxSize is a power of 2, so this mask wraps an index into the table
BVec16 control16 = BVec16::sReplicate(control);
BVec16 bucket_empty = BVec16::sZero();
for (;;)
{
// Read 16 control values
// (note that we added 15 bytes at the end of the control values that mirror the first 15 bytes)
BVec16 control_bytes = BVec16::sLoadByte16(mControl + index);
// Check for the control value we're looking for
// Note that when deleting we can create empty buckets instead of deleted buckets.
// This means we must unconditionally check all buckets in this batch for equality
// (also beyond the first empty bucket).
uint32 control_equal = uint32(BVec16::sEquals(control_bytes, control16).GetTrues());
// Index within the 16 buckets
size_type local_index = index;
// Loop while there's still buckets to process
while (control_equal != 0)
{
// Get the first equal bucket
uint first_equal = CountTrailingZeros(control_equal);
// Skip to the bucket
local_index += first_equal;
// Make sure that our index is not beyond the end of the table
local_index &= bucket_mask;
// We found a bucket with same control value
// (the control byte holds only 7 bits of the hash, so a full key compare is still required)
if (equal(HashTableDetail::sGetKey(mData[local_index]), inKey))
{
// Element found
return const_iterator(this, local_index);
}
// Skip past this bucket
control_equal >>= first_equal + 1;
local_index++;
}
// Check for empty buckets
uint32 control_empty = uint32(BVec16::sEquals(control_bytes, bucket_empty).GetTrues());
if (control_empty != 0)
{
// An empty bucket was found, we didn't find the element
return cend();
}
// Move to next batch of 16 buckets
index = (index + 16) & bucket_mask;
}
}
/// @brief Erase an element by iterator
void erase(const const_iterator &inIterator)
{
JPH_ASSERT(inIterator.IsValid());
// Read 16 control values before and after the current index
// (note that we added 15 bytes at the end of the control values that mirror the first 15 bytes)
BVec16 control_bytes_before = BVec16::sLoadByte16(mControl + ((inIterator.mIndex - 16) & (mMaxSize - 1)));
BVec16 control_bytes_after = BVec16::sLoadByte16(mControl + inIterator.mIndex);
BVec16 bucket_empty = BVec16::sZero();
uint32 control_empty_before = uint32(BVec16::sEquals(control_bytes_before, bucket_empty).GetTrues());
uint32 control_empty_after = uint32(BVec16::sEquals(control_bytes_after, bucket_empty).GetTrues());
// If (this index including) there exist 16 consecutive non-empty slots (represented by a bit being 0) then
// a probe looking for some element needs to continue probing so we cannot mark the bucket as empty
// but must mark it as deleted instead.
// Note that we use: CountLeadingZeros(uint16) = CountLeadingZeros(uint32) - 16.
uint8 control_value = CountLeadingZeros(control_empty_before) - 16 + CountTrailingZeros(control_empty_after) < 16? cBucketEmpty : cBucketDeleted;
// Mark the bucket as empty/deleted
SetControlValue(inIterator.mIndex, control_value);
// Destruct the element
mData[inIterator.mIndex].~KeyValue();
// If we marked the bucket as empty we can increase the load left
// (a deleted bucket still counts towards the load because probes must walk over it)
if (control_value == cBucketEmpty)
++mLoadLeft;
// Decrease size
--mSize;
}
/// @brief Erase an element by key, returns the number of elements removed (0 or 1)
size_type erase(const Key &inKey)
{
// Locate the element; only erase when it is actually present
const_iterator it = find(inKey);
if (it != cend())
{
erase(it);
return 1;
}
return 0;
}
/// Swap the contents of two hash tables (constant time, no elements are moved or copied)
void swap(HashTable &ioRHS) noexcept
{
std::swap(mData, ioRHS.mData);
std::swap(mControl, ioRHS.mControl);
std::swap(mSize, ioRHS.mSize);
std::swap(mMaxSize, ioRHS.mMaxSize);
std::swap(mLoadLeft, ioRHS.mLoadLeft);
}
/// In place re-hashing of all elements in the table. Removes all cBucketDeleted elements
/// The std version takes a bucket count, but we just re-hash to the same size.
void rehash(size_type)
{
// Update the control value for all buckets
for (size_type i = 0; i < mMaxSize; ++i)
{
uint8 &control = mControl[i];
switch (control)
{
case cBucketDeleted:
// Deleted buckets become empty
control = cBucketEmpty;
break;
case cBucketEmpty:
// Remains empty
break;
default:
// Mark all occupied as deleted, to indicate it needs to move to the correct place
control = cBucketDeleted;
break;
}
}
// Replicate control values to the last 15 entries
for (size_type i = 0; i < 15; ++i)
mControl[mMaxSize + i] = mControl[i];
// Loop over all elements that have been 'deleted' and move them to their new spot
BVec16 bucket_used = BVec16::sReplicate(cBucketUsed);
size_type bucket_mask = mMaxSize - 1;
uint32 probe_mask = bucket_mask & ~uint32(0b1111); // Mask out lower 4 bits because we test 16 buckets at a time
for (size_type src = 0; src < mMaxSize; ++src)
if (mControl[src] == cBucketDeleted)
for (;;)
{
// Split hash into index and control value
size_type src_index;
uint8 src_control;
GetIndexAndControlValue(HashTableDetail::sGetKey(mData[src]), src_index, src_control);
// Linear probing
size_type dst = src_index;
for (;;)
{
// Check if any buckets are free (a bucket is free when its cBucketUsed high bit is not set)
BVec16 control_bytes = BVec16::sLoadByte16(mControl + dst);
uint32 control_free = uint32(BVec16::sAnd(control_bytes, bucket_used).GetTrues()) ^ 0xffff;
if (control_free != 0)
{
// Select this bucket as destination
dst += CountTrailingZeros(control_free);
dst &= bucket_mask;
break;
}
// Move to next batch of 16 buckets
dst = (dst + 16) & bucket_mask;
}
// Check if we stay in the same probe group
// (same batch of 16 buckets relative to the hash's home index, so lookups will still find the element)
if (((dst - src_index) & probe_mask) == ((src - src_index) & probe_mask))
{
// We stay in the same group, we can stay where we are
SetControlValue(src, src_control);
break;
}
else if (mControl[dst] == cBucketEmpty)
{
// There's an empty bucket, move us there
SetControlValue(dst, src_control);
SetControlValue(src, cBucketEmpty);
new (mData + dst) KeyValue(std::move(mData[src]));
mData[src].~KeyValue();
break;
}
else
{
// There's an element in the bucket we want to move to, swap them
JPH_ASSERT(mControl[dst] == cBucketDeleted);
SetControlValue(dst, src_control);
std::swap(mData[src], mData[dst]);
// Iterate again with the same source bucket
}
}
// Reinitialize load left
mLoadLeft = sGetMaxLoad(mMaxSize) - mSize;
}
private:
/// If this allocator needs to fall back to aligned allocations because the type requires it
static constexpr bool cNeedsAlignedAllocate = alignof(KeyValue) > JPH_DEFAULT_ALLOCATE_ALIGNMENT;
/// Max load factor is cMaxLoadFactorNumerator / cMaxLoadFactorDenominator (7/8 = 87.5%)
static constexpr uint64 cMaxLoadFactorNumerator = 7;
static constexpr uint64 cMaxLoadFactorDenominator = 8;
/// If we can recover this fraction of deleted elements, we'll reshuffle the buckets in place rather than growing the table
static constexpr uint64 cMaxDeletedElementsNumerator = 1;
static constexpr uint64 cMaxDeletedElementsDenominator = 8;
/// Values that the control bytes can have
static constexpr uint8 cBucketEmpty = 0;
static constexpr uint8 cBucketDeleted = 0x7f;
static constexpr uint8 cBucketUsed = 0x80; // Lowest 7 bits are lowest 7 bits of the hash value
/// The buckets, an array of size mMaxSize
KeyValue * mData = nullptr;
/// Control bytes, an array of size mMaxSize + 15
/// (the last 15 bytes mirror the first 15 so that 16-byte wide loads starting at any bucket are valid, see find()/rehash())
uint8 * mControl = nullptr;
/// Number of elements in the table
size_type mSize = 0;
/// Max number of elements that can be stored in the table (always a power of 2, indices are wrapped with mMaxSize - 1)
size_type mMaxSize = 0;
/// Number of elements we can add to the table before we need to grow
size_type mLoadLeft = 0;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,36 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2025 Jorrit Rouwe
// SPDX-License-Identifier: MIT

// Wrapper around <Windows.h> that selects a minimal, macro-safe configuration and
// suppresses known warnings triggered by the Windows headers themselves.
// Include this instead of including <Windows.h> directly.
#pragma once // Added: every other header in this library is #pragma once guarded; avoids re-processing the Windows headers on repeated inclusion

#ifdef JPH_PLATFORM_WINDOWS

JPH_SUPPRESS_WARNING_PUSH
JPH_MSVC_SUPPRESS_WARNING(5039) // winbase.h(13179): warning C5039: 'TpSetCallbackCleanupGroup': pointer or reference to potentially throwing function passed to 'extern "C"' function under -EHc. Undefined behavior may occur if this function throws an exception.
JPH_MSVC2026_PLUS_SUPPRESS_WARNING(4865) // wingdi.h(2806,1): '<unnamed-enum-DISPLAYCONFIG_OUTPUT_TECHNOLOGY_OTHER>': the underlying type will change from 'int' to '__int64' when '/Zc:enumTypes' is specified on the command line
JPH_CLANG_SUPPRESS_WARNING("-Wreserved-macro-identifier") // Complains about _WIN32_WINNT being reserved

// Target Windows 10 and above unless the build already specified a version
#ifndef WINVER
#define WINVER 0x0A00 // Targeting Windows 10 and above
#endif
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0A00
#endif

// Keep the Windows headers lean and stop the min/max macros from clashing with std::min/std::max
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif

// MinGW ships the header under a lowercase name (matters when cross-compiling on a case-sensitive file system)
#ifndef JPH_COMPILER_MINGW
#include <Windows.h>
#else
#include <windows.h>
#endif

JPH_SUPPRESS_WARNING_POP

#endif // JPH_PLATFORM_WINDOWS

View File

@ -0,0 +1,58 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Implementation of the insertion sort algorithm.
/// Sorts [inBegin, inEnd) according to inCompare (a strict weak ordering).
/// The sort is stable; elements are shifted with move assignment, so move-only types are supported.
template <typename Iterator, typename Compare>
inline void InsertionSort(Iterator inBegin, Iterator inEnd, Compare inCompare)
{
	// Empty arrays don't need to be sorted
	if (inBegin != inEnd)
	{
		// Start at the second element
		for (Iterator i = inBegin + 1; i != inEnd; ++i)
		{
			// Move this element to a temporary value
			auto x = std::move(*i);

			// Check if the element goes before inBegin (we can't decrement the iterator before inBegin so this needs to be a separate branch)
			if (inCompare(x, *inBegin))
			{
				// Move all elements to the right to make space for x
				// (fixed: this was a copy assignment, which was slower and inconsistent with the move in the other branch, and broke move-only types)
				Iterator prev;
				for (Iterator j = i; j != inBegin; j = prev)
				{
					prev = j - 1;
					*j = std::move(*prev);
				}

				// Move x to the first place
				*inBegin = std::move(x);
			}
			else
			{
				// Move elements to the right as long as they are bigger than x
				Iterator j = i;
				for (Iterator prev = j - 1; inCompare(x, *prev); j = prev, --prev)
					*j = std::move(*prev);

				// Move x into place
				*j = std::move(x);
			}
		}
	}
}
/// Implementation of insertion sort algorithm without comparator, sorts in ascending order.
template <typename Iterator>
inline void InsertionSort(Iterator inBegin, Iterator inEnd)
{
	// Forward to the comparator version with the default less-than ordering
	InsertionSort(inBegin, inEnd, std::less<> { });
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,27 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
JPH_NAMESPACE_BEGIN
// Default trace implementation: asserts, because the application is supposed to install its own Trace function (see the TraceFunction declaration)
static void DummyTrace([[maybe_unused]] const char *inFMT, ...)
{
JPH_ASSERT(false);
};

// Global trace function pointer, overridable by the application
TraceFunction Trace = DummyTrace;

#ifdef JPH_ENABLE_ASSERTS

// Default assert handler: always requests a breakpoint to be triggered
static bool DummyAssertFailed(const char *inExpression, const char *inMessage, const char *inFile, uint inLine)
{
return true; // Trigger breakpoint
};

// Global assert-failed function pointer, overridable by the application
AssertFailedFunction AssertFailed = DummyAssertFailed;

#endif // JPH_ENABLE_ASSERTS
JPH_NAMESPACE_END

View File

@ -0,0 +1,38 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Trace function, needs to be overridden by application. This should output a line of text to the log / TTY.
using TraceFunction = void (*)(const char *inFMT, ...);
JPH_EXPORT extern TraceFunction Trace;

// Always turn on asserts in Debug mode
#if defined(JPH_DEBUG) && !defined(JPH_ENABLE_ASSERTS)
#define JPH_ENABLE_ASSERTS
#endif

#ifdef JPH_ENABLE_ASSERTS

/// Function called when an assertion fails. This function should return true if a breakpoint needs to be triggered
using AssertFailedFunction = bool(*)(const char *inExpression, const char *inMessage, const char *inFile, uint inLine);
JPH_EXPORT extern AssertFailedFunction AssertFailed;

// Helper functions to pass message on to failed function
// (the trailing AssertLastParam tag lets overload resolution pick the right variant
// depending on whether an optional message was supplied to JPH_ASSERT)
struct AssertLastParam { };
inline bool AssertFailedParamHelper(const char *inExpression, const char *inFile, uint inLine, AssertLastParam) { return AssertFailed(inExpression, nullptr, inFile, inLine); }
inline bool AssertFailedParamHelper(const char *inExpression, const char *inFile, uint inLine, const char *inMessage, AssertLastParam) { return AssertFailed(inExpression, inMessage, inFile, inLine); }

/// Main assert macro, usage: JPH_ASSERT(condition, message) or JPH_ASSERT(condition)
/// On failure forwards to AssertFailed and triggers a breakpoint when that returns true
#define JPH_ASSERT(inExpression, ...) do { if (!(inExpression) && AssertFailedParamHelper(#inExpression, __FILE__, JPH::uint(__LINE__), ##__VA_ARGS__, JPH::AssertLastParam())) JPH_BREAKPOINT; } while (false)

#define JPH_IF_ENABLE_ASSERTS(...) __VA_ARGS__
#else
#define JPH_ASSERT(...) ((void)0)
#define JPH_IF_ENABLE_ASSERTS(...)
#endif // JPH_ENABLE_ASSERTS
JPH_NAMESPACE_END

View File

@ -0,0 +1,311 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/Reference.h>
#include <Jolt/Core/Color.h>
#include <Jolt/Core/Profiler.h>
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/StaticArray.h>
#include <Jolt/Core/Atomics.h>
JPH_NAMESPACE_BEGIN
/// A class that allows units of work (Jobs) to be scheduled across multiple threads.
/// It allows dependencies between the jobs so that the jobs form a graph.
///
/// The pattern for using this class is:
///
/// // Create job system
/// JobSystem *job_system = new JobSystemThreadPool(...);
///
/// // Create some jobs
/// JobHandle second_job = job_system->CreateJob("SecondJob", Color::sRed, []() { ... }, 1); // Create a job with 1 dependency
/// JobHandle first_job = job_system->CreateJob("FirstJob", Color::sGreen, [second_job]() { ....; second_job.RemoveDependency(); }, 0); // Job can start immediately, will start second job when it's done
/// JobHandle third_job = job_system->CreateJob("ThirdJob", Color::sBlue, []() { ... }, 0); // This job can run immediately as well and can run in parallel to job 1 and 2
///
/// // Add the jobs to the barrier so that we can execute them while we're waiting
/// Barrier *barrier = job_system->CreateBarrier();
/// barrier->AddJob(first_job);
/// barrier->AddJob(second_job);
/// barrier->AddJob(third_job);
/// job_system->WaitForJobs(barrier);
///
/// // Clean up
/// job_system->DestroyBarrier(barrier);
/// delete job_system;
///
/// Jobs are guaranteed to be started in the order that their dependency counter becomes zero (in case they're scheduled on a background thread)
/// or in the order they're added to the barrier (when dependency count is zero and when executing on the thread that calls WaitForJobs).
///
/// If you want to implement your own job system, inherit from JobSystem and implement:
///
/// * JobSystem::GetMaxConcurrency - This should return the maximum number of jobs that can run in parallel.
/// * JobSystem::CreateJob - This should create a Job object and return it to the caller.
/// * JobSystem::FreeJob - This should free the memory associated with the job object. It is called by the Job destructor when it is Release()-ed for the last time.
/// * JobSystem::QueueJob/QueueJobs - These should store the job pointer in an internal queue to run immediately (dependencies are tracked internally, this function is called when the job can run).
/// The Job objects are reference counted and are guaranteed to stay alive during the QueueJob(s) call. If you store the job in your own data structure you need to call AddRef() to take a reference.
/// After the job has been executed you need to call Release() to release the reference. Make sure you no longer dereference the job pointer after calling Release().
///
/// JobSystem::Barrier is used to track the completion of a set of jobs. Jobs will be created by other jobs and added to the barrier while it is being waited on. This means that you cannot
/// create a dependency graph beforehand as the graph changes while jobs are running. Implement the following functions:
///
/// * Barrier::AddJob/AddJobs - Add a job to the barrier, any call to WaitForJobs will now also wait for this job to complete.
/// If you store the job in a data structure in the Barrier you need to call AddRef() on the job to keep it alive and Release() after you're done with it.
/// * Barrier::OnJobFinished - This function is called when a job has finished executing, you can use this to track completion and remove the job from the list of jobs to wait on.
///
/// The functions on JobSystem that need to be implemented to support barriers are:
///
/// * JobSystem::CreateBarrier - Create a new barrier.
/// * JobSystem::DestroyBarrier - Destroy a barrier.
/// * JobSystem::WaitForJobs - This is the main function that is used to wait for all jobs that have been added to a Barrier. WaitForJobs can execute jobs that have
/// been added to the barrier while waiting. It is not wise to execute other jobs that touch physics structures as this can cause race conditions and deadlocks. Please keep in mind that the barrier is
/// only intended to wait on the completion of the Jolt jobs added to it, if you scheduled any jobs in your engine's job system to execute the Jolt jobs as part of QueueJob/QueueJobs, you might still need
/// to wait for these in this function after the barrier is finished waiting.
///
/// An example implementation is JobSystemThreadPool. If you don't want to write the Barrier class you can also inherit from JobSystemWithBarrier.
class JPH_EXPORT JobSystem : public NonCopyable
{
protected:
// Forward declaration of the internal Job class (defined below in the protected section)
class Job;
public:
JPH_OVERRIDE_NEW_DELETE
/// A job handle contains a reference to a job. The job will be deleted as soon as there are no JobHandles
/// referring to the job and when it is not in the job queue / being processed.
class JobHandle : private Ref<Job>
{
public:
/// Constructor
inline JobHandle() = default;
inline JobHandle(const JobHandle &inHandle) = default;
inline JobHandle(JobHandle &&inHandle) noexcept : Ref<Job>(std::move(inHandle)) { }
/// Constructor, only to be used by JobSystem
inline explicit JobHandle(Job *inJob) : Ref<Job>(inJob) { }
/// Assignment
inline JobHandle & operator = (const JobHandle &inHandle) = default;
inline JobHandle & operator = (JobHandle &&inHandle) noexcept = default;
/// Check if this handle contains a job
inline bool IsValid() const { return GetPtr() != nullptr; }
/// Check if this job has finished executing
inline bool IsDone() const { return GetPtr() != nullptr && GetPtr()->IsDone(); }
/// Add to the dependency counter.
inline void AddDependency(int inCount = 1) const { GetPtr()->AddDependency(inCount); }
/// Remove from the dependency counter. Job will start whenever the dependency counter reaches zero
/// and if it does it is no longer valid to call the AddDependency/RemoveDependency functions.
inline void RemoveDependency(int inCount = 1) const { GetPtr()->RemoveDependencyAndQueue(inCount); }
/// Remove a dependency from a batch of jobs at once, this can be more efficient than removing them one by one as it requires less locking
static inline void sRemoveDependencies(const JobHandle *inHandles, uint inNumHandles, int inCount = 1);
/// Helper function to remove dependencies on a static array of job handles
template <uint N>
static inline void sRemoveDependencies(StaticArray<JobHandle, N> &inHandles, int inCount = 1)
{
sRemoveDependencies(inHandles.data(), inHandles.size(), inCount);
}
/// Inherit the GetPtr function, only to be used by the JobSystem
using Ref<Job>::GetPtr;
};
/// A job barrier keeps track of a number of jobs and allows waiting until they are all completed.
class Barrier : public NonCopyable
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Add a job to this barrier
/// Note that jobs can keep being added to the barrier while waiting for the barrier
virtual void AddJob(const JobHandle &inJob) = 0;
/// Add multiple jobs to this barrier
/// Note that jobs can keep being added to the barrier while waiting for the barrier
virtual void AddJobs(const JobHandle *inHandles, uint inNumHandles) = 0;
protected:
/// Job needs to be able to call OnJobFinished
friend class Job;
/// Destructor, you should call JobSystem::DestroyBarrier instead of destructing this object directly
virtual ~Barrier() = default;
/// Called by a Job to mark that it is finished
virtual void OnJobFinished(Job *inJob) = 0;
};
/// Main function of the job
using JobFunction = function<void()>;
/// Destructor
virtual ~JobSystem() = default;
/// Get maximum number of concurrently executing jobs
virtual int GetMaxConcurrency() const = 0;
/// Create a new job, the job is started immediately if inNumDependencies == 0 otherwise it starts when
/// RemoveDependency causes the dependency counter to reach 0.
virtual JobHandle CreateJob(const char *inName, ColorArg inColor, const JobFunction &inJobFunction, uint32 inNumDependencies = 0) = 0;
/// Create a new barrier, used to wait on jobs
virtual Barrier * CreateBarrier() = 0;
/// Destroy a barrier when it is no longer used. The barrier should be empty at this point.
virtual void DestroyBarrier(Barrier *inBarrier) = 0;
/// Wait for a set of jobs to be finished, note that only 1 thread can be waiting on a barrier at a time
virtual void WaitForJobs(Barrier *inBarrier) = 0;
protected:
/// A class that contains information for a single unit of work
class Job
{
public:
JPH_OVERRIDE_NEW_DELETE
/// Constructor
Job([[maybe_unused]] const char *inJobName, [[maybe_unused]] ColorArg inColor, JobSystem *inJobSystem, const JobFunction &inJobFunction, uint32 inNumDependencies) :
#if defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
mJobName(inJobName),
mColor(inColor),
#endif // defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
mJobSystem(inJobSystem),
mJobFunction(inJobFunction),
mNumDependencies(inNumDependencies)
{
}
/// Get the jobs system to which this job belongs
inline JobSystem * GetJobSystem() { return mJobSystem; }
/// Add or release a reference to this object
inline void AddRef()
{
// Adding a reference can use relaxed memory ordering
mReferenceCount.fetch_add(1, memory_order_relaxed);
}
inline void Release()
{
#ifndef JPH_TSAN_ENABLED
// Releasing a reference must use release semantics...
if (mReferenceCount.fetch_sub(1, memory_order_release) == 1)
{
// ... so that we can use acquire to ensure that we see any updates from other threads that released a ref before freeing the job
atomic_thread_fence(memory_order_acquire);
mJobSystem->FreeJob(this);
}
#else
// But under TSAN, we cannot use atomic_thread_fence, so we use an acq_rel operation unconditionally instead
if (mReferenceCount.fetch_sub(1, memory_order_acq_rel) == 1)
mJobSystem->FreeJob(this);
#endif
}
/// Add to the dependency counter.
inline void AddDependency(int inCount);
/// Remove from the dependency counter. Returns true whenever the dependency counter reaches zero
/// and if it does it is no longer valid to call the AddDependency/RemoveDependency functions.
inline bool RemoveDependency(int inCount);
/// Remove from the dependency counter. Job will be queued whenever the dependency counter reaches zero
/// and if it does it is no longer valid to call the AddDependency/RemoveDependency functions.
inline void RemoveDependencyAndQueue(int inCount);
/// Set the job barrier that this job belongs to and returns false if this was not possible because the job already finished
inline bool SetBarrier(Barrier *inBarrier)
{
intptr_t barrier = 0;
if (mBarrier.compare_exchange_strong(barrier, reinterpret_cast<intptr_t>(inBarrier), memory_order_relaxed))
return true;
JPH_ASSERT(barrier == cBarrierDoneState, "A job can only belong to 1 barrier");
return false;
}
/// Run the job function, returns the number of dependencies that this job still has or cExecutingState or cDoneState
inline uint32 Execute()
{
// Transition job to executing state
uint32 state = 0; // We can only start running with a dependency counter of 0
if (!mNumDependencies.compare_exchange_strong(state, cExecutingState, memory_order_acquire))
return state; // state is updated by compare_exchange_strong to the current value
// Run the job function
{
JPH_PROFILE(mJobName, mColor.GetUInt32());
mJobFunction();
}
// Fetch the barrier pointer and exchange it for the done state, so we're sure that no barrier gets set after we want to call the callback
intptr_t barrier = mBarrier.load(memory_order_relaxed);
for (;;)
{
// Keep retrying until the exchange succeeds (compare_exchange_weak may fail spuriously)
if (mBarrier.compare_exchange_weak(barrier, cBarrierDoneState, memory_order_relaxed))
break;
}
JPH_ASSERT(barrier != cBarrierDoneState);
// Mark job as done
state = cExecutingState;
mNumDependencies.compare_exchange_strong(state, cDoneState, memory_order_relaxed);
JPH_ASSERT(state == cExecutingState);
// Notify the barrier after we've changed the job to the done state so that any thread reading the state after receiving the callback will see that the job has finished
if (barrier != 0)
reinterpret_cast<Barrier *>(barrier)->OnJobFinished(this);
return cDoneState;
}
/// Test if the job can be executed
inline bool CanBeExecuted() const { return mNumDependencies.load(memory_order_relaxed) == 0; }
/// Test if the job finished executing
inline bool IsDone() const { return mNumDependencies.load(memory_order_relaxed) == cDoneState; }
#if defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
/// Get the name of the job
const char * GetName() const { return mJobName; }
#endif // defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
static constexpr uint32 cExecutingState = 0xe0e0e0e0; ///< Value of mNumDependencies when job is executing
static constexpr uint32 cDoneState = 0xd0d0d0d0; ///< Value of mNumDependencies when job is done executing
static constexpr intptr_t cBarrierDoneState = ~intptr_t(0); ///< Value to use when the barrier has been triggered
private:
#if defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
const char * mJobName; ///< Name of the job
Color mColor; ///< Color of the job in the profiler
#endif // defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
JobSystem * mJobSystem; ///< The job system we belong to
atomic<intptr_t> mBarrier = 0; ///< Barrier that this job is associated with (is a Barrier pointer)
JobFunction mJobFunction; ///< Main job function
atomic<uint32> mReferenceCount = 0; ///< Amount of JobHandles pointing to this job
atomic<uint32> mNumDependencies; ///< Amount of jobs that need to complete before this job can run
};
/// Adds a job to the job queue
virtual void QueueJob(Job *inJob) = 0;
/// Adds a number of jobs at once to the job queue
virtual void QueueJobs(Job **inJobs, uint inNumJobs) = 0;
/// Frees a job
virtual void FreeJob(Job *inJob) = 0;
};
using JobHandle = JobSystem::JobHandle;
JPH_NAMESPACE_END
#include "JobSystem.inl"

View File

@ -0,0 +1,56 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
/// Add inCount to the dependency counter; only valid while the job is queued with remaining dependencies (not running or done)
void JobSystem::Job::AddDependency(int inCount)
{
JPH_IF_ENABLE_ASSERTS(uint32 old_value =) mNumDependencies.fetch_add(inCount, memory_order_relaxed);
JPH_ASSERT(old_value > 0 && old_value != cExecutingState && old_value != cDoneState, "Job is queued, running or done, it is not allowed to add a dependency to a running job");
}
/// Remove inCount from the dependency counter, returns true when the counter reached zero (the job is then ready to run)
bool JobSystem::Job::RemoveDependency(int inCount)
{
// Release ordering pairs with the acquire in Job::Execute so that writes made before removing the last dependency are visible to the executing thread
uint32 old_value = mNumDependencies.fetch_sub(inCount, memory_order_release);
JPH_ASSERT(old_value != cExecutingState && old_value != cDoneState, "Job is running or done, it is not allowed to add a dependency to a running job");
uint32 new_value = old_value - inCount;
JPH_ASSERT(old_value > new_value, "Test wrap around, this is a logic error");
return new_value == 0;
}
/// Remove inCount from the dependency counter and queue the job for execution when the counter reaches zero
void JobSystem::Job::RemoveDependencyAndQueue(int inCount)
{
if (RemoveDependency(inCount))
mJobSystem->QueueJob(this);
}
/// Remove a dependency from a batch of jobs at once and queue all jobs whose counter reached zero as a single batch
void JobSystem::JobHandle::sRemoveDependencies(const JobHandle *inHandles, uint inNumHandles, int inCount)
{
JPH_PROFILE_FUNCTION();
JPH_ASSERT(inNumHandles > 0);
// Get the job system, all jobs should be part of the same job system
JobSystem *job_system = inHandles->GetPtr()->GetJobSystem();
// Allocate a buffer to store the jobs that need to be queued
// (stack allocated, so the caller is expected to pass a modest number of handles)
Job **jobs_to_queue = (Job **)JPH_STACK_ALLOC(inNumHandles * sizeof(Job *));
Job **next_job = jobs_to_queue;
// Remove the dependencies on all jobs
for (const JobHandle *handle = inHandles, *handle_end = inHandles + inNumHandles; handle < handle_end; ++handle)
{
Job *job = handle->GetPtr();
JPH_ASSERT(job->GetJobSystem() == job_system); // All jobs should belong to the same job system
if (job->RemoveDependency(inCount))
*(next_job++) = job;
}
// If any jobs need to be scheduled, schedule them as a batch
uint num_jobs_to_queue = uint(next_job - jobs_to_queue);
if (num_jobs_to_queue != 0)
job_system->QueueJobs(jobs_to_queue, num_jobs_to_queue);
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,65 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2023 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/JobSystemSingleThreaded.h>
JPH_NAMESPACE_BEGIN
/// Initialize the job system, inMaxJobs is the max number of jobs that can be allocated at any time
void JobSystemSingleThreaded::Init(uint inMaxJobs)
{
// Initialize the job free list (presumably (max objects, page size) — both inMaxJobs so everything fits in one page; confirm against FixedSizeFreeList::Init)
mJobs.Init(inMaxJobs, inMaxJobs);
}
/// Create a new job; since this implementation runs jobs synchronously, a job without dependencies has already completed by the time this returns
JobHandle JobSystemSingleThreaded::CreateJob(const char *inJobName, ColorArg inColor, const JobFunction &inJobFunction, uint32 inNumDependencies)
{
// Construct an object
uint32 index = mJobs.ConstructObject(inJobName, inColor, this, inJobFunction, inNumDependencies);
JPH_ASSERT(index != AvailableJobs::cInvalidObjectIndex);
Job *job = &mJobs.Get(index);
// Construct handle to keep a reference, the job is queued below and will immediately complete
JobHandle handle(job);
// If there are no dependencies, queue the job now
// (in this single threaded implementation queueing executes the job synchronously, see QueueJob)
if (inNumDependencies == 0)
QueueJob(job);
// Return the handle
return handle;
}
// Returns the job's storage to the fixed size free list
void JobSystemSingleThreaded::FreeJob(Job *inJob)
{
	mJobs.DestructObject(inJob);
}
// There are no worker threads: queueing a job simply runs it on the calling thread
void JobSystemSingleThreaded::QueueJob(Job *inJob)
{
	inJob->Execute();
}
// Queue a batch of jobs; each one executes synchronously on the calling thread
void JobSystemSingleThreaded::QueueJobs(Job **inJobs, uint inNumJobs)
{
	for (Job **job = inJobs, **job_end = inJobs + inNumJobs; job < job_end; ++job)
		QueueJob(*job);
}
// All jobs complete synchronously when queued, so one shared dummy barrier suffices
JobSystem::Barrier *JobSystemSingleThreaded::CreateBarrier()
{
	return &mDummyBarrier;
}
void JobSystemSingleThreaded::DestroyBarrier(Barrier *inBarrier)
{
	// There's nothing to do here, the barrier is just a dummy (shared mDummyBarrier, never allocated)
}
void JobSystemSingleThreaded::WaitForJobs(Barrier *inBarrier)
{
	// There's nothing to do here, the barrier is just a dummy, we just execute the jobs immediately
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,62 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2023 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/JobSystem.h>
#include <Jolt/Core/FixedSizeFreeList.h>
JPH_NAMESPACE_BEGIN
/// Implementation of a JobSystem without threads, runs jobs as soon as they are added
class JPH_EXPORT JobSystemSingleThreaded final : public JobSystem
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor
	JobSystemSingleThreaded() = default;
	explicit JobSystemSingleThreaded(uint inMaxJobs) { Init(inMaxJobs); }

	/// Initialize the job system
	/// @param inMaxJobs Max number of jobs that can be allocated at any time
	void Init(uint inMaxJobs);

	// See JobSystem
	virtual int GetMaxConcurrency() const override { return 1; }
	virtual JobHandle CreateJob(const char *inName, ColorArg inColor, const JobFunction &inJobFunction, uint32 inNumDependencies = 0) override;
	virtual Barrier * CreateBarrier() override;
	virtual void DestroyBarrier(Barrier *inBarrier) override;
	virtual void WaitForJobs(Barrier *inBarrier) override;

protected:
	// Dummy implementation of Barrier, all jobs are executed immediately so there is nothing to track
	class BarrierImpl : public Barrier
	{
	public:
		JPH_OVERRIDE_NEW_DELETE

		// See Barrier
		virtual void AddJob(const JobHandle &inJob) override { /* We don't need to track jobs */ }
		virtual void AddJobs(const JobHandle *inHandles, uint inNumHandles) override { /* We don't need to track jobs */ }

	protected:
		/// Called by a Job to mark that it is finished
		virtual void OnJobFinished(Job *inJob) override { /* We don't need to track jobs */ }
	};

	// See JobSystem
	virtual void QueueJob(Job *inJob) override;
	virtual void QueueJobs(Job **inJobs, uint inNumJobs) override;
	virtual void FreeJob(Job *inJob) override;

	/// Shared barrier since the barrier implementation does nothing
	BarrierImpl mDummyBarrier;

	/// Array of jobs (fixed size)
	using AvailableJobs = FixedSizeFreeList<Job>;
	AvailableJobs mJobs;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,351 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/JobSystemThreadPool.h>
#include <Jolt/Core/Profiler.h>
#include <Jolt/Core/FPException.h>
#include <Jolt/Core/IncludeWindows.h>
#ifdef JPH_PLATFORM_LINUX
#include <sys/prctl.h>
#endif
JPH_NAMESPACE_BEGIN
// Initialize the thread pool: barriers, job free list, the lock free job queue and the worker threads
void JobSystemThreadPool::Init(uint inMaxJobs, uint inMaxBarriers, int inNumThreads)
{
	JobSystemWithBarrier::Init(inMaxBarriers);

	// Init freelist of jobs
	mJobs.Init(inMaxJobs, inMaxJobs);

	// Init queue: all slots start out empty
	for (atomic<Job *> &j : mQueue)
		j = nullptr;

	// Start the worker threads
	StartThreads(inNumThreads);
}
// Convenience constructor that immediately initializes the job system
JobSystemThreadPool::JobSystemThreadPool(uint inMaxJobs, uint inMaxBarriers, int inNumThreads)
{
	Init(inMaxJobs, inMaxBarriers, inNumThreads);
}
void JobSystemThreadPool::StartThreads([[maybe_unused]] int inNumThreads)
{
#if !defined(JPH_CPU_WASM) || defined(__EMSCRIPTEN_PTHREADS__) // If we're running without threads support we cannot create threads and we ignore the inNumThreads parameter
	// Auto detect number of threads: one per hardware thread, minus one for the calling (main) thread
	if (inNumThreads < 0)
		inNumThreads = thread::hardware_concurrency() - 1;

	// If no threads are requested we're done
	if (inNumThreads == 0)
		return;

	// Don't quit the threads
	mQuit = false;

	// Allocate heads (one per worker thread), each initialized to the start of the queue
	mHeads = reinterpret_cast<atomic<uint> *>(Allocate(sizeof(atomic<uint>) * inNumThreads));
	for (int i = 0; i < inNumThreads; ++i)
		mHeads[i] = 0;

	// Start running threads
	JPH_ASSERT(mThreads.empty());
	mThreads.reserve(inNumThreads);
	for (int i = 0; i < inNumThreads; ++i)
		mThreads.emplace_back([this, i] { ThreadMain(i); });
#endif
}
JobSystemThreadPool::~JobSystemThreadPool()
{
	// Stop all worker threads (joins them and drains any remaining jobs)
	StopThreads();
}
// Stops and joins all worker threads, then executes any jobs still in the queue on the
// calling thread so their references get released.
void JobSystemThreadPool::StopThreads()
{
	if (mThreads.empty())
		return;

	// Signal threads that we want to stop and wake them up
	mQuit = true;
	mSemaphore.Release((uint)mThreads.size());

	// Wait for all threads to finish
	for (thread &t : mThreads)
		if (t.joinable())
			t.join();

	// Delete all threads
	mThreads.clear();

	// Ensure that there are no lingering jobs in the queue
	for (uint head = 0; head != mTail; ++head)
	{
		// Fetch job
		Job *job_ptr = mQueue[head & (cQueueLength - 1)].exchange(nullptr);
		if (job_ptr != nullptr)
		{
			// And execute it
			job_ptr->Execute();
			job_ptr->Release();
		}
	}

	// Destroy heads and reset tail
	Free(mHeads);
	mHeads = nullptr;
	mTail = 0;
}
// Allocates a job from the free list; if the free list is exhausted this busy-waits
// (sleeping 100us per attempt) until another job is freed.
JobHandle JobSystemThreadPool::CreateJob(const char *inJobName, ColorArg inColor, const JobFunction &inJobFunction, uint32 inNumDependencies)
{
	JPH_PROFILE_FUNCTION();

	// Loop until we can get a job from the free list
	uint32 index;
	for (;;)
	{
		index = mJobs.ConstructObject(inJobName, inColor, this, inJobFunction, inNumDependencies);
		if (index != AvailableJobs::cInvalidObjectIndex)
			break;
		JPH_ASSERT(false, "No jobs available!");
		std::this_thread::sleep_for(std::chrono::microseconds(100));
	}
	Job *job = &mJobs.Get(index);

	// Construct handle to keep a reference, the job is queued below and may immediately complete
	JobHandle handle(job);

	// If there are no dependencies, queue the job now
	if (inNumDependencies == 0)
		QueueJob(job);

	// Return the handle
	return handle;
}
// Returns the job's storage to the fixed size free list
void JobSystemThreadPool::FreeJob(Job *inJob)
{
	mJobs.DestructObject(inJob);
}
// Get the head of the thread that has processed the least amount of jobs
uint JobSystemThreadPool::GetHead() const
{
	// Find the minimal value across all threads (start from mTail so an empty thread list yields the tail)
	uint head = mTail;
	for (size_t i = 0; i < mThreads.size(); ++i)
		head = min(head, mHeads[i].load());
	return head;
}
// Lock free insertion of a job into the circular queue; sleeps while the queue is full.
// Note that indices mHeads/mTail grow monotonically; the slot is found by masking with cQueueLength - 1.
void JobSystemThreadPool::QueueJobInternal(Job *inJob)
{
	// Add reference to job because we're adding the job to the queue
	inJob->AddRef();

	// Need to read head first because otherwise the tail can already have passed the head
	// We read the head outside of the loop since it involves iterating over all threads and we only need to update
	// it if there's not enough space in the queue.
	uint head = GetHead();

	for (;;)
	{
		// Check if there's space in the queue
		uint old_value = mTail;
		if (old_value - head >= cQueueLength)
		{
			// We calculated the head outside of the loop, update head (and we also need to update tail to prevent it from passing head)
			head = GetHead();
			old_value = mTail;

			// Second check if there's space in the queue
			if (old_value - head >= cQueueLength)
			{
				// Wake up all threads in order to ensure that they can clear any nullptrs they may not have processed yet
				mSemaphore.Release((uint)mThreads.size());

				// Sleep a little (we have to wait for other threads to update their head pointer in order for us to be able to continue)
				std::this_thread::sleep_for(std::chrono::microseconds(100));
				continue;
			}
		}

		// Write the job pointer if the slot is empty
		Job *expected_job = nullptr;
		bool success = mQueue[old_value & (cQueueLength - 1)].compare_exchange_strong(expected_job, inJob);

		// Regardless of who wrote the slot, we will update the tail (if the successful thread got scheduled out
		// after writing the pointer we still want to be able to continue)
		mTail.compare_exchange_strong(old_value, old_value + 1);

		// If we successfully added our job we're done
		if (success)
			break;
	}
}
void JobSystemThreadPool::QueueJob(Job *inJob)
{
	JPH_PROFILE_FUNCTION();

	// If we have no worker threads, we can't queue the job either. We assume in this case that the job will be added to a barrier and that the barrier will execute the job when it's Wait() function is called.
	if (mThreads.empty())
		return;

	// Queue the job
	QueueJobInternal(inJob);

	// Wake up thread
	mSemaphore.Release();
}
void JobSystemThreadPool::QueueJobs(Job **inJobs, uint inNumJobs)
{
	JPH_PROFILE_FUNCTION();

	JPH_ASSERT(inNumJobs > 0);

	// If we have no worker threads, we can't queue the job either. We assume in this case that the job will be added to a barrier and that the barrier will execute the job when it's Wait() function is called.
	if (mThreads.empty())
		return;

	// Queue all jobs
	for (Job **job = inJobs, **job_end = inJobs + inNumJobs; job < job_end; ++job)
		QueueJobInternal(*job);

	// Wake up threads (no more than one per queued job, no more than the number of threads we have)
	mSemaphore.Release(min(inNumJobs, (uint)mThreads.size()));
}
#if defined(JPH_PLATFORM_WINDOWS)
#if !defined(JPH_COMPILER_MINGW) // MinGW doesn't support __try/__except)
// Sets the current thread name in MSVC debugger by raising exception 0x406D1388,
// which the debugger interprets as a thread-naming notification.
static void RaiseThreadNameException(const char *inName)
{
#pragma pack(push, 8)
	struct THREADNAME_INFO
	{
		DWORD	dwType;		// Must be 0x1000.
		LPCSTR	szName;		// Pointer to name (in user addr space).
		DWORD	dwThreadID;	// Thread ID (-1=caller thread).
		DWORD	dwFlags;	// Reserved for future use, must be zero.
	};
#pragma pack(pop)

	THREADNAME_INFO info;
	info.dwType = 0x1000;
	info.szName = inName;
	info.dwThreadID = (DWORD)-1;
	info.dwFlags = 0;

	__try
	{
		RaiseException(0x406D1388, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR *)&info);
	}
	__except(EXCEPTION_EXECUTE_HANDLER)
	{
		// Swallow the exception; it exists only so the debugger can pick up the name
	}
}
#endif // !JPH_COMPILER_MINGW
// Sets the current thread name. Prefers the SetThreadDescription API (looked up dynamically
// since it may not be present in Kernel32); falls back to the debugger exception trick.
static void SetThreadName(const char* inName)
{
	JPH_SUPPRESS_WARNING_PUSH

	// Suppress casting warning, it's fine here as GetProcAddress doesn't really return a FARPROC
	JPH_CLANG_SUPPRESS_WARNING("-Wcast-function-type") // error : cast from 'FARPROC' (aka 'long long (*)()') to 'SetThreadDescriptionFunc' (aka 'long (*)(void *, const wchar_t *)') converts to incompatible function type
	JPH_CLANG_SUPPRESS_WARNING("-Wcast-function-type-strict") // error : cast from 'FARPROC' (aka 'long long (*)()') to 'SetThreadDescriptionFunc' (aka 'long (*)(void *, const wchar_t *)') converts to incompatible function type
	JPH_MSVC_SUPPRESS_WARNING(4191) // reinterpret_cast' : unsafe conversion from 'FARPROC' to 'SetThreadDescriptionFunc'. Calling this function through the result pointer may cause your program to fail

	using SetThreadDescriptionFunc = HRESULT(WINAPI*)(HANDLE hThread, PCWSTR lpThreadDescription);
	static SetThreadDescriptionFunc SetThreadDescription = reinterpret_cast<SetThreadDescriptionFunc>(GetProcAddress(GetModuleHandleW(L"Kernel32.dll"), "SetThreadDescription"));

	JPH_SUPPRESS_WARNING_POP

	if (SetThreadDescription)
	{
		// Convert the UTF-8 name to a wide string (truncated to the buffer size), then set it
		wchar_t name_buffer[64] = { 0 };
		if (MultiByteToWideChar(CP_UTF8, 0, inName, -1, name_buffer, sizeof(name_buffer) / sizeof(wchar_t) - 1) == 0)
			return;

		SetThreadDescription(GetCurrentThread(), name_buffer);
	}
#if !defined(JPH_COMPILER_MINGW)
	else if (IsDebuggerPresent())
		RaiseThreadNameException(inName);
#endif // !JPH_COMPILER_MINGW
}
#elif defined(JPH_PLATFORM_LINUX)
// Sets the current thread name through prctl
static void SetThreadName(const char *inName)
{
	JPH_ASSERT(strlen(inName) < 16); // String will be truncated if it is longer
	prctl(PR_SET_NAME, inName, 0, 0, 0);
}
#endif // JPH_PLATFORM_LINUX
// Entry point of a worker thread: repeatedly waits on the semaphore, then drains jobs
// from the lock free queue, advancing this thread's private head index.
void JobSystemThreadPool::ThreadMain(int inThreadIndex)
{
	// Name the thread
	char name[64];
	snprintf(name, sizeof(name), "Worker %d", int(inThreadIndex + 1));

#if defined(JPH_PLATFORM_WINDOWS) || defined(JPH_PLATFORM_LINUX)
	SetThreadName(name);
#endif // JPH_PLATFORM_WINDOWS || JPH_PLATFORM_LINUX

	// Enable floating point exceptions
	FPExceptionsEnable enable_exceptions;
	JPH_UNUSED(enable_exceptions);

	JPH_PROFILE_THREAD_START(name);

	// Call the thread init function
	mThreadInitFunction(inThreadIndex);

	// This thread's head index into the shared queue
	atomic<uint> &head = mHeads[inThreadIndex];

	while (!mQuit)
	{
		// Wait for jobs
		mSemaphore.Acquire();

		{
			JPH_PROFILE("Executing Jobs");

			// Loop over the queue
			while (head != mTail)
			{
				// Exchange any job pointer we find with a nullptr
				atomic<Job *> &job = mQueue[head & (cQueueLength - 1)];
				if (job.load() != nullptr) // Cheap load first so we avoid an atomic exchange on slots that are already empty
				{
					Job *job_ptr = job.exchange(nullptr);
					if (job_ptr != nullptr) // Another thread may have claimed the job between the load and the exchange
					{
						// And execute it
						job_ptr->Execute();
						job_ptr->Release();
					}
				}
				head++;
			}
		}
	}

	// Call the thread exit function
	mThreadExitFunction(inThreadIndex);

	JPH_PROFILE_THREAD_END();
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,101 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/JobSystemWithBarrier.h>
#include <Jolt/Core/FixedSizeFreeList.h>
#include <Jolt/Core/Semaphore.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <thread>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// Things we're using from STL
using std::thread;
/// Implementation of a JobSystem using a thread pool
///
/// Note that this is considered an example implementation. It is expected that when you integrate
/// the physics engine into your own project that you'll provide your own implementation of the
/// JobSystem built on top of whatever job system your project uses.
class JPH_EXPORT JobSystemThreadPool final : public JobSystemWithBarrier
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Creates a thread pool.
	/// @see JobSystemThreadPool::Init
	JobSystemThreadPool(uint inMaxJobs, uint inMaxBarriers, int inNumThreads = -1);
	JobSystemThreadPool() = default;
	virtual ~JobSystemThreadPool() override;

	/// Functions to call when a thread is initialized or exits, must be set before calling Init()
	using InitExitFunction = function<void(int)>;
	void SetThreadInitFunction(const InitExitFunction &inInitFunction) { mThreadInitFunction = inInitFunction; }
	void SetThreadExitFunction(const InitExitFunction &inExitFunction) { mThreadExitFunction = inExitFunction; }

	/// Initialize the thread pool
	/// @param inMaxJobs Max number of jobs that can be allocated at any time
	/// @param inMaxBarriers Max number of barriers that can be allocated at any time
	/// @param inNumThreads Number of threads to start (the number of concurrent jobs is 1 more because the main thread will also run jobs while waiting for a barrier to complete). Use -1 to auto detect the amount of CPU's.
	void Init(uint inMaxJobs, uint inMaxBarriers, int inNumThreads = -1);

	// See JobSystem
	virtual int GetMaxConcurrency() const override { return int(mThreads.size()) + 1; }
	virtual JobHandle CreateJob(const char *inName, ColorArg inColor, const JobFunction &inJobFunction, uint32 inNumDependencies = 0) override;

	/// Change the max concurrency after initialization
	void SetNumThreads(int inNumThreads) { StopThreads(); StartThreads(inNumThreads); }

protected:
	// See JobSystem
	virtual void QueueJob(Job *inJob) override;
	virtual void QueueJobs(Job **inJobs, uint inNumJobs) override;
	virtual void FreeJob(Job *inJob) override;

private:
	/// Start/stop the worker threads
	void StartThreads(int inNumThreads);
	void StopThreads();

	/// Entry point for a thread
	void ThreadMain(int inThreadIndex);

	/// Get the head of the thread that has processed the least amount of jobs
	inline uint GetHead() const;

	/// Internal helper function to queue a job
	inline void QueueJobInternal(Job *inJob);

	/// Functions to call when initializing or exiting a thread
	InitExitFunction mThreadInitFunction = [](int) { };
	InitExitFunction mThreadExitFunction = [](int) { };

	/// Array of jobs (fixed size)
	using AvailableJobs = FixedSizeFreeList<Job>;
	AvailableJobs mJobs;

	/// Threads running jobs
	Array<thread> mThreads;

	// The job queue (circular buffer of monotonically increasing indices masked by cQueueLength - 1)
	static constexpr uint32 cQueueLength = 1024;
	static_assert(IsPowerOf2(cQueueLength)); // We do bit operations and require queue length to be a power of 2
	atomic<Job *> mQueue[cQueueLength];

	// Head and tail of the queue, do this value modulo cQueueLength - 1 to get the element in the mQueue array
	atomic<uint> * mHeads = nullptr; ///< Per executing thread the head of the current queue
	alignas(JPH_CACHE_LINE_SIZE) atomic<uint> mTail = 0; ///< Tail (write end) of the queue

	// Semaphore used to signal worker threads that there is new work
	Semaphore mSemaphore;

	/// Boolean to indicate that we want to stop the job system
	atomic<bool> mQuit = false;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,230 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2023 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/JobSystemWithBarrier.h>
#include <Jolt/Core/Profiler.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <thread>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// Initialize the job list to all empty slots
JobSystemWithBarrier::BarrierImpl::BarrierImpl()
{
	for (atomic<Job *> &j : mJobs)
		j = nullptr;
}
JobSystemWithBarrier::BarrierImpl::~BarrierImpl()
{
	// A barrier may not be destroyed while it still tracks jobs
	JPH_ASSERT(IsEmpty());
}
// Adds a single job to this barrier so Wait() will track (and possibly execute) it
void JobSystemWithBarrier::BarrierImpl::AddJob(const JobHandle &inJob)
{
	JPH_PROFILE_FUNCTION();

	bool release_semaphore = false;

	// Set the barrier on the job, this returns true if the barrier was successfully set (otherwise the job is already done and we don't need to add it to our list)
	Job *job = inJob.GetPtr();
	if (job->SetBarrier(this))
	{
		// If the job can be executed we want to release the semaphore an extra time to allow the waiting thread to start executing it
		mNumToAcquire++;
		if (job->CanBeExecuted())
		{
			release_semaphore = true;
			mNumToAcquire++;
		}

		// Add the job to our job list
		job->AddRef();
		uint write_index = mJobWriteIndex++;
		while (write_index - mJobReadIndex >= cMaxJobs)
		{
			JPH_ASSERT(false, "Barrier full, stalling!");
			std::this_thread::sleep_for(std::chrono::microseconds(100));
		}
		mJobs[write_index & (cMaxJobs - 1)] = job;
	}

	// Notify waiting thread that a new executable job is available
	if (release_semaphore)
		mSemaphore.Release();
}
// Batch version of AddJob. Unlike AddJob, the semaphore is released at most once for
// the whole batch (only the first executable job triggers the extra release).
void JobSystemWithBarrier::BarrierImpl::AddJobs(const JobHandle *inHandles, uint inNumHandles)
{
	JPH_PROFILE_FUNCTION();

	bool release_semaphore = false;

	for (const JobHandle *handle = inHandles, *handles_end = inHandles + inNumHandles; handle < handles_end; ++handle)
	{
		// Set the barrier on the job, this returns true if the barrier was successfully set (otherwise the job is already done and we don't need to add it to our list)
		Job *job = handle->GetPtr();
		if (job->SetBarrier(this))
		{
			// If the job can be executed we want to release the semaphore an extra time to allow the waiting thread to start executing it
			mNumToAcquire++;
			if (!release_semaphore && job->CanBeExecuted())
			{
				release_semaphore = true;
				mNumToAcquire++;
			}

			// Add the job to our job list
			job->AddRef();
			uint write_index = mJobWriteIndex++;
			while (write_index - mJobReadIndex >= cMaxJobs)
			{
				JPH_ASSERT(false, "Barrier full, stalling!");
				std::this_thread::sleep_for(std::chrono::microseconds(100));
			}
			mJobs[write_index & (cMaxJobs - 1)] = job;
		}
	}

	// Notify waiting thread that a new executable job is available
	if (release_semaphore)
		mSemaphore.Release();
}
void JobSystemWithBarrier::BarrierImpl::OnJobFinished(Job *inJob)
{
	JPH_PROFILE_FUNCTION();

	// Wake the thread blocked in Wait() so it can retire this job
	mSemaphore.Release();
}
// Waits until every job added to this barrier is done; while waiting, the calling thread
// helps out by executing jobs belonging to this barrier.
void JobSystemWithBarrier::BarrierImpl::Wait()
{
	while (mNumToAcquire > 0)
	{
		{
			JPH_PROFILE("Execute Jobs");

			// Go through all jobs
			bool has_executed;
			do
			{
				has_executed = false;

				// Loop through the jobs and erase jobs from the beginning of the list that are done
				while (mJobReadIndex < mJobWriteIndex)
				{
					atomic<Job *> &job = mJobs[mJobReadIndex & (cMaxJobs - 1)];
					Job *job_ptr = job.load();
					if (job_ptr == nullptr || !job_ptr->IsDone())
						break;

					// Job is finished, release it
					job_ptr->Release();
					job = nullptr;
					++mJobReadIndex;
				}

				// Loop through the jobs and execute the first executable job
				for (uint index = mJobReadIndex; index < mJobWriteIndex; ++index)
				{
					const atomic<Job *> &job = mJobs[index & (cMaxJobs - 1)];
					Job *job_ptr = job.load();
					if (job_ptr != nullptr && job_ptr->CanBeExecuted())
					{
						// This will only execute the job if it has not already executed
						job_ptr->Execute();
						has_executed = true;
						break;
					}
				}
			} while (has_executed);
		}

		// Wait for another thread to wake us when either there is more work to do or when all jobs have completed.
		// When there have been multiple releases, we acquire them all at the same time to avoid needlessly spinning on executing jobs.
		// Note that using GetValue is inherently unsafe since we can read a stale value, but this is not an issue here as this is the only
		// place where we acquire the semaphore. Other threads only release it, so we can only read a value that is lower or equal to the actual value.
		int num_to_acquire = max(1, mSemaphore.GetValue());
		mSemaphore.Acquire(num_to_acquire);
		mNumToAcquire -= num_to_acquire;
	}

	// All jobs should be done now, release them
	while (mJobReadIndex < mJobWriteIndex)
	{
		atomic<Job *> &job = mJobs[mJobReadIndex & (cMaxJobs - 1)];
		Job *job_ptr = job.load();
		JPH_ASSERT(job_ptr != nullptr && job_ptr->IsDone());
		job_ptr->Release();
		job = nullptr;
		++mJobReadIndex;
	}
}
// Initialize the barrier pool
// @param inMaxBarriers Max number of barriers that can be allocated at any time
void JobSystemWithBarrier::Init(uint inMaxBarriers)
{
	JPH_ASSERT(mBarriers == nullptr); // Already initialized?

	// Init freelist of barriers (all constructed up front, handed out via the mInUse flag)
	mMaxBarriers = inMaxBarriers;
	mBarriers = new BarrierImpl [inMaxBarriers];
}
// Convenience constructor that immediately initializes the barrier pool
JobSystemWithBarrier::JobSystemWithBarrier(uint inMaxBarriers)
{
	Init(inMaxBarriers);
}
JobSystemWithBarrier::~JobSystemWithBarrier()
{
	// Ensure that none of the barriers are used
#ifdef JPH_ENABLE_ASSERTS
	for (const BarrierImpl *b = mBarriers, *b_end = mBarriers + mMaxBarriers; b < b_end; ++b)
		JPH_ASSERT(!b->mInUse);
#endif // JPH_ENABLE_ASSERTS

	delete [] mBarriers;
}
// Hands out the first unused barrier from the pool; returns nullptr when all barriers are in use
JobSystem::Barrier *JobSystemWithBarrier::CreateBarrier()
{
	JPH_PROFILE_FUNCTION();

	// Find the first unused barrier (compare_exchange claims it atomically)
	for (uint32 index = 0; index < mMaxBarriers; ++index)
	{
		bool expected = false;
		if (mBarriers[index].mInUse.compare_exchange_strong(expected, true))
			return &mBarriers[index];
	}

	return nullptr;
}
// Returns a barrier to the pool (the barrier object itself stays constructed)
void JobSystemWithBarrier::DestroyBarrier(Barrier *inBarrier)
{
	JPH_PROFILE_FUNCTION();

	// Check that no jobs are in the barrier
	JPH_ASSERT(static_cast<BarrierImpl *>(inBarrier)->IsEmpty());

	// Flag the barrier as unused
	bool expected = true;
	static_cast<BarrierImpl *>(inBarrier)->mInUse.compare_exchange_strong(expected, false);
	JPH_ASSERT(expected); // The barrier must have been in use
}
void JobSystemWithBarrier::WaitForJobs(Barrier *inBarrier)
{
	JPH_PROFILE_FUNCTION();

	// Let our barrier implementation wait for the jobs
	static_cast<BarrierImpl *>(inBarrier)->Wait();
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,85 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2023 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/JobSystem.h>
#include <Jolt/Core/Semaphore.h>
JPH_NAMESPACE_BEGIN
/// Implementation of the Barrier class for a JobSystem
///
/// This class can be used to make it easier to create a new JobSystem implementation that integrates with your own job system.
/// It will implement all functionality relating to barriers, so the only functions that are left to be implemented are:
///
/// * JobSystem::GetMaxConcurrency
/// * JobSystem::CreateJob
/// * JobSystem::FreeJob
/// * JobSystem::QueueJob/QueueJobs
///
/// See instructions in JobSystem for more information on how to implement these.
class JPH_EXPORT JobSystemWithBarrier : public JobSystem
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructs barriers
	/// @see JobSystemWithBarrier::Init
	explicit JobSystemWithBarrier(uint inMaxBarriers);
	JobSystemWithBarrier() = default;
	virtual ~JobSystemWithBarrier() override;

	/// Initialize the barriers
	/// @param inMaxBarriers Max number of barriers that can be allocated at any time
	void Init(uint inMaxBarriers);

	// See JobSystem
	virtual Barrier * CreateBarrier() override;
	virtual void DestroyBarrier(Barrier *inBarrier) override;
	virtual void WaitForJobs(Barrier *inBarrier) override;

private:
	class BarrierImpl : public Barrier
	{
	public:
		JPH_OVERRIDE_NEW_DELETE

		/// Constructor
		BarrierImpl();
		virtual ~BarrierImpl() override;

		// See Barrier
		virtual void AddJob(const JobHandle &inJob) override;
		virtual void AddJobs(const JobHandle *inHandles, uint inNumHandles) override;

		/// Check if there are any jobs in the job barrier
		inline bool IsEmpty() const { return mJobReadIndex == mJobWriteIndex; }

		/// Wait for all jobs in this job barrier, while waiting, execute jobs that are part of this barrier on the current thread
		void Wait();

		/// Flag to indicate if a barrier has been handed out
		atomic<bool> mInUse { false };

	protected:
		/// Called by a Job to mark that it is finished
		virtual void OnJobFinished(Job *inJob) override;

		/// Jobs queue for the barrier (circular buffer of monotonically increasing indices masked by cMaxJobs - 1)
		static constexpr uint cMaxJobs = 2048;
		static_assert(IsPowerOf2(cMaxJobs)); // We do bit operations and require max jobs to be a power of 2
		atomic<Job *> mJobs[cMaxJobs]; ///< List of jobs that are part of this barrier, nullptrs for empty slots
		alignas(JPH_CACHE_LINE_SIZE) atomic<uint> mJobReadIndex { 0 }; ///< First job that could be valid (modulo cMaxJobs), can be nullptr if other thread is still working on adding the job
		alignas(JPH_CACHE_LINE_SIZE) atomic<uint> mJobWriteIndex { 0 }; ///< First job that can be written (modulo cMaxJobs)
		atomic<int> mNumToAcquire { 0 }; ///< Number of times the semaphore has been released, the barrier should acquire the semaphore this many times (written at the same time as mJobWriteIndex so ok to put in same cache line)
		Semaphore mSemaphore; ///< Semaphore used by finishing jobs to signal the barrier that they're done
	};

	/// Array of barriers (we keep them constructed all the time since constructing a semaphore/mutex is not cheap)
	uint mMaxBarriers = 0; ///< Max amount of barriers
	BarrierImpl * mBarriers = nullptr; ///< List of the actual barriers
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,51 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/LinearCurve.h>
#include <Jolt/Core/StreamIn.h>
#include <Jolt/Core/StreamOut.h>
#include <Jolt/ObjectStream/TypeDeclarations.h>
JPH_NAMESPACE_BEGIN
// Reflection/serialization registration for LinearCurve::Point (its mX, mY members)
JPH_IMPLEMENT_SERIALIZABLE_NON_VIRTUAL(LinearCurve::Point)
{
	JPH_ADD_ATTRIBUTE(Point, mX)
	JPH_ADD_ATTRIBUTE(Point, mY)
}

// Reflection/serialization registration for LinearCurve (its list of points)
JPH_IMPLEMENT_SERIALIZABLE_NON_VIRTUAL(LinearCurve)
{
	JPH_ADD_ATTRIBUTE(LinearCurve, mPoints)
}
float LinearCurve::GetValue(float inX) const
{
if (mPoints.empty())
return 0.0f;
Points::const_iterator i2 = std::lower_bound(mPoints.begin(), mPoints.end(), inX, [](const Point &inPoint, float inValue) { return inPoint.mX < inValue; });
if (i2 == mPoints.begin())
return mPoints.front().mY;
else if (i2 == mPoints.end())
return mPoints.back().mY;
Points::const_iterator i1 = i2 - 1;
return i1->mY + (inX - i1->mX) * (i2->mY - i1->mY) / (i2->mX - i1->mX);
}
// Saves the curve's point list in binary form to inStream
void LinearCurve::SaveBinaryState(StreamOut &inStream) const
{
	inStream.Write(mPoints);
}
// Restores the curve's point list from inStream (counterpart of SaveBinaryState)
void LinearCurve::RestoreBinaryState(StreamIn &inStream)
{
	inStream.Read(mPoints);
}
JPH_NAMESPACE_END

View File

@ -0,0 +1,67 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/ObjectStream/SerializableObject.h>
#include <Jolt/Core/QuickSort.h>
JPH_NAMESPACE_BEGIN
class StreamOut;
class StreamIn;
/// A set of points (x, y) that form a piecewise linear curve, sampled through GetValue
class JPH_EXPORT LinearCurve
{
	JPH_DECLARE_SERIALIZABLE_NON_VIRTUAL(JPH_EXPORT, LinearCurve)

public:
	/// A point on the curve
	class Point
	{
		JPH_DECLARE_SERIALIZABLE_NON_VIRTUAL(JPH_EXPORT, Point)

	public:
		float mX = 0.0f;
		float mY = 0.0f;
	};

	/// Remove all points
	void Clear() { mPoints.clear(); }

	/// Reserve memory for inNumPoints points
	void Reserve(uint inNumPoints) { mPoints.reserve(inNumPoints); }

	/// Add a point to the curve. Points must be inserted in ascending X or Sort() needs to be called when all points have been added.
	/// @param inX X value
	/// @param inY Y value
	void AddPoint(float inX, float inY) { mPoints.push_back({ inX, inY }); }

	/// Sort the points on X ascending
	void Sort() { QuickSort(mPoints.begin(), mPoints.end(), [](const Point &inLHS, const Point &inRHS) { return inLHS.mX < inRHS.mX; }); }

	/// Get the lowest X value (0 when the curve is empty)
	float GetMinX() const { return mPoints.empty()? 0.0f : mPoints.front().mX; }

	/// Get the highest X value (0 when the curve is empty)
	float GetMaxX() const { return mPoints.empty()? 0.0f : mPoints.back().mX; }

	/// Sample value on the curve
	/// @param inX X value to sample at
	/// @return Interpolated Y value
	float GetValue(float inX) const;

	/// Saves the state of this object in binary form to inStream.
	void SaveBinaryState(StreamOut &inStream) const;

	/// Restore the state of this object from inStream.
	void RestoreBinaryState(StreamIn &inStream);

	/// The points on the curve, should be sorted ascending by x
	using Points = Array<Point>;
	Points mPoints;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,182 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/Atomics.h>
JPH_NAMESPACE_BEGIN
/// Allocator for a lock free hash map: a single contiguous store that is handed out
/// in blocks by bumping an atomic write offset (individual allocations cannot be freed,
/// only the whole store can be cleared).
class LFHMAllocator : public NonCopyable
{
public:
	/// Destructor
	inline ~LFHMAllocator();

	/// Initialize the allocator
	/// @param inObjectStoreSizeBytes Number of bytes to reserve for all key value pairs
	inline void Init(uint inObjectStoreSizeBytes);

	/// Clear all allocations
	inline void Clear();

	/// Allocate a new block of data
	/// @param inBlockSize Size of block to allocate (will potentially return a smaller block if memory is full).
	/// @param ioBegin Should be the start of the first free byte in current memory block on input, will contain the start of the first free byte in allocated block on return.
	/// @param ioEnd Should be the byte beyond the current memory block on input, will contain the byte beyond the allocated block on return.
	inline void Allocate(uint32 inBlockSize, uint32 &ioBegin, uint32 &ioEnd);

	/// Convert a pointer to an offset
	template <class T>
	inline uint32 ToOffset(const T *inData) const;

	/// Convert an offset to a pointer
	template <class T>
	inline T * FromOffset(uint32 inOffset) const;

private:
	uint8 * mObjectStore = nullptr; ///< This contains a contiguous list of objects (possibly of varying size)
	uint32 mObjectStoreSizeBytes = 0; ///< The size of mObjectStore in bytes
	atomic<uint32> mWriteOffset { 0 }; ///< Next offset to write to in mObjectStore
};
/// Allocator context object for a lock free hash map that allocates a larger memory block at once and hands it out in smaller portions.
/// This avoids contention on the atomic LFHMAllocator::mWriteOffset.
class LFHMAllocatorContext : public NonCopyable
{
public:
	/// Construct a new allocator context
	inline LFHMAllocatorContext(LFHMAllocator &inAllocator, uint32 inBlockSize);

	/// @brief Allocate data block
	/// @param inSize Size of block to allocate.
	/// @param inAlignment Alignment of block to allocate.
	/// @param outWriteOffset Offset in buffer where block is located
	/// @return True if allocation succeeded
	inline bool Allocate(uint32 inSize, uint32 inAlignment, uint32 &outWriteOffset);

private:
	LFHMAllocator & mAllocator; ///< The underlying allocator new blocks are fetched from
	uint32 mBlockSize; ///< Size of the blocks to fetch from the allocator
	uint32 mBegin = 0; ///< Offset of the first free byte in the currently held block
	uint32 mEnd = 0; ///< Offset one past the end of the currently held block
};
/// Very simple lock free hash map that only allows insertion, retrieval and provides a fixed amount of buckets and fixed storage.
/// Note: This class currently assumes key and value are simple types that need no calls to the destructor.
template <class Key, class Value>
class LockFreeHashMap : public NonCopyable
{
public:
	using MapType = LockFreeHashMap<Key, Value>;

	/// Constructor, all key/value storage comes from the provided allocator
	explicit LockFreeHashMap(LFHMAllocator &inAllocator) : mAllocator(inAllocator) { }

	/// Destructor
	~LockFreeHashMap();

	/// Initialization
	/// @param inMaxBuckets Max amount of buckets to use in the hashmap. Must be power of 2.
	void Init(uint32 inMaxBuckets);

	/// Remove all elements.
	/// Note that this cannot happen simultaneously with adding new elements.
	void Clear();

	/// Get the current amount of buckets that the map is using
	uint32 GetNumBuckets() const { return mNumBuckets; }

	/// Get the maximum amount of buckets that this map supports
	uint32 GetMaxBuckets() const { return mMaxBuckets; }

	/// Update the number of buckets. This must be done after clearing the map and cannot be done concurrently with any other operations on the map.
	/// Note that the number of buckets can never become bigger than the specified max buckets during initialization and that it must be a power of 2.
	void SetNumBuckets(uint32 inNumBuckets);

	/// A key / value pair that is inserted in the map
	class KeyValue
	{
	public:
		const Key & GetKey() const { return mKey; }
		Value & GetValue() { return mValue; }
		const Value & GetValue() const { return mValue; }

	private:
		template <class K, class V> friend class LockFreeHashMap;

		Key mKey;				///< Key for this entry
		uint32 mNextOffset;		///< Offset in mObjectStore of next KeyValue entry with same hash
		Value mValue;			///< Value for this entry + optionally extra bytes
	};

	/// Insert a new element, returns null if map full.
	/// Multiple threads can be inserting in the map at the same time.
	template <class... Params>
	inline KeyValue * Create(LFHMAllocatorContext &ioContext, const Key &inKey, uint64 inKeyHash, int inExtraBytes, Params &&... inConstructorParams);

	/// Find an element, returns null if not found
	inline const KeyValue * Find(const Key &inKey, uint64 inKeyHash) const;

	/// Value of an invalid handle
	const static uint32 cInvalidHandle = uint32(-1);

	/// Get convert key value pair to uint32 handle
	inline uint32 ToHandle(const KeyValue *inKeyValue) const;

	/// Convert uint32 handle back to key and value
	inline const KeyValue * FromHandle(uint32 inHandle) const;

#ifdef JPH_ENABLE_ASSERTS
	/// Get the number of key value pairs that this map currently contains.
	/// Available only when asserts are enabled because adding elements creates contention on this atomic and negatively affects performance.
	inline uint32 GetNumKeyValues() const { return mNumKeyValues; }
#endif // JPH_ENABLE_ASSERTS

	/// Get all key/value pairs
	inline void GetAllKeyValues(Array<const KeyValue *> &outAll) const;

	/// Non-const iterator
	struct Iterator
	{
		/// Comparison
		bool operator == (const Iterator &inRHS) const { return mMap == inRHS.mMap && mBucket == inRHS.mBucket && mOffset == inRHS.mOffset; }
		bool operator != (const Iterator &inRHS) const { return !(*this == inRHS); }

		/// Convert to key value pair
		KeyValue & operator * ();

		/// Next item
		Iterator & operator ++ ();

		MapType * mMap;			///< Map that is being iterated over
		uint32 mBucket;			///< Current bucket index
		uint32 mOffset;			///< Offset in the object store of the current KeyValue (cInvalidHandle when exhausted)
	};

	/// Iterate over the map, note that it is not safe to do this in parallel to Clear().
	/// It is safe to do this while adding elements to the map, but newly added elements may or may not be returned by the iterator.
	Iterator begin();
	Iterator end();

#ifdef JPH_DEBUG
	/// Output stats about this map to the log
	void TraceStats() const;
#endif

private:
	LFHMAllocator & mAllocator;				///< Allocator used to allocate key value pairs

#ifdef JPH_ENABLE_ASSERTS
	atomic<uint32> mNumKeyValues = 0;		///< Number of key value pairs in the store
#endif // JPH_ENABLE_ASSERTS

	atomic<uint32> * mBuckets = nullptr;	///< This contains the offset in mObjectStore of the first object with a particular hash
	uint32 mNumBuckets = 0;					///< Current number of buckets
	uint32 mMaxBuckets = 0;					///< Maximum number of buckets
};
JPH_NAMESPACE_END
#include "LockFreeHashMap.inl"

View File

@ -0,0 +1,351 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
///////////////////////////////////////////////////////////////////////////////////
// LFHMAllocator
///////////////////////////////////////////////////////////////////////////////////
inline LFHMAllocator::~LFHMAllocator()
{
	// Free the object store; AlignedFree is documented to do nothing for nullptr, so this is safe if Init was never called
	AlignedFree(mObjectStore);
}

inline void LFHMAllocator::Init(uint inObjectStoreSizeBytes)
{
	// Init may only be called once
	JPH_ASSERT(mObjectStore == nullptr);

	mObjectStoreSizeBytes = inObjectStoreSizeBytes;
	mObjectStore = reinterpret_cast<uint8 *>(JPH::AlignedAllocate(inObjectStoreSizeBytes, 16));
}

inline void LFHMAllocator::Clear()
{
	// Simply rewind the write offset so the store is reused; no destructors are run
	// (keys and values are assumed to be trivially destructible, see LockFreeHashMap class comment)
	mWriteOffset = 0;
}
/// Hand out a block of inBlockSize bytes from the object store.
/// On input ioBegin / ioEnd describe the caller's current (possibly exhausted) block; on output they
/// describe the new block. If the new block happens to start exactly where the old block ended, the old
/// block is extended instead so its remaining bytes are not wasted.
/// When the store is full, ioBegin / ioEnd are clamped to the store size, producing an empty block.
inline void LFHMAllocator::Allocate(uint32 inBlockSize, uint32 &ioBegin, uint32 &ioEnd)
{
	// If we're already beyond the end of our buffer then don't do an atomic add.
	// It's possible that many keys are inserted after the allocator is full, making it possible
	// for mWriteOffset (uint32) to wrap around to zero. When this happens, there will be a memory corruption.
	// This way, we will be able to progress the write offset beyond the size of the buffer
	// worst case by max <CPU count> * inBlockSize.
	if (mWriteOffset.load(memory_order_relaxed) >= mObjectStoreSizeBytes)
		return;

	// Atomically fetch a block from the pool
	uint32 begin = mWriteOffset.fetch_add(inBlockSize, memory_order_relaxed);
	uint32 end = min(begin + inBlockSize, mObjectStoreSizeBytes);

	if (ioEnd == begin)
	{
		// Block is allocated straight after our previous block
		begin = ioBegin;
	}
	else
	{
		// Block is a new block; clamp in case the fetch_add raced past the end of the store
		begin = min(begin, mObjectStoreSizeBytes);
	}

	// Store the begin and end of the resulting block
	ioBegin = begin;
	ioEnd = end;
}
/// Convert a pointer inside the object store to a byte offset relative to the start of the store
template <class T>
inline uint32 LFHMAllocator::ToOffset(const T *inData) const
{
	const uint8 *data = reinterpret_cast<const uint8 *>(inData);
	JPH_ASSERT(data >= mObjectStore && data < mObjectStore + mObjectStoreSizeBytes);
	return uint32(data - mObjectStore);
}

/// Convert a byte offset relative to the start of the object store back to a typed pointer
template <class T>
inline T *LFHMAllocator::FromOffset(uint32 inOffset) const
{
	JPH_ASSERT(inOffset < mObjectStoreSizeBytes);
	return reinterpret_cast<T *>(mObjectStore + inOffset);
}
///////////////////////////////////////////////////////////////////////////////////
// LFHMAllocatorContext
///////////////////////////////////////////////////////////////////////////////////
inline LFHMAllocatorContext::LFHMAllocatorContext(LFHMAllocator &inAllocator, uint32 inBlockSize) :
	mAllocator(inAllocator),
	mBlockSize(inBlockSize)
{
}

/// Sub-allocate inSize bytes aligned to inAlignment from the context's current block, fetching a fresh
/// block of mBlockSize bytes from the shared allocator when the current block is too small.
/// Returns false when the underlying store is exhausted; on success outWriteOffset is the offset of the
/// allocation within the allocator's object store.
inline bool LFHMAllocatorContext::Allocate(uint32 inSize, uint32 inAlignment, uint32 &outWriteOffset)
{
	// Calculate needed bytes for alignment
	JPH_ASSERT(IsPowerOf2(inAlignment));
	uint32 alignment_mask = inAlignment - 1;
	uint32 alignment = (inAlignment - (mBegin & alignment_mask)) & alignment_mask;

	// Check if we have space
	if (mEnd - mBegin < inSize + alignment)
	{
		// Allocate a new block
		mAllocator.Allocate(mBlockSize, mBegin, mEnd);

		// Update alignment (the new block may start at a different offset)
		alignment = (inAlignment - (mBegin & alignment_mask)) & alignment_mask;

		// Check if we have space again
		if (mEnd - mBegin < inSize + alignment)
			return false;
	}

	// Make the allocation
	mBegin += alignment;
	outWriteOffset = mBegin;
	mBegin += inSize;
	return true;
}
///////////////////////////////////////////////////////////////////////////////////
// LockFreeHashMap
///////////////////////////////////////////////////////////////////////////////////
template <class Key, class Value>
void LockFreeHashMap<Key, Value>::Init(uint32 inMaxBuckets)
{
	// Power of 2 is required so that (hash & (num_buckets - 1)) can be used as bucket index,
	// and a multiple of 4 is required because Clear() resets the buckets 4 at a time
	JPH_ASSERT(inMaxBuckets >= 4 && IsPowerOf2(inMaxBuckets));

	// Init may only be called once
	JPH_ASSERT(mBuckets == nullptr);

	mNumBuckets = inMaxBuckets;
	mMaxBuckets = inMaxBuckets;

	// 16 byte alignment so that Clear() can use aligned 128-bit stores
	mBuckets = reinterpret_cast<atomic<uint32> *>(AlignedAllocate(inMaxBuckets * sizeof(atomic<uint32>), 16));

	Clear();
}

template <class Key, class Value>
LockFreeHashMap<Key, Value>::~LockFreeHashMap()
{
	// Note: KeyValue destructors are never run, keys and values must be trivially destructible (see class comment)
	AlignedFree(mBuckets);
}
template <class Key, class Value>
void LockFreeHashMap<Key, Value>::Clear()
{
#ifdef JPH_ENABLE_ASSERTS
	// Reset number of key value pairs
	mNumKeyValues = 0;
#endif // JPH_ENABLE_ASSERTS

	// Reset buckets 4 at a time by storing cInvalidHandle into each bucket with 128-bit aligned stores
	// (valid because mNumBuckets is a power of 2 that is >= 4, see Init/SetNumBuckets asserts)
	static_assert(sizeof(atomic<uint32>) == sizeof(uint32));
	UVec4 invalid_handle = UVec4::sReplicate(cInvalidHandle);
	uint32 *start = reinterpret_cast<uint32 *>(mBuckets);
	const uint32 *end = start + mNumBuckets;
	JPH_ASSERT(IsAligned(start, 16));
	while (start < end)
	{
		invalid_handle.StoreInt4Aligned(start);
		start += 4;
	}
}

/// Note: may only be called on an empty map (see first assert). The bucket count must stay a power of 2
/// and cannot exceed the maximum that storage was allocated for in Init().
template <class Key, class Value>
void LockFreeHashMap<Key, Value>::SetNumBuckets(uint32 inNumBuckets)
{
	JPH_ASSERT(mNumKeyValues == 0);
	JPH_ASSERT(inNumBuckets <= mMaxBuckets);
	JPH_ASSERT(inNumBuckets >= 4 && IsPowerOf2(inNumBuckets));

	mNumBuckets = inNumBuckets;
}
template <class Key, class Value>
template <class... Params>
inline typename LockFreeHashMap<Key, Value>::KeyValue *LockFreeHashMap<Key, Value>::Create(LFHMAllocatorContext &ioContext, const Key &inKey, uint64 inKeyHash, int inExtraBytes, Params &&... inConstructorParams)
{
	// This is not a multi map, test the key hasn't been inserted yet
	JPH_ASSERT(Find(inKey, inKeyHash) == nullptr);

	// Calculate total size; inExtraBytes allows the caller to store variable sized data directly after the value
	uint size = sizeof(KeyValue) + inExtraBytes;

	// Get the write offset for this key value pair, returns null when the store is full
	uint32 write_offset;
	if (!ioContext.Allocate(size, alignof(KeyValue), write_offset))
		return nullptr;

#ifdef JPH_ENABLE_ASSERTS
	// Increment amount of entries in map
	mNumKeyValues.fetch_add(1, memory_order_relaxed);
#endif // JPH_ENABLE_ASSERTS

	// Construct the key/value pair
	KeyValue *kv = mAllocator.template FromOffset<KeyValue>(write_offset);
	JPH_ASSERT(intptr_t(kv) % alignof(KeyValue) == 0);
#ifdef JPH_DEBUG
	// Fill with a garbage pattern in debug builds to catch reads of uninitialized extra bytes
	memset(kv, 0xcd, size);
#endif
	kv->mKey = inKey;
	new (&kv->mValue) Value(std::forward<Params>(inConstructorParams)...);

	// Get the offset to the first object from the bucket with corresponding hash
	atomic<uint32> &offset = mBuckets[inKeyHash & (mNumBuckets - 1)];

	// Add this entry as the first element in the linked list through a CAS loop.
	// The release ordering pairs with the acquire load in Find() so that a reader that observes the
	// new bucket head also observes the fully constructed key/value written above.
	uint32 old_offset = offset.load(memory_order_relaxed);
	for (;;)
	{
		kv->mNextOffset = old_offset;
		if (offset.compare_exchange_weak(old_offset, write_offset, memory_order_release))
			break;
	}

	return kv;
}
template <class Key, class Value>
inline const typename LockFreeHashMap<Key, Value>::KeyValue *LockFreeHashMap<Key, Value>::Find(const Key &inKey, uint64 inKeyHash) const
{
	// Get the offset to the keyvalue object from the bucket with corresponding hash.
	// The acquire ordering pairs with the release CAS in Create() (see comment there).
	uint32 offset = mBuckets[inKeyHash & (mNumBuckets - 1)].load(memory_order_acquire);
	while (offset != cInvalidHandle)
	{
		// Loop through linked list of values until the right one is found
		const KeyValue *kv = mAllocator.template FromOffset<const KeyValue>(offset);
		if (kv->mKey == inKey)
			return kv;
		offset = kv->mNextOffset;
	}

	// Not found
	return nullptr;
}
template <class Key, class Value>
inline uint32 LockFreeHashMap<Key, Value>::ToHandle(const KeyValue *inKeyValue) const
{
	// A handle is simply the offset of the key/value pair in the allocator's object store
	return mAllocator.ToOffset(inKeyValue);
}

template <class Key, class Value>
inline const typename LockFreeHashMap<Key, Value>::KeyValue *LockFreeHashMap<Key, Value>::FromHandle(uint32 inHandle) const
{
	return mAllocator.template FromOffset<const KeyValue>(inHandle);
}
template <class Key, class Value>
inline void LockFreeHashMap<Key, Value>::GetAllKeyValues(Array<const KeyValue *> &outAll) const
{
	// Walk the linked list of every bucket and collect pointers to all key/value pairs.
	// Note: the pairs are appended to outAll without clearing it first.
	for (const atomic<uint32> *bucket = mBuckets; bucket < mBuckets + mNumBuckets; ++bucket)
	{
		uint32 offset = *bucket;
		while (offset != cInvalidHandle)
		{
			const KeyValue *kv = mAllocator.template FromOffset<const KeyValue>(offset);
			outAll.push_back(kv);
			offset = kv->mNextOffset;
		}
	}
}
template <class Key, class Value>
typename LockFreeHashMap<Key, Value>::Iterator LockFreeHashMap<Key, Value>::begin()
{
	// Start with the first bucket
	Iterator it { this, 0, mBuckets[0] };

	// If it doesn't contain a valid entry, use the ++ operator to find the first valid entry
	if (it.mOffset == cInvalidHandle)
		++it;

	return it;
}

template <class Key, class Value>
typename LockFreeHashMap<Key, Value>::Iterator LockFreeHashMap<Key, Value>::end()
{
	// End is marked by bucket == mNumBuckets (one past the last) with an invalid offset
	return { this, mNumBuckets, cInvalidHandle };
}
template <class Key, class Value>
typename LockFreeHashMap<Key, Value>::KeyValue &LockFreeHashMap<Key, Value>::Iterator::operator* ()
{
	// Dereferencing the end iterator is not allowed
	JPH_ASSERT(mOffset != cInvalidHandle);

	return *mMap->mAllocator.template FromOffset<KeyValue>(mOffset);
}

template <class Key, class Value>
typename LockFreeHashMap<Key, Value>::Iterator &LockFreeHashMap<Key, Value>::Iterator::operator++ ()
{
	JPH_ASSERT(mBucket < mMap->mNumBuckets);

	// Find the next key value in this bucket
	if (mOffset != cInvalidHandle)
	{
		const KeyValue *kv = mMap->mAllocator.template FromOffset<const KeyValue>(mOffset);
		mOffset = kv->mNextOffset;
		if (mOffset != cInvalidHandle)
			return *this;
	}

	// Current bucket is exhausted (mOffset == cInvalidHandle here), loop over next buckets
	for (;;)
	{
		// Next bucket
		++mBucket;
		if (mBucket >= mMap->mNumBuckets)
			return *this;	// Reached end(): mBucket == mNumBuckets and mOffset == cInvalidHandle

		// Fetch the first entry in the bucket
		mOffset = mMap->mBuckets[mBucket];
		if (mOffset != cInvalidHandle)
			return *this;
	}
}
#ifdef JPH_DEBUG
/// Logs bucket occupancy statistics: the longest chain, the total number of objects and a
/// histogram of chain lengths (one line per occurring length).
template <class Key, class Value>
void LockFreeHashMap<Key, Value>::TraceStats() const
{
	// Histogram of chain lengths; the final slot accumulates every bucket with >= cMaxPerBucket - 1 entries
	const int cMaxPerBucket = 256;
	int histogram[cMaxPerBucket] = { 0 };

	int max_objects_per_bucket = 0;
	int num_objects = 0;

	for (uint32 b = 0; b < mNumBuckets; ++b)
	{
		// Count the entries in this bucket's chain
		int objects_in_bucket = 0;
		for (uint32 offset = mBuckets[b]; offset != cInvalidHandle; )
		{
			const KeyValue *kv = mAllocator.template FromOffset<const KeyValue>(offset);
			offset = kv->mNextOffset;
			++objects_in_bucket;
		}

		num_objects += objects_in_bucket;
		max_objects_per_bucket = max(max_objects_per_bucket, objects_in_bucket);
		histogram[min(objects_in_bucket, cMaxPerBucket - 1)]++;
	}

	Trace("max_objects_per_bucket = %d, num_buckets = %u, num_objects = %d", max_objects_per_bucket, mNumBuckets, num_objects);
	for (int i = 0; i < cMaxPerBucket; ++i)
		if (histogram[i] != 0)
			Trace("%d: %d", i, histogram[i]);
}
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,85 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <cstdlib>
JPH_SUPPRESS_WARNINGS_STD_END
#include <stdlib.h>
JPH_NAMESPACE_BEGIN
#ifdef JPH_DISABLE_CUSTOM_ALLOCATOR
#define JPH_ALLOC_FN(x) x
#define JPH_ALLOC_SCOPE
#else
#define JPH_ALLOC_FN(x) x##Impl
#define JPH_ALLOC_SCOPE static
#endif
/// Default allocation: plain malloc. When JPH_DISABLE_CUSTOM_ALLOCATOR is set this *is* JPH::Allocate,
/// otherwise it is the AllocateImpl that RegisterDefaultAllocator installs into the function pointer.
JPH_ALLOC_SCOPE void *JPH_ALLOC_FN(Allocate)(size_t inSize)
{
	JPH_ASSERT(inSize > 0);
	return malloc(inSize);
}

/// Default reallocation: plain realloc (old size is unused by this implementation)
JPH_ALLOC_SCOPE void *JPH_ALLOC_FN(Reallocate)(void *inBlock, [[maybe_unused]] size_t inOldSize, size_t inNewSize)
{
	JPH_ASSERT(inNewSize > 0);
	return realloc(inBlock, inNewSize);
}

/// Default free: plain free (accepts nullptr, which does nothing)
JPH_ALLOC_SCOPE void JPH_ALLOC_FN(Free)(void *inBlock)
{
	free(inBlock);
}
/// Default aligned allocation: _aligned_malloc on Windows, posix_memalign elsewhere
JPH_ALLOC_SCOPE void *JPH_ALLOC_FN(AlignedAllocate)(size_t inSize, size_t inAlignment)
{
	JPH_ASSERT(inSize > 0 && inAlignment > 0);
#if defined(JPH_PLATFORM_WINDOWS)
	// Microsoft doesn't implement posix_memalign
	return _aligned_malloc(inSize, inAlignment);
#else
	void *block = nullptr;
	JPH_SUPPRESS_WARNING_PUSH
	JPH_GCC_SUPPRESS_WARNING("-Wunused-result")
	JPH_CLANG_SUPPRESS_WARNING("-Wunused-result")
	// Return value deliberately ignored: on failure 'block' stays nullptr, which is what we return
	posix_memalign(&block, inAlignment, inSize);
	JPH_SUPPRESS_WARNING_POP
	return block;
#endif
}

/// Default aligned free, counterpart of the aligned allocate above
JPH_ALLOC_SCOPE void JPH_ALLOC_FN(AlignedFree)(void *inBlock)
{
#if defined(JPH_PLATFORM_WINDOWS)
	// Memory obtained with _aligned_malloc must be released with _aligned_free
	_aligned_free(inBlock);
#else
	// posix_memalign memory can be released with regular free
	free(inBlock);
#endif
}
#ifndef JPH_DISABLE_CUSTOM_ALLOCATOR
// User overridable allocation function pointers. They start out null; the host application must call
// RegisterDefaultAllocator() (or install its own functions) before allocating through them.
AllocateFunction Allocate = nullptr;
ReallocateFunction Reallocate = nullptr;
FreeFunction Free = nullptr;
AlignedAllocateFunction AlignedAllocate = nullptr;
AlignedFreeFunction AlignedFree = nullptr;

/// Install the default malloc/free based implementations defined above into the function pointers
void RegisterDefaultAllocator()
{
	Allocate = AllocateImpl;
	Reallocate = ReallocateImpl;
	Free = FreeImpl;
	AlignedAllocate = AlignedAllocateImpl;
	AlignedFree = AlignedFreeImpl;
}
#endif // JPH_DISABLE_CUSTOM_ALLOCATOR
JPH_NAMESPACE_END

View File

@ -0,0 +1,85 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
#ifndef JPH_DISABLE_CUSTOM_ALLOCATOR
/// Normal memory allocation, must be at least 8 byte aligned on 32 bit platform and 16 byte aligned on 64 bit platform.
/// Note that you can override JPH_DEFAULT_ALLOCATE_ALIGNMENT if your allocator's alignment is different from the alignment as defined by `__STDCPP_DEFAULT_NEW_ALIGNMENT__`.
using AllocateFunction = void *(*)(size_t inSize);
/// Reallocate memory. inBlock can be nullptr in which case it must behave as a memory allocation.
using ReallocateFunction = void *(*)(void *inBlock, size_t inOldSize, size_t inNewSize);
/// Free memory. inBlock can be nullptr in which case it must do nothing.
using FreeFunction = void (*)(void *inBlock);
/// Aligned memory allocation.
using AlignedAllocateFunction = void *(*)(size_t inSize, size_t inAlignment);
/// Free aligned memory. inBlock can be nullptr in which case it must do nothing.
using AlignedFreeFunction = void (*)(void *inBlock);
// User defined allocation / free functions
JPH_EXPORT extern AllocateFunction Allocate;
JPH_EXPORT extern ReallocateFunction Reallocate;
JPH_EXPORT extern FreeFunction Free;
JPH_EXPORT extern AlignedAllocateFunction AlignedAllocate;
JPH_EXPORT extern AlignedFreeFunction AlignedFree;
/// Register platform default allocation / free functions
JPH_EXPORT void RegisterDefaultAllocator();
// 32-bit MinGW g++ doesn't call the correct overload for the new operator when a type is 16 bytes aligned.
// It uses the non-aligned version, which on 32 bit platforms usually returns an 8 byte aligned block.
// We therefore default to 16 byte aligned allocations when the regular new operator is used.
// See: https://github.com/godotengine/godot/issues/105455#issuecomment-2824311547
#if defined(JPH_COMPILER_MINGW) && JPH_CPU_ARCH_BITS == 32
#define JPH_INTERNAL_DEFAULT_ALLOCATE(size) JPH::AlignedAllocate(size, 16)
#define JPH_INTERNAL_DEFAULT_FREE(pointer) JPH::AlignedFree(pointer)
#else
#define JPH_INTERNAL_DEFAULT_ALLOCATE(size) JPH::Allocate(size)
#define JPH_INTERNAL_DEFAULT_FREE(pointer) JPH::Free(pointer)
#endif
/// Macro to override the new and delete functions
#define JPH_OVERRIDE_NEW_DELETE \
JPH_INLINE void *operator new (size_t inCount) { return JPH_INTERNAL_DEFAULT_ALLOCATE(inCount); } \
JPH_INLINE void operator delete (void *inPointer) noexcept { JPH_INTERNAL_DEFAULT_FREE(inPointer); } \
JPH_INLINE void operator delete (void *inPointer, [[maybe_unused]] size_t inSize) noexcept { JPH_INTERNAL_DEFAULT_FREE(inPointer); } \
JPH_INLINE void *operator new[] (size_t inCount) { return JPH_INTERNAL_DEFAULT_ALLOCATE(inCount); } \
JPH_INLINE void operator delete[] (void *inPointer) noexcept { JPH_INTERNAL_DEFAULT_FREE(inPointer); } \
JPH_INLINE void operator delete[] (void *inPointer, [[maybe_unused]] size_t inSize) noexcept{ JPH_INTERNAL_DEFAULT_FREE(inPointer); } \
JPH_INLINE void *operator new (size_t inCount, std::align_val_t inAlignment) { return JPH::AlignedAllocate(inCount, static_cast<size_t>(inAlignment)); } \
JPH_INLINE void operator delete (void *inPointer, [[maybe_unused]] std::align_val_t inAlignment) noexcept { JPH::AlignedFree(inPointer); } \
JPH_INLINE void operator delete (void *inPointer, [[maybe_unused]] size_t inSize, [[maybe_unused]] std::align_val_t inAlignment) noexcept { JPH::AlignedFree(inPointer); } \
JPH_INLINE void *operator new[] (size_t inCount, std::align_val_t inAlignment) { return JPH::AlignedAllocate(inCount, static_cast<size_t>(inAlignment)); } \
JPH_INLINE void operator delete[] (void *inPointer, [[maybe_unused]] std::align_val_t inAlignment) noexcept { JPH::AlignedFree(inPointer); } \
JPH_INLINE void operator delete[] (void *inPointer, [[maybe_unused]] size_t inSize, [[maybe_unused]] std::align_val_t inAlignment) noexcept { JPH::AlignedFree(inPointer); } \
JPH_INLINE void *operator new ([[maybe_unused]] size_t inCount, void *inPointer) noexcept { return inPointer; } \
JPH_INLINE void operator delete ([[maybe_unused]] void *inPointer, [[maybe_unused]] void *inPlace) noexcept { /* Do nothing */ } \
JPH_INLINE void *operator new[] ([[maybe_unused]] size_t inCount, void *inPointer) noexcept { return inPointer; } \
JPH_INLINE void operator delete[] ([[maybe_unused]] void *inPointer, [[maybe_unused]] void *inPlace) noexcept { /* Do nothing */ }
#else

// JPH_DISABLE_CUSTOM_ALLOCATOR: the allocation functions are plain functions
// (defined in Memory.cpp) instead of user settable function pointers

// Directly define the allocation functions
JPH_EXPORT void *Allocate(size_t inSize);
JPH_EXPORT void *Reallocate(void *inBlock, size_t inOldSize, size_t inNewSize);
JPH_EXPORT void Free(void *inBlock);
JPH_EXPORT void *AlignedAllocate(size_t inSize, size_t inAlignment);
JPH_EXPORT void AlignedFree(void *inBlock);

// Don't implement allocator registering
inline void RegisterDefaultAllocator() { }

// Don't override new/delete
#define JPH_OVERRIDE_NEW_DELETE

#endif // !JPH_DISABLE_CUSTOM_ALLOCATOR
JPH_NAMESPACE_END

View File

@ -0,0 +1,223 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/Profiler.h>
#include <Jolt/Core/NonCopyable.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <mutex>
#include <shared_mutex>
#include <thread>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
// Things we're using from STL
using std::mutex;
using std::shared_mutex;
using std::thread;
using std::lock_guard;
using std::shared_lock;
using std::unique_lock;
#ifdef JPH_PLATFORM_BLUE
// On Platform Blue the mutex class is not very fast so we implement it using the official APIs
class MutexBase : public NonCopyable
{
public:
	MutexBase()
	{
		JPH_PLATFORM_BLUE_MUTEX_INIT(mMutex);
	}

	~MutexBase()
	{
		JPH_PLATFORM_BLUE_MUTEX_DESTROY(mMutex);
	}

	/// Try to acquire the mutex without blocking, returns true on success
	inline bool try_lock()
	{
		return JPH_PLATFORM_BLUE_MUTEX_TRYLOCK(mMutex);
	}

	/// Acquire the mutex, blocking until it becomes available
	inline void lock()
	{
		JPH_PLATFORM_BLUE_MUTEX_LOCK(mMutex);
	}

	/// Release the mutex
	inline void unlock()
	{
		JPH_PLATFORM_BLUE_MUTEX_UNLOCK(mMutex);
	}

private:
	JPH_PLATFORM_BLUE_MUTEX mMutex;
};
// On Platform Blue the shared_mutex class is not very fast so we implement it using the official APIs
class SharedMutexBase : public NonCopyable
{
public:
	SharedMutexBase()
	{
		JPH_PLATFORM_BLUE_RWLOCK_INIT(mRWLock);
	}

	~SharedMutexBase()
	{
		JPH_PLATFORM_BLUE_RWLOCK_DESTROY(mRWLock);
	}

	/// Try to acquire the lock exclusively (writer) without blocking
	inline bool try_lock()
	{
		return JPH_PLATFORM_BLUE_RWLOCK_TRYWLOCK(mRWLock);
	}

	/// Try to acquire the lock shared (reader) without blocking
	inline bool try_lock_shared()
	{
		return JPH_PLATFORM_BLUE_RWLOCK_TRYRLOCK(mRWLock);
	}

	/// Acquire the lock exclusively (writer), blocking
	inline void lock()
	{
		JPH_PLATFORM_BLUE_RWLOCK_WLOCK(mRWLock);
	}

	/// Release the exclusive (writer) lock
	inline void unlock()
	{
		JPH_PLATFORM_BLUE_RWLOCK_WUNLOCK(mRWLock);
	}

	/// Acquire the lock shared (reader), blocking
	inline void lock_shared()
	{
		JPH_PLATFORM_BLUE_RWLOCK_RLOCK(mRWLock);
	}

	/// Release the shared (reader) lock
	inline void unlock_shared()
	{
		JPH_PLATFORM_BLUE_RWLOCK_RUNLOCK(mRWLock);
	}

private:
	JPH_PLATFORM_BLUE_RWLOCK mRWLock;
};
#else
// On other platforms just use the STL implementation
using MutexBase = mutex;
using SharedMutexBase = shared_mutex;
#endif // JPH_PLATFORM_BLUE
#if defined(JPH_ENABLE_ASSERTS) || defined(JPH_PROFILE_ENABLED) || defined(JPH_EXTERNAL_PROFILE)
/// Very simple wrapper around MutexBase which tracks lock contention in the profiler
/// and asserts that locks/unlocks take place on the same thread
class Mutex : public MutexBase
{
public:
	inline bool try_lock()
	{
		// This mutex is not recursive: trying to lock it again on the owning thread is a programming error
		JPH_ASSERT(mLockedThreadID != std::this_thread::get_id());
		if (MutexBase::try_lock())
		{
			JPH_IF_ENABLE_ASSERTS(mLockedThreadID = std::this_thread::get_id();)
			return true;
		}
		return false;
	}

	inline void lock()
	{
		// First try to lock without blocking; only the contended path enters the profiled scope
		if (!try_lock())
		{
			JPH_PROFILE("Lock", 0xff00ffff);
			MutexBase::lock();
			JPH_IF_ENABLE_ASSERTS(mLockedThreadID = std::this_thread::get_id();)
		}
	}

	inline void unlock()
	{
		// Unlock must happen on the thread that took the lock
		JPH_ASSERT(mLockedThreadID == std::this_thread::get_id());
		JPH_IF_ENABLE_ASSERTS(mLockedThreadID = thread::id();)
		MutexBase::unlock();
	}

#ifdef JPH_ENABLE_ASSERTS
	/// Returns true when some thread currently holds the mutex (only available when asserts are enabled)
	inline bool is_locked()
	{
		return mLockedThreadID != thread::id();
	}
#endif // JPH_ENABLE_ASSERTS

private:
	JPH_IF_ENABLE_ASSERTS(thread::id mLockedThreadID;)	///< Thread that holds the mutex, or thread::id() when unlocked
};
/// Very simple wrapper around SharedMutexBase which tracks lock contention in the profiler
/// and asserts that locks/unlocks take place on the same thread
class SharedMutex : public SharedMutexBase
{
public:
	inline bool try_lock()
	{
		// Not recursive: taking the write lock twice on the owning thread is a programming error
		JPH_ASSERT(mLockedThreadID != std::this_thread::get_id());
		if (SharedMutexBase::try_lock())
		{
			JPH_IF_ENABLE_ASSERTS(mLockedThreadID = std::this_thread::get_id();)
			return true;
		}
		return false;
	}

	inline void lock()
	{
		// Only the contended path enters the profiled scope
		if (!try_lock())
		{
			JPH_PROFILE("WLock", 0xff00ffff);
			SharedMutexBase::lock();
			JPH_IF_ENABLE_ASSERTS(mLockedThreadID = std::this_thread::get_id();)
		}
	}

	inline void unlock()
	{
		// Write unlock must happen on the thread that took the write lock
		JPH_ASSERT(mLockedThreadID == std::this_thread::get_id());
		JPH_IF_ENABLE_ASSERTS(mLockedThreadID = thread::id();)
		SharedMutexBase::unlock();
	}

#ifdef JPH_ENABLE_ASSERTS
	/// Returns true when some thread currently holds the write lock (only available when asserts are enabled)
	inline bool is_locked()
	{
		return mLockedThreadID != thread::id();
	}
#endif // JPH_ENABLE_ASSERTS

	inline void lock_shared()
	{
		// Shared (reader) locks are profiled when contended. Note that shared locks are not thread-id
		// tracked here: multiple readers can hold the lock at once, so a single owner id would not be meaningful
		if (!try_lock_shared())
		{
			JPH_PROFILE("RLock", 0xff00ffff);
			SharedMutexBase::lock_shared();
		}
	}

private:
	JPH_IF_ENABLE_ASSERTS(thread::id mLockedThreadID;)	///< Thread that holds the write lock, or thread::id() when write-unlocked
};
#else
using Mutex = MutexBase;
using SharedMutex = SharedMutexBase;
#endif
JPH_NAMESPACE_END

View File

@ -0,0 +1,98 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/NonCopyable.h>
JPH_NAMESPACE_BEGIN
/// A mutex array protects a number of resources with a limited amount of mutexes.
/// It uses hashing to find the mutex of a particular object.
/// The idea is that if the amount of threads is much smaller than the amount of mutexes
/// that there is a relatively small chance that two different objects map to the same mutex.
template <class MutexType>
class MutexArray : public NonCopyable
{
public:
/// Constructor, constructs an empty mutex array that you need to initialize with Init()
MutexArray() = default;
/// Constructor, constructs an array with inNumMutexes entries
explicit MutexArray(uint inNumMutexes) { Init(inNumMutexes); }
/// Destructor
~MutexArray() { delete [] mMutexStorage; }
/// Initialization
/// @param inNumMutexes The amount of mutexes to allocate
void Init(uint inNumMutexes)
{
JPH_ASSERT(mMutexStorage == nullptr);
JPH_ASSERT(inNumMutexes > 0 && IsPowerOf2(inNumMutexes));
mMutexStorage = new MutexStorage[inNumMutexes];
mNumMutexes = inNumMutexes;
}
/// Get the number of mutexes that were allocated
inline uint GetNumMutexes() const
{
return mNumMutexes;
}
/// Convert an object index to a mutex index
inline uint32 GetMutexIndex(uint32 inObjectIndex) const
{
Hash<uint32> hasher;
return hasher(inObjectIndex) & (mNumMutexes - 1);
}
/// Get the mutex belonging to a certain object by index
inline MutexType & GetMutexByObjectIndex(uint32 inObjectIndex)
{
return mMutexStorage[GetMutexIndex(inObjectIndex)].mMutex;
}
/// Get a mutex by index in the array
inline MutexType & GetMutexByIndex(uint32 inMutexIndex)
{
return mMutexStorage[inMutexIndex].mMutex;
}
/// Lock all mutexes
void LockAll()
{
JPH_PROFILE_FUNCTION();
MutexStorage *end = mMutexStorage + mNumMutexes;
for (MutexStorage *m = mMutexStorage; m < end; ++m)
m->mMutex.lock();
}
/// Unlock all mutexes
void UnlockAll()
{
JPH_PROFILE_FUNCTION();
MutexStorage *end = mMutexStorage + mNumMutexes;
for (MutexStorage *m = mMutexStorage; m < end; ++m)
m->mMutex.unlock();
}
private:
/// Align the mutex to a cache line to ensure there is no false sharing (this is platform dependent, we do this to be safe)
struct alignas(JPH_CACHE_LINE_SIZE) MutexStorage
{
JPH_OVERRIDE_NEW_DELETE
MutexType mMutex;
};
MutexStorage * mMutexStorage = nullptr;
uint mNumMutexes = 0;
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,18 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Class that makes another class non-copyable. Usage: Inherit from NonCopyable.
class JPH_EXPORT NonCopyable
{
public:
	NonCopyable() = default;
	NonCopyable(const NonCopyable &) = delete;			// No copy construction
	void operator = (const NonCopyable &) = delete;		// No copy assignment (void return type is fine since the operator can never be called)
};
JPH_NAMESPACE_END

View File

@ -0,0 +1,21 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2026 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/UnorderedMapFwd.h>
JPH_NAMESPACE_BEGIN
namespace StreamUtils {

/// Map from object pointer to a uint32 ID (presumably the ID the object is written with to a stream — confirm with StreamUtils callers)
template <class Type>
using ObjectToIDMap = UnorderedMap<const Type *, uint32>;

/// Array mapping a uint32 ID back to a reference counted object (index = ID)
template <class Type>
using IDToObjectMap = Array<Ref<Type>>;

} // StreamUtils
JPH_NAMESPACE_END

View File

@ -0,0 +1,679 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Core/Profiler.h>
#include <Jolt/Core/Color.h>
#include <Jolt/Core/StringTools.h>
#include <Jolt/Core/QuickSort.h>
#include <Jolt/Core/UnorderedMap.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <fstream>
#include <chrono>
JPH_SUPPRESS_WARNINGS_STD_END
JPH_NAMESPACE_BEGIN
#if defined(JPH_EXTERNAL_PROFILE) && defined(JPH_SHARED_LIBRARY)
ProfileStartMeasurementFunction ProfileStartMeasurement = [](const char *, uint32, uint8 *) { };
ProfileEndMeasurementFunction ProfileEndMeasurement = [](uint8 *) { };
#elif defined(JPH_PROFILE_ENABLED)
//////////////////////////////////////////////////////////////////////////////////////////
// Profiler
//////////////////////////////////////////////////////////////////////////////////////////
Profiler *Profiler::sInstance = nullptr;

#ifdef JPH_SHARED_LIBRARY

// When building a shared library the thread local pointer lives in this translation unit and is
// accessed through the exported ProfileThread accessor functions below
static thread_local ProfileThread *sInstance = nullptr;

ProfileThread *ProfileThread::sGetInstance()
{
	return sInstance;
}

void ProfileThread::sSetInstance(ProfileThread *inInstance)
{
	sInstance = inInstance;
}

#else

thread_local ProfileThread *ProfileThread::sInstance = nullptr;

#endif

bool ProfileMeasurement::sOutOfSamplesReported = false;
void Profiler::UpdateReferenceTime()
{
	// Take a paired snapshot of the CPU tick counter and the wall clock,
	// used by GetProcessorTicksPerSecond to estimate the tick rate
	mReferenceTick = GetProcessorTickCount();
	mReferenceTime = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
}

uint64 Profiler::GetProcessorTicksPerSecond() const
{
	// Estimate ticks per second from the elapsed ticks / microseconds since the last UpdateReferenceTime snapshot.
	// NOTE(review): divides by elapsed microseconds; calling this within the same microsecond as
	// UpdateReferenceTime would divide by zero — confirm callers always wait at least a frame
	uint64 ticks = GetProcessorTickCount();
	uint64 micros = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
	return (ticks - mReferenceTick) * 1000000ULL / (micros - mReferenceTime);
}
// This function assumes that none of the threads are active while we're dumping the profile,
// otherwise there will be a race condition on mCurrentSample and the profile data.
JPH_TSAN_NO_SANITIZE
void Profiler::NextFrame()
{
	std::lock_guard lock(mLock);

	// If a dump was requested through Dump(), write it out before resetting the samples
	if (mDump)
	{
		DumpInternal();
		mDump = false;
	}

	// Rewind the sample buffers of all registered threads for the next frame
	for (ProfileThread *t : mThreads)
		t->mCurrentSample = 0;

	UpdateReferenceTime();
}
void Profiler::Dump(const string_view &inTag)
{
	// Request a dump; the actual writing happens on the next call to NextFrame().
	// NOTE(review): mDump / mDumpTag are written here without taking mLock while NextFrame reads
	// them under the lock — presumably relies on the same "no threads active" assumption as NextFrame; confirm
	mDump = true;
	mDumpTag = inTag;
}

void Profiler::AddThread(ProfileThread *inThread)
{
	std::lock_guard lock(mLock);

	mThreads.push_back(inThread);
}

void Profiler::RemoveThread(ProfileThread *inThread)
{
	std::lock_guard lock(mLock);

	// The thread must have been registered through AddThread
	Array<ProfileThread *>::iterator i = std::find(mThreads.begin(), mThreads.end(), inThread);
	JPH_ASSERT(i != mThreads.end());
	mThreads.erase(i);
}
/// Recursively walks the (start time ordered) sample array, assigning a depth and color to each sample
/// and accumulating each sample's inclusive time into the aggregator matching its name.
/// On return ioSample points at the last descendant of the sample it pointed at on entry, so the caller's
/// loop increment naturally skips over children already processed by the recursive call.
void Profiler::sAggregate(int inDepth, uint32 inColor, ProfileSample *&ioSample, const ProfileSample *inEnd, Aggregators &ioAggregators, KeyToAggregator &ioKeyToAggregator)
{
	// Store depth (clamped to fit in a uint8)
	ioSample->mDepth = uint8(min(255, inDepth));

	// Update color: samples without an explicit color inherit their parent's color
	if (ioSample->mColor == 0)
		ioSample->mColor = inColor;
	else
		inColor = ioSample->mColor;

	// Start accumulating totals (inclusive time: children are included)
	uint64 cycles_this_with_children = ioSample->mEndCycle - ioSample->mStartCycle;

	// Loop over following samples until we find a sample that starts on or after our end.
	// Since samples are ordered by start time, any sample starting before our end is a descendant.
	ProfileSample *sample;
	for (sample = ioSample + 1; sample < inEnd && sample->mStartCycle < ioSample->mEndCycle; ++sample)
	{
		JPH_ASSERT(sample[-1].mStartCycle <= sample->mStartCycle);
		JPH_ASSERT(sample->mStartCycle >= ioSample->mStartCycle);
		JPH_ASSERT(sample->mEndCycle <= ioSample->mEndCycle);

		// Recurse and skip over the children of this child
		sAggregate(inDepth + 1, inColor, sample, inEnd, ioAggregators, ioKeyToAggregator);
	}

	// Find the aggregator for this name / filename pair
	Aggregator *aggregator;
	KeyToAggregator::iterator aggregator_idx = ioKeyToAggregator.find(ioSample->mName);
	if (aggregator_idx == ioKeyToAggregator.end())
	{
		// Not found, add to map and insert in array
		ioKeyToAggregator.try_emplace(ioSample->mName, ioAggregators.size());
		ioAggregators.emplace_back(ioSample->mName);
		aggregator = &ioAggregators.back();
	}
	else
	{
		// Found
		aggregator = &ioAggregators[aggregator_idx->second];
	}

	// Add the measurement to the aggregator
	aggregator->AccumulateMeasurement(cycles_this_with_children);

	// Update ioSample to the last child of ioSample
	JPH_ASSERT(sample[-1].mStartCycle <= ioSample->mEndCycle);
	JPH_ASSERT(sample >= inEnd || sample->mStartCycle >= ioSample->mEndCycle);
	ioSample = sample - 1;
}
// Takes a snapshot of all registered threads' sample buffers, normalizes the time base,
// aggregates per-name statistics and writes an interactive HTML chart to disk.
void Profiler::DumpInternal()
{
	// Freeze data from threads
	// Note that this is not completely thread safe: As a profile sample is added mCurrentSample is incremented
	// but the data is not written until the sample finishes. So if we dump the profile information while
	// some other thread is running, we may get some garbage information from the previous frame
	Threads threads;
	for (ProfileThread *t : mThreads)
		threads.push_back({ t->mThreadName, t->mSamples, t->mSamples + t->mCurrentSample });

	// Shift all samples so that the first sample is at zero
	uint64 min_cycle = 0xffffffffffffffffUL;
	for (const ThreadSamples &t : threads)
		if (t.mSamplesBegin < t.mSamplesEnd)
			min_cycle = min(min_cycle, t.mSamplesBegin[0].mStartCycle); // first sample has the lowest start cycle since samples are recorded in order
	for (const ThreadSamples &t : threads)
		for (ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			s->mStartCycle -= min_cycle;
			s->mEndCycle -= min_cycle;
		}

	// Determine tag of this profile
	String tag;
	if (mDumpTag.empty())
	{
		// Next sequence number
		static int number = 0;
		++number;

		tag = ConvertToString(number);
	}
	else
	{
		// Take provided tag (single use: cleared so the next dump auto-numbers again)
		tag = mDumpTag;
		mDumpTag.clear();
	}

	// Aggregate data across threads
	// Note: sAggregate advances 's' past the entire subtree it consumed, so this loop
	// only invokes it once per root-level sample
	Aggregators aggregators;
	KeyToAggregator key_to_aggregators;
	for (const ThreadSamples &t : threads)
		for (ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
			sAggregate(0, Color::sGetDistinctColor(0).GetUInt32(), s, end, aggregators, key_to_aggregators);

	// Dump as chart
	DumpChart(tag.c_str(), threads, key_to_aggregators, aggregators);
}
/// Escapes a string for safe embedding in HTML output.
/// @param inString Raw, possibly unsafe text (e.g. a user defined sample name).
/// @return Copy of inString with HTML-significant characters replaced by entities.
static String sHTMLEncode(const char *inString)
{
	String str(inString);
	// '&' must be escaped first, otherwise the '&' introduced by the entities below
	// would be double-encoded (and a literal "&lt;" in the input would be ambiguous)
	StringReplace(str, "&", "&amp;");
	StringReplace(str, "<", "&lt;");
	StringReplace(str, ">", "&gt;");
	return str;
}
// Writes a self-contained interactive HTML page (canvas based flame chart with pan,
// zoom and hover tooltip) to profile_chart_<tag>.html. The static header below expects
// two JavaScript globals, 'threads' and 'aggregated', which are generated from the
// captured samples further down.
void Profiler::DumpChart(const char *inTag, const Threads &inThreads, const KeyToAggregator &inKeyToAggregators, const Aggregators &inAggregators)
{
	// Open file
	std::ofstream f;
	f.open(StringFormat("profile_chart_%s.html", inTag).c_str(), std::ofstream::out | std::ofstream::trunc);
	if (!f.is_open())
		return;

	// Write header (static HTML/CSS/JS; the raw string is emitted verbatim)
	f << R"(<!DOCTYPE html>
<html>
<head>
<title>Profile Chart</title>
<style>
html, body {
padding: 0px;
border: 0px;
margin: 0px;
width: 100%;
height: 100%;
overflow: hidden;
}
canvas {
position: absolute;
top: 10px;
left: 10px;
padding: 0px;
border: 0px;
margin: 0px;
}
#tooltip {
font: Courier New;
position: absolute;
background-color: white;
border: 1px;
border-style: solid;
border-color: black;
pointer-events: none;
padding: 5px;
font: 14px Arial;
visibility: hidden;
height: auto;
}
.stat {
color: blue;
text-align: right;
}
</style>
<script type="text/javascript">
var canvas;
var ctx;
var tooltip;
var min_scale;
var scale;
var offset_x = 0;
var offset_y = 0;
var size_y;
var dragging = false;
var previous_x = 0;
var previous_y = 0;
var bar_height = 15;
var line_height = bar_height + 2;
var thread_separation = 6;
var thread_font_size = 12;
var thread_font = thread_font_size + "px Arial";
var bar_font_size = 10;
var bar_font = bar_font_size + "px Arial";
var end_cycle = 0;
function drawChart()
{
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.lineWidth = 1;
var y = offset_y;
for (var t = 0; t < threads.length; t++)
{
// Check if thread has samples
var thread = threads[t];
if (thread.start.length == 0)
continue;
// Draw thread name
y += thread_font_size;
ctx.font = thread_font;
ctx.fillStyle = "#000000";
ctx.fillText(thread.thread_name, 0, y);
y += thread_separation;
// Draw outlines for each bar of samples
ctx.fillStyle = "#c0c0c0";
for (var d = 0; d <= thread.max_depth; d++)
ctx.fillRect(0, y + d * line_height, canvas.width, bar_height);
// Draw samples
ctx.font = bar_font;
for (var s = 0; s < thread.start.length; s++)
{
// Cull bar
var rx = scale * (offset_x + thread.start[s]);
if (rx > canvas.width) // right of canvas
break;
var rw = scale * thread.cycles[s];
if (rw < 0.5) // less than half pixel, skip
continue;
if (rx + rw < 0) // left of canvas
continue;
// Draw bar
var ry = y + line_height * thread.depth[s];
ctx.fillStyle = thread.color[s];
ctx.fillRect(rx, ry, rw, bar_height);
ctx.strokeStyle = thread.darkened_color[s];
ctx.strokeRect(rx, ry, rw, bar_height);
// Get index in aggregated list
var a = thread.aggregator[s];
// Draw text
if (rw > aggregated.name_width[a])
{
ctx.fillStyle = "#000000";
ctx.fillText(aggregated.name[a], rx + (rw - aggregated.name_width[a]) / 2, ry + bar_height - 4);
}
}
// Next line
y += line_height * (1 + thread.max_depth) + thread_separation;
}
// Update size
size_y = y - offset_y;
}
function drawTooltip(mouse_x, mouse_y)
{
var y = offset_y;
for (var t = 0; t < threads.length; t++)
{
// Check if thread has samples
var thread = threads[t];
if (thread.start.length == 0)
continue;
// Thead name
y += thread_font_size + thread_separation;
// Draw samples
for (var s = 0; s < thread.start.length; s++)
{
// Cull bar
var rx = scale * (offset_x + thread.start[s]);
if (rx > mouse_x)
break;
var rw = scale * thread.cycles[s];
if (rx + rw < mouse_x)
continue;
var ry = y + line_height * thread.depth[s];
if (mouse_y >= ry && mouse_y < ry + bar_height)
{
// Get index into aggregated list
var a = thread.aggregator[s];
// Found bar, fill in tooltip
tooltip.style.left = (canvas.offsetLeft + mouse_x) + "px";
tooltip.style.top = (canvas.offsetTop + mouse_y) + "px";
tooltip.style.visibility = "visible";
tooltip.innerHTML = aggregated.name[a] + "<br>"
+ "<table>"
+ "<tr><td>Time:</td><td class=\"stat\">" + (1000000 * thread.cycles[s] / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Start:</td><td class=\"stat\">" + (1000000 * thread.start[s] / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>End:</td><td class=\"stat\">" + (1000000 * (thread.start[s] + thread.cycles[s]) / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Avg. Time:</td><td class=\"stat\">" + (1000000 * aggregated.cycles_per_frame[a] / cycles_per_second / aggregated.calls[a]).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Min Time:</td><td class=\"stat\">" + (1000000 * aggregated.min_cycles[a] / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Max Time:</td><td class=\"stat\">" + (1000000 * aggregated.max_cycles[a] / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Time / Frame:</td><td class=\"stat\">" + (1000000 * aggregated.cycles_per_frame[a] / cycles_per_second).toFixed(2) + " &micro;s</td></tr>"
+ "<tr><td>Calls:</td><td class=\"stat\">" + aggregated.calls[a] + "</td></tr>"
+ "</table>";
return;
}
}
// Next line
y += line_height * (1 + thread.max_depth) + thread_separation;
}
// No bar found, hide tooltip
tooltip.style.visibility = "hidden";
}
function onMouseDown(evt)
{
dragging = true;
previous_x = evt.clientX, previous_y = evt.clientY;
tooltip.style.visibility = "hidden";
}
function onMouseUp(evt)
{
dragging = false;
}
function clampMotion()
{
// Clamp horizontally
var min_offset_x = canvas.width / scale - end_cycle;
if (offset_x < min_offset_x)
offset_x = min_offset_x;
if (offset_x > 0)
offset_x = 0;
// Clamp vertically
var min_offset_y = canvas.height - size_y;
if (offset_y < min_offset_y)
offset_y = min_offset_y;
if (offset_y > 0)
offset_y = 0;
// Clamp scale
if (scale < min_scale)
scale = min_scale;
var max_scale = 1000 * min_scale;
if (scale > max_scale)
scale = max_scale;
}
function onMouseMove(evt)
{
if (dragging)
{
// Calculate new offset
offset_x += (evt.clientX - previous_x) / scale;
offset_y += evt.clientY - previous_y;
clampMotion();
drawChart();
}
else
drawTooltip(evt.clientX - canvas.offsetLeft, evt.clientY - canvas.offsetTop);
previous_x = evt.clientX, previous_y = evt.clientY;
}
function onScroll(evt)
{
tooltip.style.visibility = "hidden";
var old_scale = scale;
if (evt.deltaY > 0)
scale /= 1.1;
else
scale *= 1.1;
clampMotion();
// Ensure that event under mouse stays under mouse
var x = previous_x - canvas.offsetLeft;
offset_x += x / scale - x / old_scale;
clampMotion();
drawChart();
}
function darkenColor(color)
{
var i = parseInt(color.slice(1), 16);
var r = i >> 16;
var g = (i >> 8) & 0xff;
var b = i & 0xff;
r = Math.round(0.8 * r);
g = Math.round(0.8 * g);
b = Math.round(0.8 * b);
i = (r << 16) + (g << 8) + b;
return "#" + i.toString(16);
}
function startChart()
{
// Fetch elements
canvas = document.getElementById('canvas');
ctx = canvas.getContext("2d");
tooltip = document.getElementById('tooltip');
// Resize canvas to fill screen
canvas.width = document.body.offsetWidth - 20;
canvas.height = document.body.offsetHeight - 20;
// Register mouse handlers
canvas.onmousedown = onMouseDown;
canvas.onmouseup = onMouseUp;
canvas.onmouseout = onMouseUp;
canvas.onmousemove = onMouseMove;
canvas.onwheel = onScroll;
for (var t = 0; t < threads.length; t++)
{
var thread = threads[t];
// Calculate darkened colors
thread.darkened_color = new Array(thread.color.length);
for (var s = 0; s < thread.color.length; s++)
thread.darkened_color[s] = darkenColor(thread.color[s]);
// Calculate max depth and end cycle
thread.max_depth = 0;
for (var s = 0; s < thread.start.length; s++)
{
thread.max_depth = Math.max(thread.max_depth, thread.depth[s]);
end_cycle = Math.max(end_cycle, thread.start[s] + thread.cycles[s]);
}
}
// Calculate width of name strings
ctx.font = bar_font;
aggregated.name_width = new Array(aggregated.name.length);
for (var a = 0; a < aggregated.name.length; a++)
aggregated.name_width[a] = ctx.measureText(aggregated.name[a]).width;
// Store scale properties
min_scale = canvas.width / end_cycle;
scale = min_scale;
drawChart();
}
</script>
</head>
<body onload="startChart();">
<script type="text/javascript">
)";

	// Get cycles per second so the JS can convert cycle counts to microseconds
	uint64 cycles_per_second = GetProcessorTicksPerSecond();
	f << "var cycles_per_second = " << cycles_per_second << ";\n";

	// Dump samples: one JS object per thread holding parallel arrays
	// (aggregator index, color, start cycle, cycle count, nesting depth)
	f << "var threads = [\n";
	bool first_thread = true;
	for (const ThreadSamples &t : inThreads)
	{
		if (!first_thread)
			f << ",\n";
		first_thread = false;
		f << "{\nthread_name: \"" << t.mThreadName << "\",\naggregator: [";
		bool first = true;
		for (const ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			if (!first)
				f << ",";
			first = false;
			f << inKeyToAggregators.find(s->mName)->second;
		}
		f << "],\ncolor: [";
		first = true;
		for (const ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			if (!first)
				f << ",";
			first = false;
			Color c(s->mColor);
			f << StringFormat("\"#%02x%02x%02x\"", c.r, c.g, c.b);
		}
		f << "],\nstart: [";
		first = true;
		for (const ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			if (!first)
				f << ",";
			first = false;
			f << s->mStartCycle;
		}
		f << "],\ncycles: [";
		first = true;
		for (const ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			if (!first)
				f << ",";
			first = false;
			f << s->mEndCycle - s->mStartCycle;
		}
		f << "],\ndepth: [";
		first = true;
		for (const ProfileSample *s = t.mSamplesBegin, *end = t.mSamplesEnd; s < end; ++s)
		{
			if (!first)
				f << ",";
			first = false;
			f << int(s->mDepth);
		}
		f << "]\n}";
	}

	// Dump aggregated data: parallel arrays indexed by the 'aggregator' indices above
	f << "];\nvar aggregated = {\nname: [";
	bool first = true;
	for (const Aggregator &a : inAggregators)
	{
		if (!first)
			f << ",";
		first = false;
		String name = "\"" + sHTMLEncode(a.mName) + "\"";
		f << name;
	}
	f << "],\ncalls: [";
	first = true;
	for (const Aggregator &a : inAggregators)
	{
		if (!first)
			f << ",";
		first = false;
		f << a.mCallCounter;
	}
	f << "],\nmin_cycles: [";
	first = true;
	for (const Aggregator &a : inAggregators)
	{
		if (!first)
			f << ",";
		first = false;
		f << a.mMinCyclesInCallWithChildren;
	}
	f << "],\nmax_cycles: [";
	first = true;
	for (const Aggregator &a : inAggregators)
	{
		if (!first)
			f << ",";
		first = false;
		f << a.mMaxCyclesInCallWithChildren;
	}
	f << "],\ncycles_per_frame: [";
	first = true;
	for (const Aggregator &a : inAggregators)
	{
		if (!first)
			f << ",";
		first = false;
		f << a.mTotalCyclesInCallWithChildren;
	}

	// Write footer
	// NOTE(review): the closing </tbody></table> below has no matching opening tags in
	// the header above — harmless to browsers, but looks like leftovers; confirm upstream.
	f << R"(]};
</script>
<canvas id="canvas"></canvas>
<div id="tooltip"></div>
</tbody></table></body></html>)";
}
#endif // JPH_PROFILE_ENABLED
JPH_NAMESPACE_END

View File

@ -0,0 +1,300 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <mutex>
JPH_SUPPRESS_WARNINGS_STD_END
#include <Jolt/Core/NonCopyable.h>
#include <Jolt/Core/TickCounter.h>
#include <Jolt/Core/UnorderedMapFwd.h>
#if defined(JPH_EXTERNAL_PROFILE)
JPH_NAMESPACE_BEGIN
#ifdef JPH_SHARED_LIBRARY
/// Functions called when a profiler measurement starts or stops, need to be overridden by the user.
using ProfileStartMeasurementFunction = void (*)(const char *inName, uint32 inColor, uint8 *ioUserData);
using ProfileEndMeasurementFunction = void (*)(uint8 *ioUserData);

JPH_EXPORT extern ProfileStartMeasurementFunction ProfileStartMeasurement;
JPH_EXPORT extern ProfileEndMeasurementFunction ProfileEndMeasurement;
#endif // JPH_SHARED_LIBRARY

/// Create this class on the stack to start sampling timing information of a particular scope.
///
/// For statically linked builds, this is left unimplemented intentionally. Needs to be implemented by the user of the library.
/// On construction a measurement should start, on destruction it should be stopped.
/// For dynamically linked builds, the user should override the ProfileStartMeasurement and ProfileEndMeasurement functions.
class alignas(16) ExternalProfileMeasurement : public NonCopyable
{
public:
	/// Constructor: starts the externally implemented measurement
#ifdef JPH_SHARED_LIBRARY
	JPH_INLINE ExternalProfileMeasurement(const char *inName, uint32 inColor = 0) { ProfileStartMeasurement(inName, inColor, mUserData); }
	JPH_INLINE ~ExternalProfileMeasurement() { ProfileEndMeasurement(mUserData); }
#else
	ExternalProfileMeasurement(const char *inName, uint32 inColor = 0);
	~ExternalProfileMeasurement();
#endif

private:
	uint8 mUserData[64];	///< Opaque scratch space passed to the start callback and back to the end callback, so the external profiler can carry state between the two
};
JPH_NAMESPACE_END
//////////////////////////////////////////////////////////////////////////////////////////
// Macros to do the actual profiling
//////////////////////////////////////////////////////////////////////////////////////////

JPH_SUPPRESS_WARNING_PUSH
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic")

// Dummy implementations: with an external profiler only scoped measurements
// (JPH_PROFILE / JPH_PROFILE_FUNCTION) are forwarded, the rest compiles to nothing
#define JPH_PROFILE_START(name)
#define JPH_PROFILE_END()
#define JPH_PROFILE_THREAD_START(name)
#define JPH_PROFILE_THREAD_END()
#define JPH_PROFILE_NEXTFRAME()
#define JPH_PROFILE_DUMP(...)

// Scope profiling measurement: the two-level expansion is needed so __LINE__ is
// expanded before token pasting, yielding a unique variable name per source line
#define JPH_PROFILE_TAG2(line) profile##line
#define JPH_PROFILE_TAG(line) JPH_PROFILE_TAG2(line)

/// Macro to collect profiling information.
///
/// Usage:
///
///		{
///			JPH_PROFILE("Operation");
///			do operation;
///		}
///
#define JPH_PROFILE(...) ExternalProfileMeasurement JPH_PROFILE_TAG(__LINE__)(__VA_ARGS__)

// Scope profiling for function
#define JPH_PROFILE_FUNCTION() JPH_PROFILE(JPH_FUNCTION_NAME)

JPH_SUPPRESS_WARNING_POP
#elif defined(JPH_PROFILE_ENABLED)
JPH_NAMESPACE_BEGIN
class ProfileSample;
class ProfileThread;

/// Singleton class for managing profiling information
class JPH_EXPORT Profiler : public NonCopyable
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor
	Profiler() { UpdateReferenceTime(); }

	/// Increments the frame counter to provide statistics per frame
	void NextFrame();

	/// Dump profiling statistics at the start of the next frame
	/// @param inTag If not empty, this overrides the auto incrementing number in the filename of the dump file
	void Dump(const string_view &inTag = string_view());

	/// Add a thread to be instrumented
	void AddThread(ProfileThread *inThread);

	/// Remove a thread from being instrumented
	void RemoveThread(ProfileThread *inThread);

	/// Get the amount of ticks per second, note that this number will never be fully accurate as the amount of ticks per second may vary with CPU load, so this number is only to be used to give an indication of time for profiling purposes
	uint64 GetProcessorTicksPerSecond() const;

	/// Singleton instance
	static Profiler * sInstance;

private:
	/// Helper class to freeze ProfileSamples per thread while processing them
	struct ThreadSamples
	{
		String mThreadName;
		ProfileSample * mSamplesBegin;
		ProfileSample * mSamplesEnd;
	};

	/// Helper class to aggregate ProfileSamples (accumulated statistics per unique sample name)
	class Aggregator
	{
	public:
		/// Constructor
		Aggregator(const char *inName) : mName(inName) { }

		/// Accumulate results for a measurement
		void AccumulateMeasurement(uint64 inCyclesInCallWithChildren)
		{
			mCallCounter++;
			mTotalCyclesInCallWithChildren += inCyclesInCallWithChildren;
			mMinCyclesInCallWithChildren = min(inCyclesInCallWithChildren, mMinCyclesInCallWithChildren);
			mMaxCyclesInCallWithChildren = max(inCyclesInCallWithChildren, mMaxCyclesInCallWithChildren);
		}

		/// Sort descending by total cycles
		bool operator < (const Aggregator &inRHS) const
		{
			return mTotalCyclesInCallWithChildren > inRHS.mTotalCyclesInCallWithChildren;
		}

		/// Identification
		const char * mName; ///< User defined name of this item

		/// Statistics
		uint32 mCallCounter = 0; ///< Number of times AccumulateMeasurement was called
		uint64 mTotalCyclesInCallWithChildren = 0; ///< Total amount of cycles spent in this scope
		uint64 mMinCyclesInCallWithChildren = 0xffffffffffffffffUL; ///< Minimum amount of cycles spent per call
		uint64 mMaxCyclesInCallWithChildren = 0; ///< Maximum amount of cycles spent per call
	};

	using Threads = Array<ThreadSamples>;
	using Aggregators = Array<Aggregator>;
	using KeyToAggregator = UnorderedMap<const char *, size_t>;

	/// Helper function to aggregate profile sample data.
	/// On return ioSample points at the last sample of the aggregated subtree.
	static void sAggregate(int inDepth, uint32 inColor, ProfileSample *&ioSample, const ProfileSample *inEnd, Aggregators &ioAggregators, KeyToAggregator &ioKeyToAggregator);

	/// We measure the amount of ticks per second, this function resets the reference time point
	void UpdateReferenceTime();

	/// Dump profiling statistics
	void DumpInternal();
	void DumpChart(const char *inTag, const Threads &inThreads, const KeyToAggregator &inKeyToAggregators, const Aggregators &inAggregators);

	std::mutex mLock; ///< Lock that protects mThreads
	uint64 mReferenceTick; ///< Tick count at the start of the frame
	uint64 mReferenceTime; ///< Time at the start of the frame in microseconds
	Array<ProfileThread *> mThreads; ///< List of all active threads
	bool mDump = false; ///< When true, the samples are dumped next frame
	String mDumpTag; ///< When not empty, this overrides the auto incrementing number of the dump filename
};
// Class that contains the information of a single scoped measurement.
// Layout is fixed at 32 bytes / 16 byte alignment (verified by static_asserts in
// Profiler.inl) so that a finished sample can be streamed to the thread's buffer.
class alignas(16) JPH_EXPORT_GCC_BUG_WORKAROUND ProfileSample : public NonCopyable
{
public:
	JPH_OVERRIDE_NEW_DELETE

	const char * mName; ///< User defined name of this item
	uint32 mColor; ///< Color to use for this sample
	uint8 mDepth; ///< Calculated depth
	uint8 mUnused[3]; // explicit padding so the cycle counters below are 8 byte aligned
	uint64 mStartCycle; ///< Cycle counter at start of measurement
	uint64 mEndCycle; ///< Cycle counter at end of measurement
};
/// Collects all samples of a single thread
class ProfileThread : public NonCopyable
{
public:
	JPH_OVERRIDE_NEW_DELETE

	/// Constructor
	inline ProfileThread(const string_view &inThreadName);
	inline ~ProfileThread();

	static const uint cMaxSamples = 65536; // fixed-size buffer; samples beyond this are dropped (see ProfileMeasurement)

	String mThreadName; ///< Name of the thread that we're collecting information for
	ProfileSample mSamples[cMaxSamples]; ///< Buffer of samples
	uint mCurrentSample = 0; ///< Next position to write a sample to

	// Accessors for the per-thread singleton; in shared library builds these must cross
	// the DLL boundary, hence the exported functions instead of the inline versions
#ifdef JPH_SHARED_LIBRARY
	JPH_EXPORT static void sSetInstance(ProfileThread *inInstance);
	JPH_EXPORT static ProfileThread *sGetInstance();
#else
	static inline void sSetInstance(ProfileThread *inInstance) { sInstance = inInstance; }
	static inline ProfileThread *sGetInstance() { return sInstance; }

private:
	static thread_local ProfileThread *sInstance;
#endif
};
/// Create this class on the stack to start sampling timing information of a particular scope
class JPH_EXPORT ProfileMeasurement : public NonCopyable
{
public:
	/// Constructor: starts the measurement; the destructor finalizes and stores it
	inline ProfileMeasurement(const char *inName, uint32 inColor = 0);
	inline ~ProfileMeasurement();

private:
	ProfileSample * mSample; ///< Slot in the ProfileThread's buffer to write to, nullptr when not sampling (thread not instrumented or buffer full)
	ProfileSample mTemp; ///< Sample is built on the stack and copied to mSample in the destructor (see Profiler.inl)

	static bool sOutOfSamplesReported; // ensures the 'too many samples' warning is only traced once
};
JPH_NAMESPACE_END
#include "Profiler.inl"
//////////////////////////////////////////////////////////////////////////////////////////
// Macros to do the actual profiling
//////////////////////////////////////////////////////////////////////////////////////////

JPH_SUPPRESS_WARNING_PUSH
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic")

/// Start instrumenting program
#define JPH_PROFILE_START(name) do { Profiler::sInstance = new Profiler; JPH_PROFILE_THREAD_START(name); } while (false)

/// End instrumenting program
#define JPH_PROFILE_END() do { JPH_PROFILE_THREAD_END(); delete Profiler::sInstance; Profiler::sInstance = nullptr; } while (false)

/// Start instrumenting a thread (no-op when the profiler itself was never started)
#define JPH_PROFILE_THREAD_START(name) do { if (Profiler::sInstance) ProfileThread::sSetInstance(new ProfileThread(name)); } while (false)

/// End instrumenting a thread
#define JPH_PROFILE_THREAD_END() do { delete ProfileThread::sGetInstance(); ProfileThread::sSetInstance(nullptr); } while (false)

/// Scope profiling measurement: two-level expansion so __LINE__ is expanded before
/// token pasting, yielding a unique variable name per source line
#define JPH_PROFILE_TAG2(line) profile##line
#define JPH_PROFILE_TAG(line) JPH_PROFILE_TAG2(line)
#define JPH_PROFILE(...) ProfileMeasurement JPH_PROFILE_TAG(__LINE__)(__VA_ARGS__)

/// Scope profiling for function
#define JPH_PROFILE_FUNCTION() JPH_PROFILE(JPH_FUNCTION_NAME)

/// Update frame counter
#define JPH_PROFILE_NEXTFRAME() Profiler::sInstance->NextFrame()

/// Dump profiling info
#define JPH_PROFILE_DUMP(...) Profiler::sInstance->Dump(__VA_ARGS__)

JPH_SUPPRESS_WARNING_POP
#else
//////////////////////////////////////////////////////////////////////////////////////////
// Dummy profiling instructions (profiling disabled: every macro compiles to nothing)
//////////////////////////////////////////////////////////////////////////////////////////

JPH_SUPPRESS_WARNING_PUSH
JPH_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic")

#define JPH_PROFILE_START(name)
#define JPH_PROFILE_END()
#define JPH_PROFILE_THREAD_START(name)
#define JPH_PROFILE_THREAD_END()
#define JPH_PROFILE(...)
#define JPH_PROFILE_FUNCTION()
#define JPH_PROFILE_NEXTFRAME()
#define JPH_PROFILE_DUMP(...)

JPH_SUPPRESS_WARNING_POP
#endif

View File

@ -0,0 +1,90 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
//////////////////////////////////////////////////////////////////////////////////////////
// ProfileThread
//////////////////////////////////////////////////////////////////////////////////////////
// Registers this thread's sample buffer with the profiler singleton.
// Precondition: Profiler::sInstance must exist (guaranteed by JPH_PROFILE_THREAD_START).
ProfileThread::ProfileThread(const string_view &inThreadName) :
	mThreadName(inThreadName)
{
	Profiler::sInstance->AddThread(this);
}
// Unregisters this thread from the profiler singleton
ProfileThread::~ProfileThread()
{
	Profiler::sInstance->RemoveThread(this);
}
//////////////////////////////////////////////////////////////////////////////////////////
// ProfileMeasurement
//////////////////////////////////////////////////////////////////////////////////////////
// Begins a scoped measurement: reserves a slot in the current thread's sample buffer
// and records the start cycle. Does nothing (mSample == nullptr) when the thread is not
// instrumented or the buffer is full.
JPH_TSAN_NO_SANITIZE // TSAN reports a race on sOutOfSamplesReported, however the worst case is that we report the out of samples message multiple times
ProfileMeasurement::ProfileMeasurement(const char *inName, uint32 inColor)
{
	ProfileThread *current_thread = ProfileThread::sGetInstance();
	if (current_thread == nullptr)
	{
		// Thread not instrumented
		mSample = nullptr;
	}
	else if (current_thread->mCurrentSample < ProfileThread::cMaxSamples)
	{
		// Get pointer to write data to
		mSample = &current_thread->mSamples[current_thread->mCurrentSample++];

		// Start constructing sample (will end up on stack)
		mTemp.mName = inName;
		mTemp.mColor = inColor;

		// Collect start sample last (so the bookkeeping above is not included in the measurement)
		mTemp.mStartCycle = GetProcessorTickCount();
	}
	else
	{
		// Out of samples
		if (!sOutOfSamplesReported)
		{
			sOutOfSamplesReported = true;
			Trace("ProfileMeasurement: Too many samples, some data will be lost!");
		}
		mSample = nullptr;
	}
}
// Ends the scoped measurement: records the end cycle and copies the sample that was
// built on the stack into the thread's sample buffer. On SSE/NEON the copy uses wide
// stores (non-temporal on SSE) to keep the write out of the measured hot path's cache.
ProfileMeasurement::~ProfileMeasurement()
{
	if (mSample != nullptr)
	{
		// Finalize sample
		mTemp.mEndCycle = GetProcessorTickCount();

		// Write it to the memory buffer bypassing the cache
		static_assert(sizeof(ProfileSample) == 32, "Assume 32 bytes");
		static_assert(alignof(ProfileSample) == 16, "Assume 16 byte alignment");
	#if defined(JPH_USE_SSE)
		// Two 16-byte non-temporal stores cover the whole 32-byte sample
		const __m128i *src = reinterpret_cast<const __m128i *>(&mTemp);
		__m128i *dst = reinterpret_cast<__m128i *>(mSample);
		__m128i val = _mm_loadu_si128(src);
		_mm_stream_si128(dst, val);
		val = _mm_loadu_si128(src + 1);
		_mm_stream_si128(dst + 1, val);
	#elif defined(JPH_USE_NEON)
		// Two 16-byte vector stores cover the whole 32-byte sample
		const int *src = reinterpret_cast<const int *>(&mTemp);
		int *dst = reinterpret_cast<int *>(mSample);
		int32x4_t val = vld1q_s32(src);
		vst1q_s32(dst, val);
		val = vld1q_s32(src + 4);
		vst1q_s32(dst + 4, val);
	#else
		memcpy(mSample, &mTemp, sizeof(ProfileSample));
	#endif
		mSample = nullptr;
	}
}
JPH_NAMESPACE_END

Some files were not shown because too many files have changed in this diff Show More