diff --git a/drivers/gles2/rasterizer_canvas_gles2.cpp b/drivers/gles2/rasterizer_canvas_gles2.cpp
index 69d06251fff..73957beb81a 100644
--- a/drivers/gles2/rasterizer_canvas_gles2.cpp
+++ b/drivers/gles2/rasterizer_canvas_gles2.cpp
@@ -221,9 +221,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 	int command_count = p_item->commands.size();
 	Item::Command *const *commands = p_item->commands.ptr();
 
-	Transform2D transform;
-	TransformMode transform_mode = _find_transform_mode(r_fill_state.use_hardware_transform, p_item->final_transform, transform);
-
+	// just a local, might be more efficient in a register (check)
 	Vector2 texpixel_size = r_fill_state.texpixel_size;
 
 	// checking the color for not being white makes it 92/90 times faster in the case where it is white
@@ -252,7 +250,36 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 		switch (command->type) {
 
 			default: {
-				_prefill_default_batch(r_fill_state, command_num);
+				_prefill_default_batch(r_fill_state, command_num, *p_item);
+			} break;
+			case Item::Command::TYPE_TRANSFORM: {
+				// if the extra matrix has been sent already,
+				// break this extra matrix software path (as we don't want to unset it on the GPU etc)
+				if (r_fill_state.extra_matrix_sent) {
+					_prefill_default_batch(r_fill_state, command_num, *p_item);
+				} else {
+					// Extra matrix fast path.
+					// Instead of sending the command immediately, we store the modified transform (in combined)
+					// for software transform, and only flush this transform command if we NEED to (i.e. we want to
+					// render some default commands)
+					Item::CommandTransform *transform = static_cast<Item::CommandTransform *>(command);
+					const Transform2D &extra_matrix = transform->xform;
+
+					if (r_fill_state.use_hardware_transform) {
+						// if we are using hardware transform mode, we have already sent the final transform,
+						// so we only want to software transform the extra matrix
+						r_fill_state.transform_combined = extra_matrix;
+					} else {
+						r_fill_state.transform_combined = p_item->final_transform * extra_matrix;
+					}
+					// after a transform command, always use some form of software transform (either the combined final + extra, or just the extra)
+					// until we flush this dirty extra matrix because we need to render default commands.
+					r_fill_state.transform_mode = _find_transform_mode(r_fill_state.transform_combined);
+
+					// make a note of which command the dirty extra matrix is stored in, so we can send it later
+					// if necessary
+					r_fill_state.transform_extra_command_number_p1 = command_num + 1; // plus 1 so we can test against zero
+				}
 			} break;
 			case Item::Command::TYPE_RECT: {
 
@@ -277,7 +304,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 						int command_num_next = command_num + 1;
 						if (command_num_next < command_count) {
 							Item::Command *command_next = commands[command_num_next];
-							if (command_next->type != Item::Command::TYPE_RECT) {
+							if ((command_next->type != Item::Command::TYPE_RECT) && (command_next->type != Item::Command::TYPE_TRANSFORM)) {
 								is_single_rect = true;
 							}
 						} else {
@@ -285,7 +312,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 						}
 						// if it is a rect on its own, do exactly the same as the default routine
 						if (is_single_rect) {
-							_prefill_default_batch(r_fill_state, command_num);
+							_prefill_default_batch(r_fill_state, command_num, *p_item);
 							break;
 						}
 					} // if use hardware transform
@@ -352,8 +379,8 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 				// fill the quad geometry
 				Vector2 mins = rect->rect.position;
 
-				if (transform_mode == TM_TRANSLATE) {
-					_software_transform_vertex(mins, transform);
+				if (r_fill_state.transform_mode == TM_TRANSLATE) {
+					_software_transform_vertex(mins, r_fill_state.transform_combined);
 				}
 
 				Vector2 maxs = mins + rect->rect.size;
@@ -385,11 +412,11 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 					SWAP(bB->pos, bC->pos);
 				}
 
-				if (transform_mode == TM_ALL) {
-					_software_transform_vertex(bA->pos, transform);
-					_software_transform_vertex(bB->pos, transform);
-					_software_transform_vertex(bC->pos, transform);
-					_software_transform_vertex(bD->pos, transform);
+				if (r_fill_state.transform_mode == TM_ALL) {
+					_software_transform_vertex(bA->pos, r_fill_state.transform_combined);
+					_software_transform_vertex(bB->pos, r_fill_state.transform_combined);
+					_software_transform_vertex(bC->pos, r_fill_state.transform_combined);
+					_software_transform_vertex(bD->pos, r_fill_state.transform_combined);
 				}
 
 				// uvs
@@ -1452,6 +1479,7 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
 	FillState fill_state;
 	fill_state.reset();
 	fill_state.use_hardware_transform = p_bij.use_hardware_transform();
+	fill_state.extra_matrix_sent = false;
 
 	for (unsigned int i = 0; i < p_bij.num_item_refs; i++) {
 		const BItemRef &ref = bdata.item_refs[p_bij.first_item_ref + i];
@@ -1461,6 +1489,23 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
 		int command_count = item->commands.size();
 		int command_start = 0;
 
+		// ONCE OFF fill state setup, that will be retained over multiple calls to
+		// prefill_joined_item()
+		fill_state.transform_combined = item->final_transform;
+
+		// decide the initial transform mode, and make a backup
+		// in orig_transform_mode in case we need to switch back
+		if (!fill_state.use_hardware_transform) {
+			fill_state.transform_mode = _find_transform_mode(fill_state.transform_combined);
+		} else {
+			fill_state.transform_mode = TM_NONE;
+		}
+		fill_state.orig_transform_mode = fill_state.transform_mode;
+
+		// keep track of when we added an extra matrix
+		// so we can defer sending until we see a default command
+		fill_state.transform_extra_command_number_p1 = 0;
+
 		while (command_start < command_count) {
 			// fill as many batches as possible (until all done, or the vertex buffer is full)
 			bool bFull = prefill_joined_item(fill_state, command_start, item, p_current_clip, r_reclip, p_material);
@@ -1469,7 +1514,6 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
 				// always pass first item (commands for default are always first item)
 				flush_render_batches(first_item, p_current_clip, r_reclip, p_material);
 				fill_state.reset();
-				fill_state.use_hardware_transform = p_bij.use_hardware_transform();
 			}
 		}
 	}
@@ -1799,7 +1843,7 @@ bool RasterizerCanvasGLES2::try_join_item(Item *p_ci, RenderItemState &r_ris, bo
 	}
 
 	// non rects will break the batching anyway, we don't want to record item changes, detect this
-	if (_detect_batch_break(p_ci)) {
+	if (!r_batch_break && _detect_batch_break(p_ci)) {
 		join = false;
 		r_batch_break = true;
 	}
@@ -1847,7 +1891,8 @@ bool RasterizerCanvasGLES2::_detect_batch_break(Item *p_ci) {
 				default: {
 					return true;
 				} break;
-				case Item::Command::TYPE_RECT: {
+				case Item::Command::TYPE_RECT:
+				case Item::Command::TYPE_TRANSFORM: {
 				} break;
 			} // switch
 
diff --git a/drivers/gles2/rasterizer_canvas_gles2.h b/drivers/gles2/rasterizer_canvas_gles2.h
index cf8adba95e9..4de3a197c27 100644
--- a/drivers/gles2/rasterizer_canvas_gles2.h
+++ b/drivers/gles2/rasterizer_canvas_gles2.h
@@ -203,9 +203,10 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
 
 	struct FillState {
 		void reset() {
+			// don't reset members that need to be preserved after flushing
+			// half way through a list of commands
 			curr_batch = 0;
 			batch_tex_id = -1;
-			use_hardware_transform = true;
 			texpixel_size = Vector2(1, 1);
 		}
 		Batch *curr_batch;
@@ -213,6 +214,13 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
 		bool use_hardware_transform;
 		Vector2 texpixel_size;
 		Color final_modulate;
+		TransformMode transform_mode;
+		TransformMode orig_transform_mode;
+
+		// support for extra matrices
+		bool extra_matrix_sent; // whether sent on this item (in which case software transform can't be used until end of item)
+		int transform_extra_command_number_p1; // plus one to allow fast checking against zero
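+		Transform2D transform_combined; // final * extra in software transform mode, just extra in hardware transform mode (plain final until a transform command is seen)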
 	};
 
 public:
@@ -247,8 +255,8 @@ private:
 	bool _detect_batch_break(Item *p_ci);
 	void _software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const;
 	void _software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const;
-	TransformMode _find_transform_mode(bool p_use_hardware_transform, const Transform2D &p_tr, Transform2D &r_tr) const;
-	_FORCE_INLINE_ void _prefill_default_batch(FillState &r_fill_state, int p_command_num);
+	TransformMode _find_transform_mode(const Transform2D &p_tr) const;
+	_FORCE_INLINE_ void _prefill_default_batch(FillState &r_fill_state, int p_command_num, const Item &p_item);
 
 	// light scissoring
 	bool _light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const;
@@ -262,12 +270,88 @@ public:
 
 //////////////////////////////////////////////////////////////
 
-_FORCE_INLINE_ void RasterizerCanvasGLES2::_prefill_default_batch(FillState &r_fill_state, int p_command_num) {
+// Default batches will not occur in software transform only items
+// EXCEPT IN THE CASE OF SINGLE RECTS (and this may well not occur, check the logic in prefill_joined_item TYPE_RECT)
+// but can occur where transform commands have been sent during hardware batch
+_FORCE_INLINE_ void RasterizerCanvasGLES2::_prefill_default_batch(FillState &r_fill_state, int p_command_num, const Item &p_item) {
 	if (r_fill_state.curr_batch->type == Batch::BT_DEFAULT) {
-		// another default command, just add to the existing batch
-		r_fill_state.curr_batch->num_commands++;
+		// don't need to flush an extra transform command?
+		if (!r_fill_state.transform_extra_command_number_p1) {
+			// another default command, just add to the existing batch
+			r_fill_state.curr_batch->num_commands++;
+		} else {
+#ifdef DEBUG_ENABLED
+			if (r_fill_state.transform_extra_command_number_p1 != p_command_num) {
+				WARN_PRINT_ONCE("_prefill_default_batch : transform_extra_command_number_p1 != p_command_num");
+			}
+#endif
+			// we do have a pending extra transform command to flush
+			// either the extra transform is in the prior command, or not, in which case we need 2 batches
+			//			if (r_fill_state.transform_extra_command_number_p1 == p_command_num) {
+			// this should be most common case
+			r_fill_state.curr_batch->num_commands += 2;
+			//			} else {
+			//				// mad ordering .. does this even happen?
+			//				int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based
+
+			//				// send the extra to the GPU in a batch
+			//				r_fill_state.curr_batch = _batch_request_new();
+			//				r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
+			//				r_fill_state.curr_batch->first_command = extra_command;
+			//				r_fill_state.curr_batch->num_commands = 1;
+
+			//				// start default batch
+			//				r_fill_state.curr_batch = _batch_request_new();
+			//				r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
+			//				r_fill_state.curr_batch->first_command = p_command_num;
+			//				r_fill_state.curr_batch->num_commands = 1;
+			//			}
+
+			r_fill_state.transform_extra_command_number_p1 = 0; // mark as sent
+			r_fill_state.extra_matrix_sent = true;
+
+			// the original mode should always be hardware transform ..
+			// test this assumption
+			r_fill_state.transform_mode = r_fill_state.orig_transform_mode;
+
+			// do we need to restore anything else?
+		}
 	} else {
 		// end of previous different type batch, so start new default batch
+
+		// first consider whether there is a dirty extra matrix to send
+		if (r_fill_state.transform_extra_command_number_p1) {
+			// get which command the extra is in, and blank all the records as it no longer is stored CPU side
+			int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based
+			r_fill_state.transform_extra_command_number_p1 = 0;
+			r_fill_state.extra_matrix_sent = true;
+
+			// send the extra to the GPU in a batch
+			r_fill_state.curr_batch = _batch_request_new();
+			r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
+			r_fill_state.curr_batch->first_command = extra_command;
+			r_fill_state.curr_batch->num_commands = 1;
+
+			// revert to the original transform mode
+			// e.g. go back to NONE if we were in hardware transform mode
+			r_fill_state.transform_mode = r_fill_state.orig_transform_mode;
+
+			// reset the original transform if we are going back to software mode,
+			// because the extra is now done on the GPU...
+			// (any subsequent extras are sent directly to the GPU, no deferring)
+			if (r_fill_state.orig_transform_mode != TM_NONE) {
+				r_fill_state.transform_combined = p_item.final_transform;
+			}
+
+			// can possibly combine batch with the next one in some cases
+			// this is more efficient than having an extra batch especially for the extra
+			if ((extra_command + 1) == p_command_num) {
+				r_fill_state.curr_batch->num_commands = 2;
+				return;
+			}
+		}
+
+		// start default batch
 		r_fill_state.curr_batch = _batch_request_new();
 		r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
 		r_fill_state.curr_batch->first_command = p_command_num;
@@ -285,22 +369,16 @@ _FORCE_INLINE_ void RasterizerCanvasGLES2::_software_transform_vertex(Vector2 &r
 	r_v = p_tr.xform(r_v);
 }
 
-_FORCE_INLINE_ RasterizerCanvasGLES2::TransformMode RasterizerCanvasGLES2::_find_transform_mode(bool p_use_hardware_transform, const Transform2D &p_tr, Transform2D &r_tr) const {
-	if (!p_use_hardware_transform) {
-		r_tr = p_tr;
-
-		// decided whether to do translate only for software transform
-		if ((p_tr.elements[0].x == 1.0) &&
-				(p_tr.elements[0].y == 0.0) &&
-				(p_tr.elements[1].x == 0.0) &&
-				(p_tr.elements[1].y == 1.0)) {
-			return TM_TRANSLATE;
-		} else {
-			return TM_ALL;
-		}
+_FORCE_INLINE_ RasterizerCanvasGLES2::TransformMode RasterizerCanvasGLES2::_find_transform_mode(const Transform2D &p_tr) const {
+	// decide whether translate-only is enough for software transform: elements[0] and elements[1] of p_tr must be exactly (1, 0) and (0, 1)
+	if ((p_tr.elements[0].x == 1.0) &&
+			(p_tr.elements[0].y == 0.0) &&
+			(p_tr.elements[1].x == 0.0) &&
+			(p_tr.elements[1].y == 1.0)) {
+		return TM_TRANSLATE; // exact floating-point match on elements[0] and elements[1]
 	}
 
-	return TM_NONE;
+	return TM_ALL; // any other matrix; this function no longer returns TM_NONE (callers set that themselves for hardware transform)
 }
 
 #endif // RASTERIZERCANVASGLES2_H