Batching with Extra Matrix commands

Defers sending 'transform' commands within a RasterizerCanvas::Item until they are needed for default batches. Instead, the extra matrix is cached locally and applied using software transform, preventing unnecessary batch breaks.

The logic is relatively complex, and having a whole 'extra matrix' in the legacy renderer in addition to the final_transform is not ideal. However, this is required to accelerate some user drawing techniques and, later, the lines in the IDE.
This commit is contained in:
lawnjelly 2020-04-15 12:38:13 +01:00
parent 93af8e7d1b
commit a4cd274ca7
2 changed files with 159 additions and 36 deletions

View File

@ -221,9 +221,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
int command_count = p_item->commands.size();
Item::Command *const *commands = p_item->commands.ptr();
Transform2D transform;
TransformMode transform_mode = _find_transform_mode(r_fill_state.use_hardware_transform, p_item->final_transform, transform);
// just a local, might be more efficient in a register (check)
Vector2 texpixel_size = r_fill_state.texpixel_size;
// checking the color for not being white makes it 92/90 times faster in the case where it is white
@ -252,7 +250,36 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
switch (command->type) {
default: {
_prefill_default_batch(r_fill_state, command_num);
_prefill_default_batch(r_fill_state, command_num, *p_item);
} break;
case Item::Command::TYPE_TRANSFORM: {
// if the extra matrix has been sent already,
// break this extra matrix software path (as we don't want to unset it on the GPU etc)
if (r_fill_state.extra_matrix_sent) {
_prefill_default_batch(r_fill_state, command_num, *p_item);
} else {
// Extra matrix fast path.
// Instead of sending the command immediately, we store the modified transform (in transform_combined)
// for software transform, and only flush this transform command if we NEED to (i.e. we want to
// render some default commands)
Item::CommandTransform *transform = static_cast<Item::CommandTransform *>(command);
const Transform2D &extra_matrix = transform->xform;
if (r_fill_state.use_hardware_transform) {
// if we are using hardware transform mode, we have already sent the final transform,
// so we only want to software transform the extra matrix
r_fill_state.transform_combined = extra_matrix;
} else {
r_fill_state.transform_combined = p_item->final_transform * extra_matrix;
}
// after a transform command, always use some form of software transform (either the combined final + extra, or just the extra)
// until we flush this dirty extra matrix because we need to render default commands.
r_fill_state.transform_mode = _find_transform_mode(r_fill_state.transform_combined);
// make a note of which command the dirty extra matrix is stored in, so we can send it later
// if necessary
r_fill_state.transform_extra_command_number_p1 = command_num + 1; // plus 1 so we can test against zero
}
} break;
case Item::Command::TYPE_RECT: {
@ -277,7 +304,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
int command_num_next = command_num + 1;
if (command_num_next < command_count) {
Item::Command *command_next = commands[command_num_next];
if (command_next->type != Item::Command::TYPE_RECT) {
if ((command_next->type != Item::Command::TYPE_RECT) && (command_next->type != Item::Command::TYPE_TRANSFORM)) {
is_single_rect = true;
}
} else {
@ -285,7 +312,7 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
}
// if it is a rect on its own, do exactly the same as the default routine
if (is_single_rect) {
_prefill_default_batch(r_fill_state, command_num);
_prefill_default_batch(r_fill_state, command_num, *p_item);
break;
}
} // if use hardware transform
@ -352,8 +379,8 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
// fill the quad geometry
Vector2 mins = rect->rect.position;
if (transform_mode == TM_TRANSLATE) {
_software_transform_vertex(mins, transform);
if (r_fill_state.transform_mode == TM_TRANSLATE) {
_software_transform_vertex(mins, r_fill_state.transform_combined);
}
Vector2 maxs = mins + rect->rect.size;
@ -385,11 +412,11 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
SWAP(bB->pos, bC->pos);
}
if (transform_mode == TM_ALL) {
_software_transform_vertex(bA->pos, transform);
_software_transform_vertex(bB->pos, transform);
_software_transform_vertex(bC->pos, transform);
_software_transform_vertex(bD->pos, transform);
if (r_fill_state.transform_mode == TM_ALL) {
_software_transform_vertex(bA->pos, r_fill_state.transform_combined);
_software_transform_vertex(bB->pos, r_fill_state.transform_combined);
_software_transform_vertex(bC->pos, r_fill_state.transform_combined);
_software_transform_vertex(bD->pos, r_fill_state.transform_combined);
}
// uvs
@ -1452,6 +1479,7 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
FillState fill_state;
fill_state.reset();
fill_state.use_hardware_transform = p_bij.use_hardware_transform();
fill_state.extra_matrix_sent = false;
for (unsigned int i = 0; i < p_bij.num_item_refs; i++) {
const BItemRef &ref = bdata.item_refs[p_bij.first_item_ref + i];
@ -1461,6 +1489,23 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
int command_count = item->commands.size();
int command_start = 0;
// ONCE OFF fill state setup, that will be retained over multiple calls to
// prefill_joined_item()
fill_state.transform_combined = item->final_transform;
// decide the initial transform mode, and make a backup
// in orig_transform_mode in case we need to switch back
if (!fill_state.use_hardware_transform) {
fill_state.transform_mode = _find_transform_mode(fill_state.transform_combined);
} else {
fill_state.transform_mode = TM_NONE;
}
fill_state.orig_transform_mode = fill_state.transform_mode;
// keep track of when we added an extra matrix
// so we can defer sending until we see a default command
fill_state.transform_extra_command_number_p1 = 0;
while (command_start < command_count) {
// fill as many batches as possible (until all done, or the vertex buffer is full)
bool bFull = prefill_joined_item(fill_state, command_start, item, p_current_clip, r_reclip, p_material);
@ -1469,7 +1514,6 @@ void RasterizerCanvasGLES2::render_joined_item_commands(const BItemJoined &p_bij
// always pass first item (commands for default are always first item)
flush_render_batches(first_item, p_current_clip, r_reclip, p_material);
fill_state.reset();
fill_state.use_hardware_transform = p_bij.use_hardware_transform();
}
}
}
@ -1799,7 +1843,7 @@ bool RasterizerCanvasGLES2::try_join_item(Item *p_ci, RenderItemState &r_ris, bo
}
// non rects will break the batching anyway, we don't want to record item changes, detect this
if (_detect_batch_break(p_ci)) {
if (!r_batch_break && _detect_batch_break(p_ci)) {
join = false;
r_batch_break = true;
}
@ -1847,7 +1891,8 @@ bool RasterizerCanvasGLES2::_detect_batch_break(Item *p_ci) {
default: {
return true;
} break;
case Item::Command::TYPE_RECT: {
case Item::Command::TYPE_RECT:
case Item::Command::TYPE_TRANSFORM: {
} break;
} // switch

View File

@ -203,9 +203,10 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
struct FillState {
void reset() {
// don't reset members that need to be preserved after flushing
// half way through a list of commands
curr_batch = 0;
batch_tex_id = -1;
use_hardware_transform = true;
texpixel_size = Vector2(1, 1);
}
Batch *curr_batch;
@ -213,6 +214,13 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
bool use_hardware_transform;
Vector2 texpixel_size;
Color final_modulate;
TransformMode transform_mode;
TransformMode orig_transform_mode;
// support for extra matrices
bool extra_matrix_sent; // whether sent on this item (in which case software transform can't be used until end of item)
int transform_extra_command_number_p1; // plus one to allow fast checking against zero
Transform2D transform_combined; // final * extra
};
public:
@ -247,8 +255,8 @@ private:
bool _detect_batch_break(Item *p_ci);
void _software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const;
void _software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const;
TransformMode _find_transform_mode(bool p_use_hardware_transform, const Transform2D &p_tr, Transform2D &r_tr) const;
_FORCE_INLINE_ void _prefill_default_batch(FillState &r_fill_state, int p_command_num);
TransformMode _find_transform_mode(const Transform2D &p_tr) const;
_FORCE_INLINE_ void _prefill_default_batch(FillState &r_fill_state, int p_command_num, const Item &p_item);
// light scissoring
bool _light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const;
@ -262,12 +270,88 @@ public:
//////////////////////////////////////////////////////////////
_FORCE_INLINE_ void RasterizerCanvasGLES2::_prefill_default_batch(FillState &r_fill_state, int p_command_num) {
// Default batches will not occur in software transform only items
// EXCEPT IN THE CASE OF SINGLE RECTS (and this may well not occur, check the logic in prefill_joined_item TYPE_RECT)
// but can occur where transform commands have been sent during hardware batch
_FORCE_INLINE_ void RasterizerCanvasGLES2::_prefill_default_batch(FillState &r_fill_state, int p_command_num, const Item &p_item) {
if (r_fill_state.curr_batch->type == Batch::BT_DEFAULT) {
// another default command, just add to the existing batch
r_fill_state.curr_batch->num_commands++;
// don't need to flush an extra transform command?
if (!r_fill_state.transform_extra_command_number_p1) {
// another default command, just add to the existing batch
r_fill_state.curr_batch->num_commands++;
} else {
#ifdef DEBUG_ENABLED
if (r_fill_state.transform_extra_command_number_p1 != p_command_num) {
WARN_PRINT_ONCE("_prefill_default_batch : transform_extra_command_number_p1 != p_command_num");
}
#endif
// we do have a pending extra transform command to flush
// either the extra transform is in the prior command, or not, in which case we need 2 batches
// if (r_fill_state.transform_extra_command_number_p1 == p_command_num) {
// this should be most common case
r_fill_state.curr_batch->num_commands += 2;
// } else {
// // mad ordering .. does this even happen?
// int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based
// // send the extra to the GPU in a batch
// r_fill_state.curr_batch = _batch_request_new();
// r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
// r_fill_state.curr_batch->first_command = extra_command;
// r_fill_state.curr_batch->num_commands = 1;
// // start default batch
// r_fill_state.curr_batch = _batch_request_new();
// r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
// r_fill_state.curr_batch->first_command = p_command_num;
// r_fill_state.curr_batch->num_commands = 1;
// }
r_fill_state.transform_extra_command_number_p1 = 0; // mark as sent
r_fill_state.extra_matrix_sent = true;
// the original mode should always be hardware transform ..
// test this assumption
r_fill_state.transform_mode = r_fill_state.orig_transform_mode;
// do we need to restore anything else?
}
} else {
// end of previous different type batch, so start new default batch
// first consider whether there is a dirty extra matrix to send
if (r_fill_state.transform_extra_command_number_p1) {
// get which command the extra is in, and blank all the records as it no longer is stored CPU side
int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based
r_fill_state.transform_extra_command_number_p1 = 0;
r_fill_state.extra_matrix_sent = true;
// send the extra to the GPU in a batch
r_fill_state.curr_batch = _batch_request_new();
r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
r_fill_state.curr_batch->first_command = extra_command;
r_fill_state.curr_batch->num_commands = 1;
// revert to the original transform mode
// e.g. go back to NONE if we were in hardware transform mode
r_fill_state.transform_mode = r_fill_state.orig_transform_mode;
// reset the original transform if we are going back to software mode,
// because the extra is now done on the GPU...
// (any subsequent extras are sent directly to the GPU, no deferring)
if (r_fill_state.orig_transform_mode != TM_NONE) {
r_fill_state.transform_combined = p_item.final_transform;
}
// can possibly combine batch with the next one in some cases
// this is more efficient than having an extra batch especially for the extra
if ((extra_command + 1) == p_command_num) {
r_fill_state.curr_batch->num_commands = 2;
return;
}
}
// start default batch
r_fill_state.curr_batch = _batch_request_new();
r_fill_state.curr_batch->type = Batch::BT_DEFAULT;
r_fill_state.curr_batch->first_command = p_command_num;
@ -285,22 +369,16 @@ _FORCE_INLINE_ void RasterizerCanvasGLES2::_software_transform_vertex(Vector2 &r
r_v = p_tr.xform(r_v);
}
_FORCE_INLINE_ RasterizerCanvasGLES2::TransformMode RasterizerCanvasGLES2::_find_transform_mode(bool p_use_hardware_transform, const Transform2D &p_tr, Transform2D &r_tr) const {
if (!p_use_hardware_transform) {
r_tr = p_tr;
// decide whether to do translate only for software transform
if ((p_tr.elements[0].x == 1.0) &&
(p_tr.elements[0].y == 0.0) &&
(p_tr.elements[1].x == 0.0) &&
(p_tr.elements[1].y == 1.0)) {
return TM_TRANSLATE;
} else {
return TM_ALL;
}
_FORCE_INLINE_ RasterizerCanvasGLES2::TransformMode RasterizerCanvasGLES2::_find_transform_mode(const Transform2D &p_tr) const {
// decide whether to do translate only for software transform
if ((p_tr.elements[0].x == 1.0) &&
(p_tr.elements[0].y == 0.0) &&
(p_tr.elements[1].x == 0.0) &&
(p_tr.elements[1].y == 1.0)) {
return TM_TRANSLATE;
}
return TM_NONE;
return TM_ALL;
}
#endif // RASTERIZERCANVASGLES2_H