diff options
Diffstat (limited to 'vendor/gioui.org/shader/piet/elements.comp')
-rw-r--r-- | vendor/gioui.org/shader/piet/elements.comp | 413 |
1 file changed, 413 insertions, 0 deletions
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

// Scan geometry: each invocation processes N_ROWS consecutive elements, so a
// workgroup (one "partition" in the prefix-sum paper) covers PARTITION_SIZE
// elements.
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(set = 0, binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 3) volatile buffer StateBuf {
    // Monotonic counter used to assign partition indices to workgroups.
    uint part_counter;
    // Per-partition records: one flag word followed by aggregate and prefix
    // State values (see StateBuf_stride below).
    uint[] state;
};

#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"
#include "tile.h"

// Layout of one per-partition record in `state`, in bytes:
// 4-byte flag word, then the aggregate State, then the prefix State.
#define StateBuf_stride (4 + 2 * State_size)

// Byte offset of the aggregate State for the given partition (skips the
// buffer-level part_counter word).
StateRef state_aggregate_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride);
}

// Byte offset of the inclusive-prefix State for the given partition.
StateRef state_prefix_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}

// Index (in uints) of the readiness flag word for the given partition.
uint state_flag_index(uint partition_ix) {
    return partition_ix * (StateBuf_stride / 4);
}

// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2

// Bits in State.flags.
#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
#define FLAG_SET_FILL_MODE 8
// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
#define LG_FILL_MODE 4
#define FILL_MODE_BITS 1
#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)

// Combine two scan states, `a` preceding `b` in element order.
//
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate): b's bbox is transformed by a's affine
// transform, then unioned with a's bbox unless a reset intervened.
State combine_state(State a, State b) {
    State c;
    // Transform b's bbox by a's 2x2 matrix + translation. min/max over the
    // per-axis products yields the axis-aligned bounds of the transformed box.
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    // An empty bbox has z <= x (and w <= y). If b's bbox is empty and a does
    // not reset, keep a's bbox; otherwise union the two when both are live.
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // Compose the affine transforms: c = a * b.
    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    // Rightmost SetLineWidth wins.
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
    // A reset on the left becomes "bbox was set" for consumers to the right:
    // FLAG_RESET_BBOX (4) >> 1 == FLAG_SET_BBOX (2).
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    // Rightmost SetFillMode wins; carry its fill-mode bit through.
    uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
    fill_mode &= FILL_MODE_MASK;
    c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
    c.path_count = a.path_count + b.path_count;
    c.pathseg_count = a.pathseg_count + b.pathseg_count;
    c.trans_count = a.trans_count + b.trans_count;
    return c;
}

// Compute the scan state contributed by a single input element: its local
// bbox (for path segments), counter increments, and any flag/linewidth/
// transform/fill-mode changes it introduces. Identity state for unknown tags.
State map_element(ElementRef ref) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref).tag;
    State c;
    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
    c.translate = vec2(0.0, 0.0);
    c.linewidth = 1.0; // TODO should be 0.0
    c.flags = 0;
    c.path_count = 0;
    c.pathseg_count = 0;
    c.trans_count = 0;
    switch (tag) {
    case Element_Line:
        LineSeg line = Element_Line_read(ref);
        c.bbox.xy = min(line.p0, line.p1);
        c.bbox.zw = max(line.p0, line.p1);
        c.pathseg_count = 1;
        break;
    case Element_Quad:
        QuadSeg quad = Element_Quad_read(ref);
        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
        c.pathseg_count = 1;
        break;
    case Element_Cubic:
        CubicSeg cubic = Element_Cubic_read(ref);
        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        c.pathseg_count = 1;
        break;
    case Element_FillColor:
    case Element_FillImage:
    case Element_BeginClip:
        // Terminates a path: the accumulated bbox is consumed and reset.
        c.flags = FLAG_RESET_BBOX;
        c.path_count = 1;
        break;
    case Element_EndClip:
        c.path_count = 1;
        break;
    case Element_SetLineWidth:
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        c.linewidth = lw.width;
        c.flags = FLAG_SET_LINEWIDTH;
        break;
    case Element_Transform:
        Transform t = Element_Transform_read(ref);
        c.mat = t.mat;
        c.translate = t.translate;
        c.trans_count = 1;
        break;
    case Element_SetFillMode:
        SetFillMode fm = Element_SetFillMode_read(ref);
        c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
        break;
    }
    return c;
}
// Get the bounding box of a circle transformed by the matrix into an ellipse.
// Returns the half-extents (x, y) of the stroke expansion for the current
// linewidth under the transform in `st`.
vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}

// Per-workgroup scan storage (Hillis-Steele over the WG_SIZE row aggregates).
shared State sh_state[WG_SIZE];

// Partition index assigned to this workgroup via the atomic counter.
shared uint sh_part_ix;
// Exclusive prefix for this partition, published by the lookback thread.
shared State sh_prefix;

// Entry point: scan the input elements, resolve the cross-partition prefix
// with decoupled lookback, then emit annotated elements / path segments /
// transforms to the output buffers.
//
// Fix vs. the original: in the Element_BeginClip case the non-stroke branch
// assigned `anno_fill.linewidth` (the FillColor case's variable), leaving
// `anno_begin_clip.linewidth` uninitialized; it now writes anno_begin_clip.
void main() {
    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_part_ix = atomicAdd(part_counter, 1);
    }
    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

    // Sequential scan over this thread's N_ROWS elements.
    th_state[0] = map_element(ref);
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
    }
    // Workgroup-level inclusive scan of the per-thread aggregates.
    State agg = th_state[N_ROWS - 1];
    sh_state[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
            agg = combine_state(other, agg);
        }
        barrier();
        sh_state[gl_LocalInvocationID.x] = agg;
    }

    // Identity state; becomes the exclusive prefix of this partition.
    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
    exclusive.flags = 0;
    exclusive.path_count = 0;
    exclusive.pathseg_count = 0;
    exclusive.trans_count = 0;

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(part_ix), agg);
    }
    memoryBarrierBuffer();
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        uint flag = FLAG_AGGREGATE_READY;
        if (part_ix == 0) {
            // Partition 0 has no predecessors; its aggregate is its prefix.
            State_write(state_prefix_ref(part_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(part_ix)] = flag;
        if (part_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = part_ix - 1;

            State their_agg;
            uint their_ix = 0;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    // Predecessor's full prefix is available: done looking back.
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    // Only the aggregate is ready: fold it in and keep walking left.
                    their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                    their_ix = 0;
                    continue;
                }
                // else spin

                // Unfortunately there's no guarantee of forward progress of other
                // workgroups, so compute a bit of the aggregate before trying again.
                // In the worst case, spinning stops when the aggregate is complete.
                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
                State s = map_element(ref);
                if (their_ix == 0) {
                    their_agg = s;
                } else {
                    their_agg = combine_state(their_agg, s);
                }
                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    // Recomputed the whole predecessor partition ourselves.
                    exclusive = combine_state(their_agg, exclusive);
                    if (look_back_ix == 0) {
                        break;
                    }
                    look_back_ix--;
                    their_ix = 0;
                }
            }

            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(part_ix), inclusive_prefix);
        }
    }
    memoryBarrierBuffer();
    if (gl_LocalInvocationID.x == WG_SIZE - 1 && part_ix != 0) {
        state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
    }
    barrier();
    if (part_ix != 0) {
        exclusive = sh_prefix;
    }

    // Combine the partition-exclusive prefix with the within-workgroup
    // exclusive prefix for this thread.
    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        State other = sh_state[gl_LocalInvocationID.x - 1];
        row = combine_state(row, other);
    }
    for (uint i = 0; i < N_ROWS; i++) {
        // Inclusive state at element i: counters are 1-based here, hence the
        // `- 1` when computing output slots below.
        State st = combine_state(row, th_state[i]);

        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        ElementTag tag = Element_tag(this_ref);
        uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
        bool is_stroke = fill_mode == MODE_STROKE;
        switch (tag.tag) {
        case Element_Line:
            // Lines are output as degree-elevated cubics.
            LineSeg line = Element_Line_read(this_ref);
            PathCubic path_cubic;
            path_cubic.p0 = line.p0;
            path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
            path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
            path_cubic.p3 = line.p1;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_Quad:
            // Quadratics are also elevated to cubics.
            QuadSeg quad = Element_Quad_read(this_ref);
            path_cubic.p0 = quad.p0;
            path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
            path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
            path_cubic.p3 = quad.p2;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_Cubic:
            CubicSeg cubic = Element_Cubic_read(this_ref);
            path_cubic.p0 = cubic.p0;
            path_cubic.p1 = cubic.p1;
            path_cubic.p2 = cubic.p2;
            path_cubic.p3 = cubic.p3;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_FillColor:
            FillColor fill = Element_FillColor_read(this_ref);
            AnnoColor anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            if (is_stroke) {
                // Expand the bbox by the stroke half-widths; scale linewidth
                // by sqrt(|det|) to approximate the transformed width.
                vec2 lw = get_linewidth(st);
                anno_fill.bbox = st.bbox + vec4(-lw, lw);
                anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                anno_fill.bbox = st.bbox;
                anno_fill.linewidth = 0.0;
            }
            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
            break;
        case Element_FillImage:
            FillImage fill_img = Element_FillImage_read(this_ref);
            AnnoImage anno_img;
            anno_img.index = fill_img.index;
            anno_img.offset = fill_img.offset;
            if (is_stroke) {
                vec2 lw = get_linewidth(st);
                anno_img.bbox = st.bbox + vec4(-lw, lw);
                anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                anno_img.bbox = st.bbox;
                anno_img.linewidth = 0.0;
            }
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
            break;
        case Element_BeginClip:
            Clip begin_clip = Element_BeginClip_read(this_ref);
            AnnoBeginClip anno_begin_clip;
            // This is the absolute bbox, it's been transformed during encoding.
            anno_begin_clip.bbox = begin_clip.bbox;
            if (is_stroke) {
                anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                // BUG FIX: was `anno_fill.linewidth = 0.0;`, which wrote the
                // FillColor case's variable and left this field uninitialized.
                anno_begin_clip.linewidth = 0.0;
            }
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
            break;
        case Element_EndClip:
            Clip end_clip = Element_EndClip_read(this_ref);
            // This bbox is expected to be the same as the begin one.
            AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
            break;
        case Element_Transform:
            // Emit the accumulated (composed) transform, not the raw element.
            TransformSeg transform = TransformSeg(st.mat, st.translate);
            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
            TransformSeg_write(conf.trans_alloc, trans_ref, transform);
            break;
        }
    }
}