diff options
Diffstat (limited to 'vendor/gioui.org/shader/piet/binning.comp')
-rw-r--r-- | vendor/gioui.org/shader/piet/binning.comp | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/vendor/gioui.org/shader/piet/binning.comp b/vendor/gioui.org/shader/piet/binning.comp new file mode 100644 index 0000000..acda83c --- /dev/null +++ b/vendor/gioui.org/shader/piet/binning.comp @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The binning stage of the pipeline. +// +// Each workgroup processes N_TILE paths. +// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask +// based on the path bounding box to bin the paths. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" +#include "bins.h" + +// scale factors useful for converting coordinates to bins +#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) +#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) + +// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000) +#define INFINITY (1.0 / 0.0) + +// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. +// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) +shared uint bitmaps[N_SLICE][N_TILE]; +shared uint count[N_SLICE][N_TILE]; +shared Alloc sh_chunk_alloc[N_TILE]; +// Really a bool, but some Metal devices don't accept shared bools. +shared uint sh_alloc_failed; + +void main() { + uint my_n_elements = conf.n_elements; + uint my_partition = gl_WorkGroupID.x; + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + if (gl_LocalInvocationID.x == 0) { + sh_alloc_failed = 0; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + uint tag = Annotated_Nop; + if (element_ix < my_n_elements) { + tag = Annotated_tag(conf.anno_alloc, ref).tag; + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Color: + case Annotated_Image: + case Annotated_BeginClip: + case Annotated_EndClip: + // Note: we take advantage of the fact that these drawing elements + // have the bbox at the same place in their layout. + AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref); + x0 = int(floor(clip.bbox.x * SX)); + y0 = int(floor(clip.bbox.y * SY)); + x1 = int(ceil(clip.bbox.z * SX)); + y1 = int(ceil(clip.bbox.w * SY)); + break; + } + + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. + uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; + uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + count[i][gl_LocalInvocationID.x] = element_count; + } + // element_count is number of elements covering bin for this invocation. + Alloc chunk_alloc = new_alloc(0, 0, true); + if (element_count != 0) { + // TODO: aggregate atomic adds (subgroup is probably fastest) + MallocResult chunk = malloc(element_count * BinInstance_size); + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) { + sh_alloc_failed = 1; + } + } + // Note: it might be more efficient for reading to do this in the + // other order (each bin is a contiguous sequence of partitions) + uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + write_mem(conf.bin_alloc, out_ix, element_count); + write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); + + barrier(); + if (sh_alloc_failed != 0 || mem_error != NO_ERROR) { + return; + } + + // Use similar strategy as Laine & Karras paper; loop over bbox of bins + // touched by this element + x = x0; + y = y0; + while (y < y1) { + uint bin_ix = y * width_in_bins + x; + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0) { + uint idx = bitCount(out_mask & (my_mask - 1)); + if (my_slice > 0) { + idx += count[my_slice - 1][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + idx * BinInstance_size; + BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); + } + x++; + if (x == x1) { + x = x0; + y++; + } + } +} |