// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "bins.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
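
// A bin spans N_TILE_X x N_TILE_Y tiles of TILE_WIDTH_PX x TILE_HEIGHT_PX
// pixels, so scaling a pixel coordinate by SX/SY yields a bin coordinate.
// For example, with N_TILE_X = 16 and TILE_WIDTH_PX = 16 (illustrative
// values; the real constants live in setup.h), SX = 1/256, and a bbox edge
// at x = 700 px falls in bin column floor(700 / 256) = 2.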

// GLSL has no built-in infinity constant. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256 bits into 8 (N_SLICE) 32-bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
// Really a bool, but some Metal devices don't accept shared bools.
shared uint sh_alloc_failed;
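
// Bin b's full coverage mask is the N_TILE bits spread across
// bitmaps[0][b] .. bitmaps[N_SLICE-1][b]: bit i of slice s is set when
// element s * 32 + i of this partition touches bin b.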

void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = 0;
    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Color:
    case Annotated_Image:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
        x0 = int(floor(clip.bbox.x * SX));
        y0 = int(floor(clip.bbox.y * SY));
        x1 = int(ceil(clip.bbox.z * SX));
        y1 = int(ceil(clip.bbox.w * SY));
        break;
    }
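    // Any other tag (including Annotated_Nop, assigned above to indices past
    // the end of the input) keeps the empty bbox and so covers no bins.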

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
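    // A zero-width bbox covers no bins; collapse the y range as well so the
    // coverage loops below run zero iterations (the x wrap-around at x == x1
    // would otherwise never trigger).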
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
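    // This thread's bit position in the sliced bitmaps: which 32-bit slice
    // it falls in, and a one-hot mask within that slice.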
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
    // Allocate output segments.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
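    // count[i][b] now holds the inclusive prefix count of set bits in bin
    // b's mask through slice i; earlier slices are added to the rank below.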
    // element_count is the number of elements covering this invocation's bin.
    Alloc chunk_alloc = new_alloc(0, 0, true);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = 1;
        }
    }
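    // Threads whose bin is empty leave sh_chunk_alloc unset; the scatter
    // loop below only reads it for bins whose mask has a bit set, and those
    // bins were allocated a chunk here.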
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
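    // Each bin's header is two words per partition: the element count and
    // the chunk offset. The >> 2 converts bin_alloc's byte offset into the
    // 32-bit word index that write_mem expects.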
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

    barrier();
    if (sh_alloc_failed != 0 || mem_error != NO_ERROR) {
        return;
    }

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
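    // An element's slot within a bin's chunk is its rank among the elements
    // covering that bin: the count of lower set bits in its own slice word,
    // plus the running count of all earlier slices. E.g. for a hypothetical
    // out_mask = 0x16 and my_mask = 0x10, bitCount(0x16 & 0xf) = 2.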
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}