render-zig

A 3D rendering engine written in Zig
git clone git://git.christianermann.dev/render-zig
Log | Files | Refs

commit e2bc0ba7e6a2b82a3e3f498cb701e5ceca0f4050
parent 1494036a2c88de72e059e729880321db83593b55
Author: Christian Ermann <christianermann@gmail.com>
Date:   Sat, 20 Jul 2024 10:45:21 -0400

Add meshlet generation algorithms

Diffstat:
Asrc/forsyth_optimize.zig | 278+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/half_edge_map.zig | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/main.zig | 342+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Msrc/mesh_render_pipeline.zig | 5+++--
Asrc/meshlets.zig | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/meshlets_adjacency.zig | 446+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/shaders/mesh_clusters.wgsl | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 1320 insertions(+), 142 deletions(-)

diff --git a/src/forsyth_optimize.zig b/src/forsyth_optimize.zig @@ -0,0 +1,278 @@ +const std = @import("std"); +const assert = std.debug.assert; + +const Mesh = @import("mesh.zig"); + +const ScoreParameters = struct { + last_tri: f32 = 0.75, + cache_decay_power: f32 = 1.5, + valence_boost_scale: f32 = 2.0, + valence_boost_power: f32 = 0.5, +}; + +const Vertex = struct { + cache_idx: ?u32 = undefined, + score: f32 = 0, + num_triangles: u32 = 0, + max_triangles: u32 = 0, + triangles: []u32 = undefined, + + fn scoreFn( + self: *const Vertex, + max_cache_idx: u32, + params: ScoreParameters, + ) f32 { + // Vertex will never be used again + const num_active = self.max_triangles - self.num_triangles; + if (num_active <= 0) { + return -1.0; + } + + // Boost score for lonely vertices so they go away fast + const valence_boost = std.math.pow( + f32, + @floatFromInt(num_active), + -params.valence_boost_power, + ); + const score = params.valence_boost_scale * valence_boost; + + // No adjustments for vertices not in the cache yet + if (self.cache_idx == null) { + return score; + } + + // Vertices from last triangle all get the same score + const cache_idx = self.cache_idx.?; + if (cache_idx < 3) { + return score + params.last_tri; + } + + // For other vertices, score depends on cache position + assert(cache_idx < max_cache_idx); + const scale = 1.0 / @as(f32, @floatFromInt(max_cache_idx - 3)); + const adjusted_idx: f32 = @floatFromInt(cache_idx - 3); + const cache_score = 1.0 - adjusted_idx * scale; + return score + std.math.pow(f32, cache_score, params.cache_decay_power); + } +}; + +const Triangle = struct { + added: bool = false, + score: f32 = 0, + vertices: [3]u32 = undefined, +}; + +fn LRUCache(comptime T: type) type { + return struct { + const Self = @This(); + const List = std.DoublyLinkedList(T); + + allocator: std.mem.Allocator, + list: List, + max_size: u32, + + pub fn init(allocator: std.mem.Allocator, max_size: u32) Self { + return .{ + .list = std.DoublyLinkedList(T){}, + .allocator = allocator, + .max_size = max_size, + }; + } + + fn find(cache: *const Self, value: T) ?*List.Node { + var it = cache.list.first; + while (it) |node| : (it = node.next) { + if (node.data == value) { + return node; + } + } + return null; + } + + pub fn insert(cache: *Self, value: T) !?*List.Node { + var node = cache.find(value); + if (node != null) { + cache.list.remove(node.?); + } else { + node = try cache.allocator.create(List.Node); + node.?.data = value; + } + cache.list.prepend(node.?); + + if (cache.list.len > cache.max_size) { + return cache.list.pop(); + } + return null; + } + + pub fn first(cache: *Self) ?*List.Node { + return cache.list.first; + } + }; +} + +pub fn optimize( + allocator: std.mem.Allocator, + mesh: *Mesh, + max_cache_idx: u32, + params: ScoreParameters, +) !void { + // Initialize vertex data structure + const vertices = try allocator.alloc(Vertex, mesh.positions.len); + @memset(vertices, Vertex{}); + for (0..mesh.indices.len / 3) |i| { + const mesh_offset = i * 3; + const a = mesh.indices[mesh_offset]; + const b = mesh.indices[mesh_offset + 1]; + const c = mesh.indices[mesh_offset + 2]; + + vertices[a].max_triangles += 1; + vertices[b].max_triangles += 1; + vertices[c].max_triangles += 1; + } + for (vertices) |*vertex| { + vertex.triangles = try allocator.alloc(u32, vertex.max_triangles); + vertex.max_triangles = 0; + } + for (0..mesh.indices.len / 3) |i| { + const mesh_offset = i * 3; + const a = mesh.indices[mesh_offset]; + const b = mesh.indices[mesh_offset + 1]; + const c = mesh.indices[mesh_offset + 2]; + + const idx: u32 = @intCast(i); + vertices[a].triangles[vertices[a].max_triangles] = idx; + vertices[b].triangles[vertices[b].max_triangles] = idx; + vertices[c].triangles[vertices[c].max_triangles] = idx; + + vertices[a].max_triangles += 1; + vertices[b].max_triangles += 1; + vertices[c].max_triangles += 1; + } + for (vertices) |*vertex| { + vertex.score = vertex.scoreFn(max_cache_idx, params); + } + + // Initialize triangle data structure + const triangles = try allocator.alloc(Triangle, mesh.indices.len / 3); + @memset(triangles, Triangle{}); + for (triangles, 0..) |*tri, i| { + const mesh_offset = i * 3; + const a = mesh.indices[mesh_offset]; + const b = mesh.indices[mesh_offset + 1]; + const c = mesh.indices[mesh_offset + 2]; + tri.vertices[0] = a; + tri.vertices[1] = b; + tri.vertices[2] = c; + tri.score = vertices[a].score + vertices[b].score + vertices[c].score; + } + + // Store new indices here + const indices = try allocator.alloc(u32, mesh.indices.len); + + // Run algorithm + var cache = LRUCache(u32).init(allocator, max_cache_idx); + for (0..triangles.len) |i| { + var tri = bestTriangleFromCache(vertices, triangles, &cache); + if (tri == null) { + tri = bestTriangleOverall(triangles); + } + assert(tri != null); + + const offset = i * 3; + indices[offset] = triangles[tri.?].vertices[0]; + indices[offset + 1] = triangles[tri.?].vertices[1]; + indices[offset + 2] = triangles[tri.?].vertices[2]; + + try addTriangleToCache( + allocator, + tri.?, + vertices, + triangles, + &cache, + params, + ); + } + + allocator.free(mesh.indices); + mesh.indices = indices; +} + +fn addTriangleToCache( + allocator: std.mem.Allocator, + triangle_idx: u32, + vertices: []Vertex, + triangles: []Triangle, + cache: *LRUCache(u32), + params: ScoreParameters, +) !void { + assert(!triangles[triangle_idx].added); + + // Insert triangle + triangles[triangle_idx].added = true; + for (triangles[triangle_idx].vertices) |idx| { + const removed = try cache.insert(idx); + if (removed != null) { + const vtx = &vertices[removed.?.data]; + vtx.cache_idx = null; + vtx.score = vtx.scoreFn(cache.max_size, params); + allocator.destroy(removed.?); + } + vertices[idx].num_triangles += 1; + } + + // Update all other cache entries + var it = cache.first(); + var cache_idx: u32 = 0; + while (it) |node| : (it = node.next) { + const vtx = &vertices[node.data]; + if (cache_idx < cache.max_size) { + vtx.cache_idx = cache_idx; + } else { + vtx.cache_idx = null; + } + cache_idx += 1; + vtx.score = vtx.scoreFn(cache.max_size, params); + } +} + +fn bestTriangleOverall(triangles: []Triangle) ?u32 { + var best_triangle: ?u32 = null; + var best_score: f32 = 0.0; + for (triangles, 0..) |tri, i| { + if (!tri.added and tri.score > best_score) { + best_triangle = @intCast(i); + best_score = tri.score; + } + } + return best_triangle; +} + +fn bestTriangleFromCache( + vertices: []Vertex, + triangles: []Triangle, + cache: *LRUCache(u32), +) ?u32 { + var best_score: f32 = 0.0; + var best_triangle: ?u32 = null; + var it = cache.first(); + while (it) |node| : (it = node.next) { + const vtx = &vertices[node.data]; + for (vtx.triangles) |idx| { + const tri = &triangles[idx]; + if (tri.added) { + continue; + } + const a = vertices[tri.vertices[0]]; + const b = vertices[tri.vertices[1]]; + const c = vertices[tri.vertices[2]]; + tri.score = a.score + b.score + c.score; + + if (tri.score > best_score) { + best_triangle = idx; + best_score = tri.score; + } + } + } + return best_triangle; +} diff --git a/src/half_edge_map.zig b/src/half_edge_map.zig @@ -0,0 +1,67 @@ +const std = @import("std"); + +const HalfEdgeMap = @This(); + +const HalfEdge = struct { + head: u32, + tail: u32, +}; + +map: std.AutoArrayHashMapUnmanaged(HalfEdge, u32), + +pub fn init(indices: []u32, allocator: std.mem.Allocator) !HalfEdgeMap { + const num_indices: u32 = @intCast(indices.len); + const num_triangles = num_indices / 3; + + var map = std.AutoArrayHashMapUnmanaged(HalfEdge, u32){}; + try map.ensureTotalCapacity(allocator, num_indices); + + for (0..num_triangles) |triangle_idx| { + const i = @as(u32, @intCast(triangle_idx)) * 3; + const a = indices[i]; + const b = indices[i + 1]; + const c = indices[i + 2]; + map.putAssumeCapacityNoClobber(.{ .head = a, .tail = b }, c); + map.putAssumeCapacityNoClobber(.{ .head = b, .tail = c }, a); + map.putAssumeCapacityNoClobber(.{ .head = c, .tail = a }, b); + } + return .{ + .map = map, + }; +} + +pub fn insert(self: *HalfEdgeMap, tri: [3]u32) void { + const a = tri[0]; + const b = tri[1]; + const c = tri[2]; + self.map.putAssumeCapacityNoClobber(.{ .head = a, .tail = b }, c); + self.map.putAssumeCapacityNoClobber(.{ .head = b, .tail = c }, a); + self.map.putAssumeCapacityNoClobber(.{ .head = c, .tail = a }, b); +} + +pub fn remove(self: *HalfEdgeMap, tri: [3]u32) bool { + const success0 = self.map.swapRemove(.{ .head = tri[0], .tail = tri[1] }); + const success1 = self.map.swapRemove(.{ .head = tri[1], .tail = tri[2] }); + const success2 = self.map.swapRemove(.{ .head = tri[2], .tail = tri[0] }); + return success0 and success1 and success2; +} + +pub fn adjacentTo(self: *HalfEdgeMap, tri: [3]u32, side: u32) ?[3]u32 { + const a = tri[(side + 1) % 3]; + const b = tri[side]; + const c = self.map.get(.{ .head = a, .tail = b }) orelse { + return null; + }; + return [3]u32{ a, b, c }; +} + +pub fn count(self: *const HalfEdgeMap) usize { + return self.map.count(); +} + +pub fn pop(self: *HalfEdgeMap) [3]u32 { + const kv = self.map.pop(); + const tri = [3]u32{ kv.key.head, kv.key.tail, kv.value }; + _ = self.remove(tri); + return tri; +} diff --git a/src/main.zig b/src/main.zig @@ -15,6 +15,7 @@ const Mesh = @import("mesh.zig"); const Camera = @import("camera.zig"); const input = @import("input.zig"); +const ClusterMesh = @import("meshlets.zig").ClusterMesh; const load_obj = @import("load_obj.zig"); @@ -298,114 +299,75 @@ pub const UniformData = struct { proj_matrix: mat4, }; -pub const Buffer = struct { - data: *gpu.Buffer, - size: u32, - offset: u32, - attrib_size: u32, - - pub fn init_vtx(n_vertices: u32, device: *gpu.Device) Buffer { - std.log.info("buffer init: {} vertices", .{n_vertices}); - const attrib_size = @sizeOf(f32x4); - const size = n_vertices * attrib_size; - const descriptor = gpu.Buffer.Descriptor{ - .size = size, - .usage = .{ .vertex = true, .copy_dst = true }, - .mapped_at_creation = .false, - }; - const data = device.createBuffer(&descriptor); - - return .{ - .data = data, - .size = size, - .offset = 0, - .attrib_size = attrib_size, - }; - } +pub fn Buffer(comptime T: type) type { + return struct { + const Self = @This(); + const attrib_size = @sizeOf(T); - pub fn init_idx(n_indices: u32, device: *gpu.Device) Buffer { - std.log.info("buffer init: {} indices", .{n_indices}); - const attrib_size = @sizeOf(u32); - const size = n_indices * attrib_size; - const descriptor = gpu.Buffer.Descriptor{ - .size = size, - .usage = .{ .index = true, .copy_dst = true }, - .mapped_at_creation = .false, - }; - const data = device.createBuffer(&descriptor); - - return .{ - .data = data, - .size = size, - .offset = 0, - .attrib_size = attrib_size, - }; - } - - pub fn init_indirect(n_meshes: u32, device: *gpu.Device) Buffer { - std.log.info("buffer init: {} indirect", .{n_meshes}); - const attrib_size = @sizeOf(u32) * 5; - const size = n_meshes * attrib_size; - const descriptor = gpu.Buffer.Descriptor{ - .size = size, - .usage = .{ .indirect = true, .copy_dst = true }, - .mapped_at_creation = .false, - }; - const data = device.createBuffer(&descriptor); - return .{ - .data = data, - .size = size, - .offset = 0, - .attrib_size = attrib_size, - }; - } + data: *gpu.Buffer, + size: u32, + offset: u32, + attrib_size: u32, + + pub fn init( + max_attribs: u32, + device: *gpu.Device, + usage: gpu.Buffer.UsageFlags, + mapped_at_creation: bool, + ) Self { + std.log.info("buffer init: {} {}", .{ max_attribs, T }); + const size = max_attribs * attrib_size; + const descriptor = gpu.Buffer.Descriptor{ + .size = size, + .usage = usage, + .mapped_at_creation = gpu.Bool32.from(mapped_at_creation), + }; + const data = device.createBuffer(&descriptor); + return .{ + .data = data, + .size = size, + .offset = 0, + .attrib_size = attrib_size, + }; + } - pub fn init_instance(n_instances: u32, device: *gpu.Device) Buffer { - std.log.info("buffer init: {} instances", .{n_instances}); - const attrib_size = @sizeOf(InstanceData); - const size = n_instances * attrib_size; - const descriptor = gpu.Buffer.Descriptor{ - .size = size, - .usage = .{ .storage = true, .copy_dst = true }, - .mapped_at_creation = .false, - }; - const data = device.createBuffer(&descriptor); - return .{ - .data = data, - .size = size, - .offset = 0, - .attrib_size = attrib_size, - }; - } + pub fn alloc(self: *Self, n_attribs: u32) !u32 { + const size = n_attribs * attrib_size; + const offset_start = self.offset; + const offset_after = self.offset + size; + std.log.info("buffer alloc: {} {}", .{ n_attribs, T }); + if (offset_after > self.size) { + std.log.err("buffer out of memory", .{}); + return error.BufferEmpty; + } + self.offset = offset_after; + return offset_start / attrib_size; + } - pub fn alloc(self: *Buffer, n_elements: u32) !u32 { - const size = n_elements * self.attrib_size; - const offset_start = self.offset; - const offset_after = self.offset + size; - std.log.info("buffer alloc: {} elements", .{n_elements}); - if (offset_after > self.size) { - std.log.err("buffer out of memory", .{}); - return error.BufferEmpty; + pub fn write( + self: *Self, + data_slice: anytype, + queue: *gpu.Queue, + offset: u32, + ) void { + queue.writeBuffer(self.data, offset * attrib_size, data_slice); } - self.offset = offset_after; - return offset_start; - } - pub fn write( - self: *Buffer, - data_slice: anytype, - queue: *gpu.Queue, - offset: u32, - ) void { - queue.writeBuffer(self.data, offset, data_slice); - } + pub fn allocWrite( + self: *Self, + data_slice: anytype, + queue: *gpu.Queue, + ) !u32 { + const offset = try self.alloc(@intCast(data_slice.len)); + queue.writeBuffer(self.data, offset * attrib_size, data_slice); + return offset; + } - pub fn allocWrite(self: *Buffer, data_slice: anytype, queue: *gpu.Queue) !u32 { - const offset = try self.alloc(@intCast(data_slice.len)); - queue.writeBuffer(self.data, offset, data_slice); - return offset; - } -}; + pub fn count(self: *const Self) u32 { + return self.offset / attrib_size; + } + }; +} pub const MeshBufferOptions = struct { device: *gpu.Device, @@ -422,19 +384,25 @@ pub const MeshOptions = struct { data: *const Mesh, }; +pub const ClusterMeshOptions = struct { + scale: f32 = 1.0, + offset: f32x3 = .{ 0, 0, 0 }, + data: *const ClusterMesh, +}; + pub const MeshBuffer = struct { device: *gpu.Device, queue: *gpu.Queue, - positions: Buffer, - normals: Buffer, - tex_coords: Buffer, - tangents: Buffer, - bitangents: Buffer, - indices: Buffer, + positions: Buffer(f32x4), + normals: Buffer(f32x4), + tex_coords: Buffer(f32x4), + tangents: Buffer(f32x4), + bitangents: Buffer(f32x4), + indices: Buffer(u32), - instances: Buffer, - indirect: Buffer, + instances: Buffer(InstanceData), + indirect: Buffer([5]u32), meshes: std.ArrayList(RenderMesh), @@ -446,14 +414,54 @@ pub const MeshBuffer = struct { return .{ .device = options.device, .queue = options.queue, - .positions = Buffer.init_vtx(n_vertices, device), - .normals = Buffer.init_vtx(n_vertices, device), - .tex_coords = Buffer.init_vtx(n_vertices, device), - .tangents = Buffer.init_vtx(n_vertices, device), - .bitangents = Buffer.init_vtx(n_vertices, device), - .indices = Buffer.init_idx(n_indices, device), - .instances = Buffer.init_instance(options.n_meshes, device), - .indirect = Buffer.init_indirect(options.n_meshes, device), + .positions = Buffer(f32x4).init( + n_vertices, + device, + .{ .vertex = true, .copy_dst = true }, + false, + ), + .normals = Buffer(f32x4).init( + n_vertices, + device, + .{ .vertex = true, .copy_dst = true }, + false, + ), + .tex_coords = Buffer(f32x4).init( + n_vertices, + device, + .{ .vertex = true, .copy_dst = true }, + false, + ), + .tangents = Buffer(f32x4).init( + n_vertices, + device, + .{ .vertex = true, .copy_dst = true }, + false, + ), + .bitangents = Buffer(f32x4).init( + n_vertices, + device, + .{ .vertex = true, .copy_dst = true }, + false, + ), + .indices = Buffer(u32).init( + n_indices, + device, + .{ .index = true, .copy_dst = true }, + false, + ), + .instances = Buffer(InstanceData).init( + options.n_meshes, + device, + .{ .storage = true, .copy_dst = true }, + false, + ), + .indirect = Buffer([5]u32).init( + options.n_meshes, + device, + .{ .indirect = true, .copy_dst = true }, + false, + ), .meshes = std.ArrayList(RenderMesh).init(options.allocator), }; } @@ -502,19 +510,67 @@ pub const MeshBuffer = struct { return &self.meshes.items[0]; } + pub fn initClusterMesh( + self: *MeshBuffer, + options: *const ClusterMeshOptions, + allocator: std.mem.Allocator, + ) !*RenderMesh { + const mesh_data = options.data; + const num_vertices = mesh_data.positions.len; + const num_indices = mesh_data.indices.len; + const num_clusters = mesh_data.descriptors.len; + std.log.info( + "init mesh with {} vertices, {} indices, {} clusters", + .{ num_vertices, num_indices, num_clusters }, + ); + + const offset_p = try self.positions.allocWrite(mesh_data.positions, self.queue); + _ = try self.normals.allocWrite(mesh_data.normals, self.queue); + _ = try self.tex_coords.allocWrite(mesh_data.tex_coords, self.queue); + _ = try self.tangents.allocWrite(mesh_data.tangents, self.queue); + _ = try self.bitangents.allocWrite(mesh_data.bitangents, self.queue); + + const offset_i = try self.indices.allocWrite(mesh_data.indices, self.queue); + + const mesh = RenderMesh{ + .num_vertices = num_vertices, + .num_indices = num_indices, + .scale = options.scale, + .offset = options.offset, + }; + + const instances = [_]InstanceData{ + mesh.instanceData(), + }; + _ = try self.instances.allocWrite(&instances, self.queue); + + const indirect_args = try allocator.alloc([5]u32, num_clusters); + defer allocator.free(indirect_args); + + var mesh_id: u32 = @intCast(self.meshes.items.len); + for (mesh_data.descriptors, 0..) |descriptor, i| { + indirect_args[i] = .{ + @intCast(descriptor.num_triangles * 3), + 1, + offset_i + descriptor.offset, + offset_p, + mesh_id, + }; + mesh_id += 1; + } + _ = try self.indirect.allocWrite(indirect_args, self.queue); + + try self.meshes.append(mesh); + return &self.meshes.items[0]; + } + pub fn updateMeshes(self: *MeshBuffer) void { - std.log.info("RUNNING!", .{}); for (self.meshes.items, 0..) |*mesh, i| { if (!mesh.dirty) { - std.log.info("{}", .{mesh.dirty}); continue; } - const offset = @sizeOf(InstanceData) * i; - const instances = [_]InstanceData{ - mesh.instanceData(), - }; - std.log.info("instance: {}", .{mesh.instanceData()}); - self.instances.write(&instances, self.queue, @intCast(offset)); + const instances = [_]InstanceData{mesh.instanceData()}; + self.instances.write(&instances, self.queue, @intCast(i)); mesh.dirty = false; } } @@ -590,30 +646,36 @@ pub fn main() !void { var mesh_buffer = MeshBuffer.init(.{ .device = app.device, .queue = app.queue, - .n_meshes = 10, + .n_meshes = 200, .n_vertices = 60000, .n_indices = 60000, .allocator = allocator, }); - const mesh = try load_obj.loadFile(.{ + var mesh = try load_obj.loadFile(.{ .allocator = allocator, .path = "Cottage_Clean/cottage.obj", }); - var render_mesh = try mesh_buffer.initMesh(&.{ - .data = &mesh, - .scale = 1, - .offset = .{ 0, -5, 15 }, - }); - _ = try load_obj.loadFile(.{ - .allocator = allocator, - .path = "teapot.obj", - //.scale = 0.05, - //.offset = .{ 0, -0.5, 0 }, - }); + //var bunny_mesh = try load_obj.loadFile(.{ + // .allocator = allocator, + // .path = "bunny-fixed.obj", + //}); + + const optimizeMesh = @import("forsyth_optimize.zig").optimize; + try optimizeMesh(allocator, &mesh, 32, .{}); + + const buildClusters = @import("meshlets.zig").buildClusters; + const cluster_mesh = try buildClusters(allocator, &mesh, 64, 126); + std.log.info("num clusters: {}", .{cluster_mesh.descriptors.len}); + + var render_mesh = try mesh_buffer.initClusterMesh(&.{ + .data = &cluster_mesh, + .scale = 1, + .offset = .{ 0, 0, 0 }, + }, allocator); - var rp = try MeshRenderPipeline.init(&app, &mesh_buffer, &textures, "shaders/mesh_pbr.wgsl"); + var rp = try MeshRenderPipeline.init(&app, &mesh_buffer, &textures, "shaders/mesh_clusters.wgsl"); const material = try Material.init( &textures, diff --git a/src/mesh_render_pipeline.zig b/src/mesh_render_pipeline.zig @@ -173,8 +173,9 @@ pub fn frame(ptr: *anyopaque, pass: *gpu.RenderPassEncoder) void { pass.setBindGroup(0, self.bind_group, null); pass.setIndexBuffer(self.mesh_buffer.indices.data, .uint32, 0, self.mesh_buffer.indices.size); - for (0..self.mesh_buffer.meshes.items.len) |mesh_idx| { - const offset = mesh_idx * 5 * @sizeOf(u32); + const num_indirect = self.mesh_buffer.indirect.count(); + for (0..num_indirect) |idx| { + const offset = idx * self.mesh_buffer.indirect.attrib_size; pass.drawIndexedIndirect(self.mesh_buffer.indirect.data, offset); } } diff --git a/src/meshlets.zig b/src/meshlets.zig @@ -0,0 +1,120 @@ +const std = @import("std"); +const assert = std.debug.assert; + +const Mesh = @import("mesh.zig"); + +const f32x3 = @Vector(3, f32); +const f32x4 = @Vector(4, f32); + +pub const ClusterDescriptor = struct { + offset: u32, + num_vertices: u32, + num_triangles: u32, +}; + +pub const ClusterMesh = struct { + positions: []f32x4, + normals: []f32x3, + tex_coords: []f32x3, + tangents: []f32x3, + bitangents: []f32x3, + indices: []u32, + descriptors: []ClusterDescriptor, +}; + +pub fn buildClusters( + allocator: std.mem.Allocator, + mesh: *const Mesh, + max_vertices: u32, + max_triangles: u32, +) !ClusterMesh { + // Only triangle meshes are supported + assert(mesh.indices.len % 3 == 0); + // Must have at least one triangle + assert(max_vertices >= 3); + assert(max_triangles >= 1); + + // 'unused' tracks which vertices have not been used + // unused[vtx_idx] = 1 -> not used + // unused[vtx_idx] = 0 -> used + const unused = try allocator.alloc(u8, mesh.positions.len); + defer allocator.free(unused); + @memset(unused, 1); + + const indices = try allocator.alloc(u32, mesh.indices.len); + var descriptors = std.ArrayList(ClusterDescriptor).init(allocator); + defer descriptors.deinit(); + + var num_vertices: u32 = 0; + var num_triangles: u32 = 0; + var cluster_offset: u32 = 0; + + for (0..mesh.indices.len / 3) |i| { + const mesh_offset = i * 3; + const a = mesh.indices[mesh_offset]; + const b = mesh.indices[mesh_offset + 1]; + const c = mesh.indices[mesh_offset + 2]; + + // Triangles must be non-degenerate + assert(a != b and a != c and b != c); + // Vertices must be contained in the mesh + const a_valid = a < mesh.positions.len; + const b_valid = b < mesh.positions.len; + const c_valid = c < mesh.positions.len; + assert(a_valid and b_valid and c_valid); + + const num_extra = unused[a] + unused[b] + unused[c]; + const too_many_vertices = num_vertices + num_extra >= max_vertices; + const too_many_triangles = num_triangles >= max_triangles; + if (too_many_vertices or too_many_triangles) { + // Save cluster info + try descriptors.append(.{ + .offset = cluster_offset, + .num_vertices = num_vertices, + .num_triangles = num_triangles, + }); + + // Reset state for next cluster + @memset(unused, 1); + cluster_offset += num_triangles * 3; + num_vertices = 0; + num_triangles = 0; + } + + // Mark vertices as used + unused[a] = 0; + unused[b] = 0; + unused[c] = 0; + + // Add triangle to cluster + const offset = cluster_offset + num_triangles * 3; + indices[offset] = a; + indices[offset + 1] = b; + indices[offset + 2] = c; + + num_vertices += num_extra; + num_triangles += 1; + } + + // Add final cluster + if (num_triangles > 0) { + try descriptors.append(.{ + .offset = cluster_offset, + .num_vertices = num_vertices, + .num_triangles = num_triangles, + }); + } + + // Total size of mesh should not change + assert(mesh.indices.len == cluster_offset + num_triangles * 3); + + return .{ + .positions = mesh.positions, + .normals = mesh.normals, + .tex_coords = mesh.tex_coords, + .tangents = mesh.tangents, + .bitangents = mesh.bitangents, + .indices = indices, + .descriptors = try descriptors.toOwnedSlice(), + }; +} diff --git a/src/meshlets_adjacency.zig b/src/meshlets_adjacency.zig @@ -0,0 +1,446 @@ +const std = @import("std"); +const assert = std.debug.assert; + +const HalfEdgeMap = @import("half_edge_map.zig"); +const Mesh = @import("mesh.zig"); + +const meshlets = @import("meshlets.zig"); +const ClusterDescriptor = meshlets.ClusterDescriptor; +const ClusterMesh = meshlets.ClusterMesh; + +pub fn buildClusters( + allocator: std.mem.Allocator, + mesh: *const Mesh, + max_vertices: u32, + max_triangles: u32, +) !ClusterMesh { + // Only triangle meshes are supported + assert(mesh.indices.len % 3 == 0); + // Must have at least one triangle + assert(max_vertices >= 3); + assert(max_triangles >= 1); + + // 'unused' tracks which vertices have not been used + // unused[vtx_idx] = 1 -> not used + // unused[vtx_idx] = 0 -> used + const unused = try allocator.alloc(u8, mesh.positions.len); + defer allocator.free(unused); + @memset(unused, 1); + + const indices = try allocator.alloc(u32, mesh.indices.len); + var descriptors = std.ArrayList(ClusterDescriptor).init(allocator); + defer descriptors.deinit(); + + var num_vertices: u32 = 0; + var num_triangles: u32 = 0; + var cluster_offset: u32 = 0; + + var map = try HalfEdgeMap.init(mesh.indices, allocator); + var tri: [3]u32 = undefined; + var adjacent = std.ArrayList(u32).init(allocator); + const usage = try buildUsageMap(mesh, allocator); + + for (0..mesh.indices.len / 3) |_| { + // Fetch next triangle + if (adjacent.items.len > 0) { + var min_extra: u32 = std.math.maxInt(u32); + var min_score: u32 = std.math.maxInt(u32); + var idx_best: u32 = 0; + const num_adjacent = adjacent.items.len / 3; + for (0..num_adjacent) |triangle_idx| { + const i = @as(u32, @intCast(triangle_idx)) * 3; + const adj = [3]u32{ + adjacent.items[i], + adjacent.items[i + 1], + adjacent.items[i + 2], + }; + + var extra: u32 = 1; + if (inSlice(u32, indices[cluster_offset..], adj[2])) { + extra = 0; + } + + if ((usage[adj[0]] == 1) or (usage[adj[1]] == 1) or (usage[adj[2]] == 1)) { + extra = 0; + } + + if (extra > min_extra) { + continue; + } + const score = usage[adj[0]] + usage[adj[1]] + usage[adj[2]] - 3; + + if ((extra < min_extra) or (score < min_score)) { + min_extra = extra; + min_score = score; + tri = adj; + idx_best = i; + } + } + tri = [3]u32{ + adjacent.items[idx_best], + adjacent.items[idx_best + 1], + adjacent.items[idx_best + 2], + }; + _ = adjacent.orderedRemove(idx_best); + _ = adjacent.orderedRemove(idx_best); + _ = adjacent.orderedRemove(idx_best); + } else { + tri = map.pop(); + } + const a = tri[0]; + const b = tri[1]; + const c = tri[2]; + usage[a] -= 1; + usage[b] -= 1; + usage[c] -= 1; + + // Triangles must be non-degenerate + assert(a != b and a != c and b != c); + + // Vertices must be contained in the mesh + const a_valid = a < mesh.positions.len; + const b_valid = b < mesh.positions.len; + const c_valid = c < mesh.positions.len; + assert(a_valid and b_valid and c_valid); + + // Verify cluster has space + const num_extra = unused[a] + unused[b] + unused[c]; + const too_many_vertices = num_vertices + num_extra >= max_vertices; + const too_many_triangles = num_triangles >= max_triangles; + if (too_many_vertices or too_many_triangles) { + // Save cluster info + try descriptors.append(.{ + .offset = cluster_offset, + .num_vertices = num_vertices, + .num_triangles = num_triangles, + }); + + // Reset state for next cluster + @memset(unused, 1); + cluster_offset += num_triangles * 3; + num_vertices = 0; + num_triangles = 0; + + // Add all adjacent triangles back to the half-edge map + for (0..adjacent.items.len / 3) |i| { + const idx = i * 3; + const adj = [3]u32{ + adjacent.items[idx], + adjacent.items[idx + 1], + adjacent.items[idx + 2], + }; + map.insert(adj); + } + adjacent.clearAndFree(); + } + + // Mark vertices as used + unused[a] = 0; + unused[b] = 0; + unused[c] = 0; + + // Add triangle to cluster + const offset = cluster_offset + num_triangles * 3; + indices[offset] = a; + indices[offset + 1] = b; + indices[offset + 2] = c; + + num_vertices += num_extra; + num_triangles += 1; + + // Get newly adjacent triangles + for (0..3) |side| { + const triangle = map.adjacentTo(tri, @intCast(side)) orelse { + continue; + }; + try adjacent.appendSlice(&triangle); + std.debug.assert(map.remove(triangle)); + } + + // If we're out of adjacent triangles, this cluster is done + const no_more_adjacent = adjacent.items.len <= 0; + if (no_more_adjacent) { + // Save cluster info + try descriptors.append(.{ + .offset = cluster_offset, + .num_vertices = num_vertices, + .num_triangles = num_triangles, + }); + + // Reset state for next cluster + @memset(unused, 1); + cluster_offset += num_triangles * 3; + num_vertices = 0; + num_triangles = 0; + continue; + } + } + + // Add final cluster + if (num_triangles > 0) { + try descriptors.append(.{ + .offset = cluster_offset, + .num_vertices = num_vertices, + .num_triangles = num_triangles, + }); + } + + // Total size of mesh should not change + assert(mesh.indices.len == cluster_offset + num_triangles * 3); + + return .{ + .positions = mesh.positions, + .normals = mesh.normals, + .tex_coords = mesh.tex_coords, + .tangents = mesh.tangents, + .bitangents = mesh.bitangents, + .indices = indices, + .descriptors = try descriptors.toOwnedSlice(), + }; +} + +pub fn buildUsageMap( + mesh: *const Mesh, + allocator: std.mem.Allocator, +) ![]u32 { + const num_vertices = mesh.positions.len; + const map = try allocator.alloc(u32, num_vertices); + @memset(map, 0); + for (mesh.indices) |idx| { + map[idx] += 1; + } + return map; +} + +fn inSlice(comptime T: type, slice: []const T, item: T) bool { + for (slice) |iter_item| { + if (iter_item == item) { + return true; + } + } + return false; +} + +//const ClusterOptions = struct { +// max_verts: u32, +// max_prims: u32, +//}; +// +//const ClusterDescriptor = struct { +// offset: u32, +// num_prims: u32, +// num_verts: u32, +//}; +// +//pub fn buildClusters(mesh: *const Mesh, allocator: std.mem.Allocator) !void { +// const max_prims = 126; +// var offset: u32 = 0; +// +// var map = try HalfEdgeMap.init(mesh.indices, allocator); +// var tri: ?[3]u32 = null; +// +// var descriptors = std.ArrayList(ClusterDescriptor).init(allocator); +// // this holds offsets and counts for all clusters +// +// const indices: []u32 = try allocator.alloc(u32, mesh.indices.len); +// // this holds the actual index data for all clusters +// // should be free'd by caller (eventually) +// +// while (map.count() > 0) { +// if (tri == null) { +// tri = map.pop(); +// } +// const cluster_end = @min(offset + max_prims * 3, mesh.indices.len); +// const cluster = indices[offset..cluster_end]; +// +// //result = try buildClusterFewest(allocator, &map, usage, tri.?); +// const result = try buildClusterSimple( +// allocator, +// &map, +// tri.?, +// cluster, +// ); +// offset += result.descriptor.num_prims * 3; +// try descriptors.append(result.descriptor); +// tri = result.next; +// } +// std.log.info("offset: {}", .{offset}); +// std.debug.assert(offset == indices.len); +// std.log.info("num clusters: {}", .{descriptors.items.len}); +//} +// +//const ClusterResult = struct { +// descriptor: ClusterDescriptor, +// next: ?[3]u32, +//}; +// +//pub const Triangle = struct { +// a: u32, +// b: u32, +// c: u32, +// +// pub fn permute(self: *const Triangle) Triangle { +// return .{ .a = self.b, .b = self.c, .c = self.a }; +// } +// +// pub fn reverse(self: *const Triangle) Triangle { +// return .{ .a = self.c, .b = self.b, .c = self.a }; +// } +//}; +// +//fn buildClusterSimple( +// allocator: std.mem.Allocator, +// map: *HalfEdgeMap, +// initial_triangle: [3]u32, +// cluster: []u32, +//) !ClusterResult { +// const max_prims = 126; +// +// var vertices = std.AutoHashMap(u32, void).init(allocator); +// var adjacent = std.ArrayList(Triangle).init(allocator); +// var tri: ?Triangle = .{ +// .a = initial_triangle[0], +// .b = initial_triangle[1], +// .c = initial_triangle[2], +// }; +// +// var idx: u32 = 0; +// while (idx < max_prims * 3) { +// // Add the most recent triangle to the cluster +// cluster[idx] = tri.?.a; +// cluster[idx + 1] = tri.?.b; +// cluster[idx + 2] = tri.?.c; +// //@memcpy(cluster[idx .. idx + 3], &tri.?); +// try vertices.put(tri.?.a, {}); +// try vertices.put(tri.?.b, {}); +// try vertices.put(tri.?.c, {}); +// idx += 3; +// +// // Get newly adjacent triangles +// for (0..3) |side| { +// const triangle = map.adjacentTo(tri.?, @intCast(side)) orelse { +// continue; +// }; +// try adjacent.append(triangle); +// std.debug.assert(map.remove(triangle)); +// } +// if (adjacent.items.len <= 0) { +// tri = null; +// break; +// } +// +// // Oldest adjacent triangle is up next +// tri = [3]u32{ +// adjacent.items[0].a, +// adjacent.items[0].b, +// adjacent.items[0].c, +// }; +// _ = adjacent.orderedRemove(0); +// } +// +// // Add all adjacent triangles back to the half-edge map +// for (adjacent.items) |adj_tri| { +// //const adj = [3]u32{ adj_tri.a, adj_tri.b, adj_tri.c }; +// map.insert(adj_tri); +// } +// if (tri != null) { +// const triangle = .{ +// .a = tri.?[0], +// .b = tri.?[1], +// .c = tri.?[2], +// }; +// map.insert(triangle); +// } +// +// return .{ +// .descriptor = .{ +// .offset = undefined, +// .num_prims = idx / 3, +// .num_verts = vertices.count(), +// }, +// .next = null, //tri, +// }; +//} +// +//fn buildClusterFewest( +// allocator: std.mem.Allocator, +// map: *HalfEdgeMap, +// usage: []u32, +// initial_triangle: [3]u32, +//) !ClusterResult { +// var cluster = std.ArrayList(u32).init(allocator); +// var adjacent = std.ArrayList(u32).init(allocator); +// var tri: ?[3]u32 = initial_triangle; +// while (cluster.items.len < 126 * 3) { +// // Add the most recent triangle to the cluster +// try cluster.appendSlice(&tri.?); +// +// // Get newly adjacent triangles +// for (0..3) |side| { +// const triangle = map.adjacentTo(tri.?, @intCast(side)) orelse { +// continue; +// }; +// try adjacent.appendSlice(&triangle); +// std.debug.assert(map.remove(triangle)); +// } +// if (adjacent.items.len <= 0) { +// tri = null; +// break; +// } +// +// var min_extra: u32 = std.math.maxInt(u32); +// var min_score: u32 = std.math.maxInt(u32); +// var idx_best: u32 = 0; +// const num_adjacent = adjacent.items.len / 3; +// for (0..num_adjacent) |triangle_idx| { +// const i = @as(u32, @intCast(triangle_idx)) * 3; +// const adj = [3]u32{ +// adjacent.items[i], +// adjacent.items[i + 1], +// adjacent.items[i + 2], +// }; +// +// var extra: u32 = 1; +// if (inSlice(u32, cluster.items, adj[2])) { +// extra = 0; +// } +// +// if ((usage[adj[0]] == 1) or (usage[adj[1]] == 1) or (usage[adj[2]] == 1)) { +// extra = 0; +// } +// +// if (extra > min_extra) { +// continue; +// } +// const score = usage[adj[0]] + usage[adj[1]] + usage[adj[2]] - 3; +// +// if ((extra < min_extra) or (score < min_score)) { +// min_extra = extra; +// min_score = score; +// tri = adj; +// idx_best = i; +// } +// } +// _ = adjacent.orderedRemove(idx_best); +// _ = adjacent.orderedRemove(idx_best); +// _ = adjacent.orderedRemove(idx_best); +// usage[tri.?[0]] -= 1; +// usage[tri.?[1]] -= 1; +// usage[tri.?[2]] -= 1; +// } +// const num_adjacent = adjacent.items.len / 3; +// for (0..num_adjacent) |triangle_idx| { +// const i = @as(u32, @intCast(triangle_idx)) * 3; +// const adj = [3]u32{ +// adjacent.items[i], +// adjacent.items[i + 1], +// adjacent.items[i + 2], +// }; +// map.insert(adj); +// } +// std.log.info("cluster size: {}", .{cluster.items.len / 3}); +// return .{ +// .cluster = cluster, +// .next = tri, +// }; +//} diff --git a/src/shaders/mesh_clusters.wgsl b/src/shaders/mesh_clusters.wgsl @@ -0,0 +1,204 @@ +struct VertexInput { + @location(0) position: vec4<f32>, + @location(1) normal: vec3<f32>, + @location(2) tex_coord: vec3<f32>, + @location(3) tangent: vec3<f32>, + @location(4) bitangent: vec3<f32>, +}; + +struct VertexOutput { + @builtin(position) clip_position: vec4<f32>, + @location(0) tex_coord: vec3<f32>, + @location(1) pt_fragment: vec3<f32>, + @location(2) pt_light: vec3<f32>, + @location(3) pt_camera: vec3<f32>, + @location(4) @interpolate(flat) instance_id: u32, +}; + +@group(0) @binding(0) +var<storage, read> instances: array<Instance>; +struct Instance { + model_matrix: mat4x4<f32>, + material: Material, +}; +struct Material { + albedo: u32, + normal: u32, + roughness: u32, + metalness: u32, +}; + +@group(0) @binding(1) +var<uniform> uniform_data: UniformData; +struct UniformData { + pw_camera: vec3<f32>, + view_matrix: mat4x4<f32>, + proj_matrix: mat4x4<f32>, +}; + +@vertex fn vertex( + in: VertexInput, + @builtin(instance_index) idx: u32, +) -> VertexOutput { + let model_matrix = instances[0].model_matrix; + let camera_matrix = uniform_data.proj_matrix * uniform_data.view_matrix; + + let t = normalize((model_matrix * vec4(in.tangent, 0.0)).xyz); + let b = normalize((model_matrix * vec4(in.bitangent, 0.0)).xyz); + let n = normalize((model_matrix * vec4(in.normal, 0.0)).xyz); + let tangent_matrix = mat3x3<f32>(t, b, n); + + let world_position = model_matrix * in.position; + let tangent_position = tangent_matrix * world_position.xyz; + + let light_position = vec3<f32>(20, 80, 20); + let tangent_light_position = tangent_matrix * light_position; + + let pt_camera = tangent_matrix * uniform_data.pw_camera; + + + return VertexOutput( + camera_matrix * world_position, + in.tex_coord, + tangent_position, + tangent_light_position, + pt_camera, + idx, + ); +} + + +@group(0) @binding(2) var material_sampler: sampler; +@group(0) @binding(3) var material_texture: texture_2d_array<f32>; + +fn diffuse_lambert(albedo: vec3<f32>) -> vec3<f32> { + return albedo * 0.31830988618; +} + +fn distribution_trowbridge_reitz_ggx( + n_dot_h: f32, + alpha: f32, +) -> f32 { + let alpha_squared = alpha * alpha; + let n_dot_h_squared = n_dot_h * n_dot_h; + let denom = n_dot_h_squared * (alpha_squared - 1) + 1; + let denom_squared = denom * denom; + return alpha_squared / (3.14159265359 * denom_squared); +} + +fn geometry_schlick_ggx( + n_dot_x: f32, + alpha: f32, +) -> f32 { + let k = 0.5 * alpha; + let denom = n_dot_x * (1.0 - k) + k; + return n_dot_x / denom; +} + +fn geometry_smith( + n_dot_l: f32, + n_dot_v: f32, + alpha: f32, +) -> f32 { + let g1 = geometry_schlick_ggx(n_dot_l, alpha); + let g2 = geometry_schlick_ggx(n_dot_v, alpha); + return g1 * g2; +} + +fn fresnel_schlick( + f0: vec3<f32>, + v_dot_h: f32, +) -> vec3<f32> { + return f0 + (1.0 - f0) * pow(1.0 - v_dot_h, 5.0); +} + +fn cook_torrance( + n_dot_v: f32, + n_dot_l: f32, + n_dot_h: f32, + roughness: f32, +) -> f32 { + let alpha = roughness * roughness; + let distribution = distribution_trowbridge_reitz_ggx( + n_dot_h, + alpha, + ); + let geometry = geometry_smith( + n_dot_l, + n_dot_v, + alpha, + ); + let denom = 4.0 * n_dot_v * n_dot_l + 0.000001; + return (distribution * geometry) / denom; +} + +fn lighting( + normal: vec3<f32>, + p_fragment: vec3<f32>, + p_camera: vec3<f32>, + p_light: vec3<f32>, + c_light: vec3<f32>, + albedo: vec3<f32>, + roughness: f32, + metalness: f32, +) -> vec3<f32> { + let v = normalize(p_camera - p_fragment); + let l = normalize(p_light - p_fragment); + let h = normalize(v + l); + + let n_dot_v = max(dot(normal, v), 0.0); + let n_dot_l = max(dot(normal, l), 0.0); + let n_dot_h = max(dot(normal, h), 0.0); + let v_dot_h = max(dot(v, h), 0.0); + + let f0 = mix(vec3<f32>(0.04), albedo, metalness); + let k_s = fresnel_schlick(f0, v_dot_h); + let k_d = 1.0 - k_s; + + let diffuse = k_d * diffuse_lambert(albedo); + let specular = k_s * cook_torrance(n_dot_v, n_dot_l, n_dot_h, roughness); + let brdf = (diffuse + specular) * n_dot_l; + return brdf * c_light; +} + +@fragment fn fragment(in: VertexOutput) -> @location(0) vec4<f32> { + let material = instances[in.instance_id].material; + let uv = vec2<f32>(in.tex_coord.x, 1 - in.tex_coord.y); + var albedo = textureSample(material_texture, material_sampler, uv, material.albedo).rgb; + albedo = pow(albedo, vec3<f32>(2.2)); + var normal = textureSample(material_texture, material_sampler, uv, material.normal).rgb; + normal = normalize(normal * 2.0 - 1.0); + var roughness = textureSample(material_texture, material_sampler, uv, material.roughness).r; + var metalness = textureSample(material_texture, material_sampler, uv, material.metalness).r; + + let light_color = normalize(vec3<f32>(23.47, 21.31, 20.79)); + //let light_color = normalize(vec3<f32>(0.9, 0.9, 0.9)); + var c = lighting( + normal, + in.pt_fragment, + in.pt_camera, + in.pt_light, + light_color, + albedo.rgb, + roughness, + metalness, + ); + + let ambient = vec3<f32>(0.03) * albedo; + c += ambient; + + c = c / (c + 1.0); + c = pow(c, vec3<f32>(1.0/2.2)); + + let colors = array<vec3<f32>, 6>( + vec3<f32>(1, 1, 0), + vec3<f32>(0, 1, 1), + vec3<f32>(1, 0, 1), + vec3<f32>(1, 0, 0), + vec3<f32>(0, 1, 0), + vec3<f32>(0, 0, 1), + ); + let color = colors[in.instance_id % 6]; + + return vec4<f32>(color, 1.0); +}