Add meshlet generation algorithms - render-zig - A 3D rendering engine written in Zig

commit e2bc0ba7e6a2b82a3e3f498cb701e5ceca0f4050
parent 1494036a2c88de72e059e729880321db83593b55
Author: Christian Ermann <christianermann@gmail.com>
Date:   Sat, 20 Jul 2024 10:45:21 -0400

Add meshlet generation algorithms

Diffstat:
A src/forsyth_optimize.zig  | 278 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/half_edge_map.zig  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/main.zig  | 342 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M src/mesh_render_pipeline.zig  | 5 +++--
A src/meshlets.zig  | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/meshlets_adjacency.zig  | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/shaders/mesh_clusters.wgsl  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

7 files changed, 1320 insertions(+), 142 deletions(-)
diff --git a/src/forsyth_optimize.zig b/src/forsyth_optimize.zig
@@ -0,0 +1,278 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+const Mesh = @import("mesh.zig");
+
+const ScoreParameters = struct {
+    last_tri: f32 = 0.75,
+    cache_decay_power: f32 = 1.5,
+    valence_boost_scale: f32 = 2.0,
+    valence_boost_power: f32 = 0.5,
+};
+
+const Vertex = struct {
+    cache_idx: ?u32 = undefined,
+    score: f32 = 0,
+    num_triangles: u32 = 0,
+    max_triangles: u32 = 0,
+    triangles: []u32 = undefined,
+
+    fn scoreFn(
+        self: *const Vertex,
+        max_cache_idx: u32,
+        params: ScoreParameters,
+    ) f32 {
+        // Vertex will never be used again
+        const num_active = self.max_triangles - self.num_triangles;
+        if (num_active <= 0) {
+            return -1.0;
+        }
+
+        // Boost score for lonely vertices so they go away fast
+        const valence_boost = std.math.pow(
+            f32,
+            @floatFromInt(num_active),
+            -params.valence_boost_power,
+        );
+        const score = params.valence_boost_scale * valence_boost;
+
+        // No adjustments for vertices not in the cache yet
+        if (self.cache_idx == null) {
+            return score;
+        }
+
+        // Vertices from last triangle all get the same score
+        const cache_idx = self.cache_idx.?;
+        if (cache_idx < 3) {
+            return score + params.last_tri;
+        }
+
+        // For other vertices, score depends on cache position
+        assert(cache_idx < max_cache_idx);
+        const scale = 1.0 / @as(f32, @floatFromInt(max_cache_idx - 3));
+        const adjusted_idx: f32 = @floatFromInt(cache_idx - 3);
+        const cache_score = 1.0 - adjusted_idx * scale;
+        return score + std.math.pow(f32, cache_score, params.cache_decay_power);
+    }
+};
+
+const Triangle = struct {
+    added: bool = false,
+    score: f32 = 0,
+    vertices: [3]u32 = undefined,
+};
+
+fn LRUCache(comptime T: type) type {
+    return struct {
+        const Self = @This();
+        const List = std.DoublyLinkedList(T);
+
+        allocator: std.mem.Allocator,
+        list: List,
+        max_size: u32,
+
+        pub fn init(allocator: std.mem.Allocator, max_size: u32) Self {
+            return .{
+                .list = std.DoublyLinkedList(T){},
+                .allocator = allocator,
+                .max_size = max_size,
+            };
+        }
+
+        fn find(cache: *const Self, value: T) ?*List.Node {
+            var it = cache.list.first;
+            while (it) |node| : (it = node.next) {
+                if (node.data == value) {
+                    return node;
+                }
+            }
+            return null;
+        }
+
+        pub fn insert(cache: *Self, value: T) !?*List.Node {
+            var node = cache.find(value);
+            if (node != null) {
+                cache.list.remove(node.?);
+            } else {
+                node = try cache.allocator.create(List.Node);
+                node.?.data = value;
+            }
+            cache.list.prepend(node.?);
+
+            if (cache.list.len > cache.max_size) {
+                return cache.list.pop();
+            }
+            return null;
+        }
+
+        pub fn first(cache: *Self) ?*List.Node {
+            return cache.list.first;
+        }
+    };
+}
+
+pub fn optimize(
+    allocator: std.mem.Allocator,
+    mesh: *Mesh,
+    max_cache_idx: u32,
+    params: ScoreParameters,
+) !void {
+    // Initialize vertex data structure
+    const vertices = try allocator.alloc(Vertex, mesh.positions.len);
+    @memset(vertices, Vertex{});
+    for (0..mesh.indices.len / 3) |i| {
+        const mesh_offset = i * 3;
+        const a = mesh.indices[mesh_offset];
+        const b = mesh.indices[mesh_offset + 1];
+        const c = mesh.indices[mesh_offset + 2];
+
+        vertices[a].max_triangles += 1;
+        vertices[b].max_triangles += 1;
+        vertices[c].max_triangles += 1;
+    }
+    for (vertices) |*vertex| {
+        vertex.triangles = try allocator.alloc(u32, vertex.max_triangles);
+        vertex.max_triangles = 0;
+    }
+    for (0..mesh.indices.len / 3) |i| {
+        const mesh_offset = i * 3;
+        const a = mesh.indices[mesh_offset];
+        const b = mesh.indices[mesh_offset + 1];
+        const c = mesh.indices[mesh_offset + 2];
+
+        const idx: u32 = @intCast(i);
+        vertices[a].triangles[vertices[a].max_triangles] = idx;
+        vertices[b].triangles[vertices[b].max_triangles] = idx;
+        vertices[c].triangles[vertices[c].max_triangles] = idx;
+
+        vertices[a].max_triangles += 1;
+        vertices[b].max_triangles += 1;
+        vertices[c].max_triangles += 1;
+    }
+    for (vertices) |*vertex| {
+        vertex.score = vertex.scoreFn(max_cache_idx, params);
+    }
+
+    // Initialize triangle data structure
+    const triangles = try allocator.alloc(Triangle, mesh.indices.len / 3);
+    @memset(triangles, Triangle{});
+    for (triangles, 0..) |*tri, i| {
+        const mesh_offset = i * 3;
+        const a = mesh.indices[mesh_offset];
+        const b = mesh.indices[mesh_offset + 1];
+        const c = mesh.indices[mesh_offset + 2];
+        tri.vertices[0] = a;
+        tri.vertices[1] = b;
+        tri.vertices[2] = c;
+        tri.score = vertices[a].score + vertices[b].score + vertices[c].score;
+    }
+
+    // Store new indices here
+    const indices = try allocator.alloc(u32, mesh.indices.len);
+
+    // Run algorithm
+    var cache = LRUCache(u32).init(allocator, max_cache_idx);
+    for (0..triangles.len) |i| {
+        var tri = bestTriangleFromCache(vertices, triangles, &cache);
+        if (tri == null) {
+            tri = bestTriangleOverall(triangles);
+        }
+        assert(tri != null);
+
+        const offset = i * 3;
+        indices[offset] = triangles[tri.?].vertices[0];
+        indices[offset + 1] = triangles[tri.?].vertices[1];
+        indices[offset + 2] = triangles[tri.?].vertices[2];
+
+        try addTriangleToCache(
+            allocator,
+            tri.?,
+            vertices,
+            triangles,
+            &cache,
+            params,
+        );
+    }
+
+    allocator.free(mesh.indices);
+    mesh.indices = indices;
+}
+
+fn addTriangleToCache(
+    allocator: std.mem.Allocator,
+    triangle_idx: u32,
+    vertices: []Vertex,
+    triangles: []Triangle,
+    cache: *LRUCache(u32),
+    params: ScoreParameters,
+) !void {
+    assert(!triangles[triangle_idx].added);
+
+    // Insert triangle
+    triangles[triangle_idx].added = true;
+    for (triangles[triangle_idx].vertices) |idx| {
+        const removed = try cache.insert(idx);
+        if (removed != null) {
+            const vtx = &vertices[removed.?.data];
+            vtx.cache_idx = null;
+            vtx.score = vtx.scoreFn(cache.max_size, params);
+            allocator.destroy(removed.?);
+        }
+        vertices[idx].num_triangles += 1;
+    }
+
+    // Update all other cache entries
+    var it = cache.first();
+    var cache_idx: u32 = 0;
+    while (it) |node| : (it = node.next) {
+        const vtx = &vertices[node.data];
+        if (cache_idx < cache.max_size) {
+            vtx.cache_idx = cache_idx;
+        } else {
+            vtx.cache_idx = null;
+        }
+        cache_idx += 1;
+        vtx.score = vtx.scoreFn(cache.max_size, params);
+    }
+}
+
+fn bestTriangleOverall(triangles: []Triangle) ?u32 {
+    var best_triangle: ?u32 = null;
+    var best_score: f32 = 0.0;
+    for (triangles, 0..) |tri, i| {
+        if (!tri.added and tri.score > best_score) {
+            best_triangle = @intCast(i);
+            best_score = tri.score;
+        }
+    }
+    return best_triangle;
+}
+
+fn bestTriangleFromCache(
+    vertices: []Vertex,
+    triangles: []Triangle,
+    cache: *LRUCache(u32),
+) ?u32 {
+    var best_score: f32 = 0.0;
+    var best_triangle: ?u32 = null;
+    var it = cache.first();
+    while (it) |node| : (it = node.next) {
+        const vtx = &vertices[node.data];
+        for (vtx.triangles) |idx| {
+            const tri = &triangles[idx];
+            if (tri.added) {
+                continue;
+            }
+            const a = vertices[tri.vertices[0]];
+            const b = vertices[tri.vertices[1]];
+            const c = vertices[tri.vertices[2]];
+            tri.score = a.score + b.score + c.score;
+
+            if (tri.score > best_score) {
+                best_triangle = idx;
+                best_score = tri.score;
+            }
+        }
+    }
+    return best_triangle;
+}
diff --git a/src/half_edge_map.zig b/src/half_edge_map.zig
@@ -0,0 +1,67 @@
+const std = @import("std");
+
+const HalfEdgeMap = @This();
+
+const HalfEdge = struct {
+    head: u32,
+    tail: u32,
+};
+
+map: std.AutoArrayHashMapUnmanaged(HalfEdge, u32),
+
+pub fn init(indices: []u32, allocator: std.mem.Allocator) !HalfEdgeMap {
+    const num_indices: u32 = @intCast(indices.len);
+    const num_triangles = num_indices / 3;
+
+    var map = std.AutoArrayHashMapUnmanaged(HalfEdge, u32){};
+    try map.ensureTotalCapacity(allocator, num_indices);
+
+    for (0..num_triangles) |triangle_idx| {
+        const i = @as(u32, @intCast(triangle_idx)) * 3;
+        const a = indices[i];
+        const b = indices[i + 1];
+        const c = indices[i + 2];
+        map.putAssumeCapacityNoClobber(.{ .head = a, .tail = b }, c);
+        map.putAssumeCapacityNoClobber(.{ .head = b, .tail = c }, a);
+        map.putAssumeCapacityNoClobber(.{ .head = c, .tail = a }, b);
+    }
+    return .{
+        .map = map,
+    };
+}
+
+pub fn insert(self: *HalfEdgeMap, tri: [3]u32) void {
+    const a = tri[0];
+    const b = tri[1];
+    const c = tri[2];
+    self.map.putAssumeCapacityNoClobber(.{ .head = a, .tail = b }, c);
+    self.map.putAssumeCapacityNoClobber(.{ .head = b, .tail = c }, a);
+    self.map.putAssumeCapacityNoClobber(.{ .head = c, .tail = a }, b);
+}
+
+pub fn remove(self: *HalfEdgeMap, tri: [3]u32) bool {
+    const success0 = self.map.swapRemove(.{ .head = tri[0], .tail = tri[1] });
+    const success1 = self.map.swapRemove(.{ .head = tri[1], .tail = tri[2] });
+    const success2 = self.map.swapRemove(.{ .head = tri[2], .tail = tri[0] });
+    return success0 and success1 and success2;
+}
+
+pub fn adjacentTo(self: *HalfEdgeMap, tri: [3]u32, side: u32) ?[3]u32 {
+    const a = tri[(side + 1) % 3];
+    const b = tri[side];
+    const c = self.map.get(.{ .head = a, .tail = b }) orelse {
+        return null;
+    };
+    return [3]u32{ a, b, c };
+}
+
+pub fn count(self: *const HalfEdgeMap) usize {
+    return self.map.count();
+}
+
+pub fn pop(self: *HalfEdgeMap) [3]u32 {
+    const kv = self.map.pop();
+    const tri = [3]u32{ kv.key.head, kv.key.tail, kv.value };
+    _ = self.remove(tri);
+    return tri;
+}
diff --git a/src/main.zig b/src/main.zig
@@ -15,6 +15,7 @@ const Mesh = @import("mesh.zig");
 
 const Camera = @import("camera.zig");
 const input = @import("input.zig");
+const ClusterMesh = @import("meshlets.zig").ClusterMesh;
 
 const load_obj = @import("load_obj.zig");
 
@@ -298,114 +299,75 @@ pub const UniformData = struct {
     proj_matrix: mat4,
 };
 
-pub const Buffer = struct {
-    data: *gpu.Buffer,
-    size: u32,
-    offset: u32,
-    attrib_size: u32,
-
-    pub fn init_vtx(n_vertices: u32, device: *gpu.Device) Buffer {
-        std.log.info("buffer init: {} vertices", .{n_vertices});
-        const attrib_size = @sizeOf(f32x4);
-        const size = n_vertices * attrib_size;
-        const descriptor = gpu.Buffer.Descriptor{
-            .size = size,
-            .usage = .{ .vertex = true, .copy_dst = true },
-            .mapped_at_creation = .false,
-        };
-        const data = device.createBuffer(&descriptor);
-
-        return .{
-            .data = data,
-            .size = size,
-            .offset = 0,
-            .attrib_size = attrib_size,
-        };
-    }
+pub fn Buffer(comptime T: type) type {
+    return struct {
+        const Self = @This();
+        const attrib_size = @sizeOf(T);
 
-    pub fn init_idx(n_indices: u32, device: *gpu.Device) Buffer {
-        std.log.info("buffer init: {} indices", .{n_indices});
-        const attrib_size = @sizeOf(u32);
-        const size = n_indices * attrib_size;
-        const descriptor = gpu.Buffer.Descriptor{
-            .size = size,
-            .usage = .{ .index = true, .copy_dst = true },
-            .mapped_at_creation = .false,
-        };
-        const data = device.createBuffer(&descriptor);
-
-        return .{
-            .data = data,
-            .size = size,
-            .offset = 0,
-            .attrib_size = attrib_size,
-        };
-    }
-
-    pub fn init_indirect(n_meshes: u32, device: *gpu.Device) Buffer {
-        std.log.info("buffer init: {} indirect", .{n_meshes});
-        const attrib_size = @sizeOf(u32) * 5;
-        const size = n_meshes * attrib_size;
-        const descriptor = gpu.Buffer.Descriptor{
-            .size = size,
-            .usage = .{ .indirect = true, .copy_dst = true },
-            .mapped_at_creation = .false,
-        };
-        const data = device.createBuffer(&descriptor);
-        return .{
-            .data = data,
-            .size = size,
-            .offset = 0,
-            .attrib_size = attrib_size,
-        };
-    }
+        data: *gpu.Buffer,
+        size: u32,
+        offset: u32,
+        attrib_size: u32,
+
+        pub fn init(
+            max_attribs: u32,
+            device: *gpu.Device,
+            usage: gpu.Buffer.UsageFlags,
+            mapped_at_creation: bool,
+        ) Self {
+            std.log.info("buffer init: {} {}", .{ max_attribs, T });
+            const size = max_attribs * attrib_size;
+            const descriptor = gpu.Buffer.Descriptor{
+                .size = size,
+                .usage = usage,
+                .mapped_at_creation = gpu.Bool32.from(mapped_at_creation),
+            };
+            const data = device.createBuffer(&descriptor);
+            return .{
+                .data = data,
+                .size = size,
+                .offset = 0,
+                .attrib_size = attrib_size,
+            };
+        }
 
-    pub fn init_instance(n_instances: u32, device: *gpu.Device) Buffer {
-        std.log.info("buffer init: {} instances", .{n_instances});
-        const attrib_size = @sizeOf(InstanceData);
-        const size = n_instances * attrib_size;
-        const descriptor = gpu.Buffer.Descriptor{
-            .size = size,
-            .usage = .{ .storage = true, .copy_dst = true },
-            .mapped_at_creation = .false,
-        };
-        const data = device.createBuffer(&descriptor);
-        return .{
-            .data = data,
-            .size = size,
-            .offset = 0,
-            .attrib_size = attrib_size,
-        };
-    }
+        pub fn alloc(self: *Self, n_attribs: u32) !u32 {
+            const size = n_attribs * attrib_size;
+            const offset_start = self.offset;
+            const offset_after = self.offset + size;
+            std.log.info("buffer alloc: {} {}", .{ n_attribs, T });
+            if (offset_after > self.size) {
+                std.log.err("buffer out of memory", .{});
+                return error.BufferEmpty;
+            }
+            self.offset = offset_after;
+            return offset_start / attrib_size;
+        }
 
-    pub fn alloc(self: *Buffer, n_elements: u32) !u32 {
-        const size = n_elements * self.attrib_size;
-        const offset_start = self.offset;
-        const offset_after = self.offset + size;
-        std.log.info("buffer alloc: {} elements", .{n_elements});
-        if (offset_after > self.size) {
-            std.log.err("buffer out of memory", .{});
-            return error.BufferEmpty;
+        pub fn write(
+            self: *Self,
+            data_slice: anytype,
+            queue: *gpu.Queue,
+            offset: u32,
+        ) void {
+            queue.writeBuffer(self.data, offset * attrib_size, data_slice);
         }
-        self.offset = offset_after;
-        return offset_start;
-    }
 
-    pub fn write(
-        self: *Buffer,
-        data_slice: anytype,
-        queue: *gpu.Queue,
-        offset: u32,
-    ) void {
-        queue.writeBuffer(self.data, offset, data_slice);
-    }
+        pub fn allocWrite(
+            self: *Self,
+            data_slice: anytype,
+            queue: *gpu.Queue,
+        ) !u32 {
+            const offset = try self.alloc(@intCast(data_slice.len));
+            queue.writeBuffer(self.data, offset * attrib_size, data_slice);
+            return offset;
+        }
 
-    pub fn allocWrite(self: *Buffer, data_slice: anytype, queue: *gpu.Queue) !u32 {
-        const offset = try self.alloc(@intCast(data_slice.len));
-        queue.writeBuffer(self.data, offset, data_slice);
-        return offset;
-    }
-};
+        pub fn count(self: *const Self) u32 {
+            return self.offset / attrib_size;
+        }
+    };
+}
 
 pub const MeshBufferOptions = struct {
     device: *gpu.Device,
@@ -422,19 +384,25 @@ pub const MeshOptions = struct {
     data: *const Mesh,
 };
 
+pub const ClusterMeshOptions = struct {
+    scale: f32 = 1.0,
+    offset: f32x3 = .{ 0, 0, 0 },
+    data: *const ClusterMesh,
+};
+
 pub const MeshBuffer = struct {
     device: *gpu.Device,
     queue: *gpu.Queue,
 
-    positions: Buffer,
-    normals: Buffer,
-    tex_coords: Buffer,
-    tangents: Buffer,
-    bitangents: Buffer,
-    indices: Buffer,
+    positions: Buffer(f32x4),
+    normals: Buffer(f32x4),
+    tex_coords: Buffer(f32x4),
+    tangents: Buffer(f32x4),
+    bitangents: Buffer(f32x4),
+    indices: Buffer(u32),
 
-    instances: Buffer,
-    indirect: Buffer,
+    instances: Buffer(InstanceData),
+    indirect: Buffer([5]u32),
 
     meshes: std.ArrayList(RenderMesh),
 
@@ -446,14 +414,54 @@ pub const MeshBuffer = struct {
         return .{
             .device = options.device,
             .queue = options.queue,
-            .positions = Buffer.init_vtx(n_vertices, device),
-            .normals = Buffer.init_vtx(n_vertices, device),
-            .tex_coords = Buffer.init_vtx(n_vertices, device),
-            .tangents = Buffer.init_vtx(n_vertices, device),
-            .bitangents = Buffer.init_vtx(n_vertices, device),
-            .indices = Buffer.init_idx(n_indices, device),
-            .instances = Buffer.init_instance(options.n_meshes, device),
-            .indirect = Buffer.init_indirect(options.n_meshes, device),
+            .positions = Buffer(f32x4).init(
+                n_vertices,
+                device,
+                .{ .vertex = true, .copy_dst = true },
+                false,
+            ),
+            .normals = Buffer(f32x4).init(
+                n_vertices,
+                device,
+                .{ .vertex = true, .copy_dst = true },
+                false,
+            ),
+            .tex_coords = Buffer(f32x4).init(
+                n_vertices,
+                device,
+                .{ .vertex = true, .copy_dst = true },
+                false,
+            ),
+            .tangents = Buffer(f32x4).init(
+                n_vertices,
+                device,
+                .{ .vertex = true, .copy_dst = true },
+                false,
+            ),
+            .bitangents = Buffer(f32x4).init(
+                n_vertices,
+                device,
+                .{ .vertex = true, .copy_dst = true },
+                false,
+            ),
+            .indices = Buffer(u32).init(
+                n_indices,
+                device,
+                .{ .index = true, .copy_dst = true },
+                false,
+            ),
+            .instances = Buffer(InstanceData).init(
+                options.n_meshes,
+                device,
+                .{ .storage = true, .copy_dst = true },
+                false,
+            ),
+            .indirect = Buffer([5]u32).init(
+                options.n_meshes,
+                device,
+                .{ .indirect = true, .copy_dst = true },
+                false,
+            ),
             .meshes = std.ArrayList(RenderMesh).init(options.allocator),
         };
     }
@@ -502,19 +510,67 @@ pub const MeshBuffer = struct {
         return &self.meshes.items[0];
     }
 
+    pub fn initClusterMesh(
+        self: *MeshBuffer,
+        options: *const ClusterMeshOptions,
+        allocator: std.mem.Allocator,
+    ) !*RenderMesh {
+        const mesh_data = options.data;
+        const num_vertices = mesh_data.positions.len;
+        const num_indices = mesh_data.indices.len;
+        const num_clusters = mesh_data.descriptors.len;
+        std.log.info(
+            "init mesh with {} vertices, {} indices, {} clusters",
+            .{ num_vertices, num_indices, num_clusters },
+        );
+
+        const offset_p = try self.positions.allocWrite(mesh_data.positions, self.queue);
+        _ = try self.normals.allocWrite(mesh_data.normals, self.queue);
+        _ = try self.tex_coords.allocWrite(mesh_data.tex_coords, self.queue);
+        _ = try self.tangents.allocWrite(mesh_data.tangents, self.queue);
+        _ = try self.bitangents.allocWrite(mesh_data.bitangents, self.queue);
+
+        const offset_i = try self.indices.allocWrite(mesh_data.indices, self.queue);
+
+        const mesh = RenderMesh{
+            .num_vertices = num_vertices,
+            .num_indices = num_indices,
+            .scale = options.scale,
+            .offset = options.offset,
+        };
+
+        const instances = [_]InstanceData{
+            mesh.instanceData(),
+        };
+        _ = try self.instances.allocWrite(&instances, self.queue);
+
+        const indirect_args = try allocator.alloc([5]u32, num_clusters);
+        defer allocator.free(indirect_args);
+
+        var mesh_id: u32 = @intCast(self.meshes.items.len);
+        for (mesh_data.descriptors, 0..) |descriptor, i| {
+            indirect_args[i] = .{
+                @intCast(descriptor.num_triangles * 3),
+                1,
+                offset_i + descriptor.offset,
+                offset_p,
+                mesh_id,
+            };
+            mesh_id += 1;
+        }
+        _ = try self.indirect.allocWrite(indirect_args, self.queue);
+
+        try self.meshes.append(mesh);
+        return &self.meshes.items[0];
+    }
+
     pub fn updateMeshes(self: *MeshBuffer) void {
-        std.log.info("RUNNING!", .{});
         for (self.meshes.items, 0..) |*mesh, i| {
             if (!mesh.dirty) {
-                std.log.info("{}", .{mesh.dirty});
                 continue;
             }
-            const offset = @sizeOf(InstanceData) * i;
-            const instances = [_]InstanceData{
-                mesh.instanceData(),
-            };
-            std.log.info("instance: {}", .{mesh.instanceData()});
-            self.instances.write(&instances, self.queue, @intCast(offset));
+            const instances = [_]InstanceData{mesh.instanceData()};
+            self.instances.write(&instances, self.queue, @intCast(i));
             mesh.dirty = false;
         }
     }
@@ -590,30 +646,36 @@ pub fn main() !void {
     var mesh_buffer = MeshBuffer.init(.{
         .device = app.device,
         .queue = app.queue,
-        .n_meshes = 10,
+        .n_meshes = 200,
         .n_vertices = 60000,
         .n_indices = 60000,
         .allocator = allocator,
     });
 
-    const mesh = try load_obj.loadFile(.{
+    var mesh = try load_obj.loadFile(.{
         .allocator = allocator,
         .path = "Cottage_Clean/cottage.obj",
     });
-    var render_mesh = try mesh_buffer.initMesh(&.{
-        .data = &mesh,
-        .scale = 1,
-        .offset = .{ 0, -5, 15 },
-    });
 
-    _ = try load_obj.loadFile(.{
-        .allocator = allocator,
-        .path = "teapot.obj",
-        //.scale = 0.05,
-        //.offset = .{ 0, -0.5, 0 },
-    });
+    //var bunny_mesh = try load_obj.loadFile(.{
+    //    .allocator = allocator,
+    //    .path = "bunny-fixed.obj",
+    //});
+
+    const optimizeMesh = @import("forsyth_optimize.zig").optimize;
+    try optimizeMesh(allocator, &mesh, 32, .{});
+
+    const buildClusters = @import("meshlets.zig").buildClusters;
+    const cluster_mesh = try buildClusters(allocator, &mesh, 64, 126);
+    std.log.info("num clusters: {}", .{cluster_mesh.descriptors.len});
+
+    var render_mesh = try mesh_buffer.initClusterMesh(&.{
+        .data = &cluster_mesh,
+        .scale = 1,
+        .offset = .{ 0, 0, 0 },
+    }, allocator);
 
-    var rp = try MeshRenderPipeline.init(&app, &mesh_buffer, &textures, "shaders/mesh_pbr.wgsl");
+    var rp = try MeshRenderPipeline.init(&app, &mesh_buffer, &textures, "shaders/mesh_clusters.wgsl");
 
     const material = try Material.init(
         &textures,
diff --git a/src/mesh_render_pipeline.zig b/src/mesh_render_pipeline.zig
@@ -173,8 +173,9 @@ pub fn frame(ptr: *anyopaque, pass: *gpu.RenderPassEncoder) void {
     pass.setBindGroup(0, self.bind_group, null);
     pass.setIndexBuffer(self.mesh_buffer.indices.data, .uint32, 0, self.mesh_buffer.indices.size);
 
-    for (0..self.mesh_buffer.meshes.items.len) |mesh_idx| {
-        const offset = mesh_idx * 5 * @sizeOf(u32);
+    const num_indirect = self.mesh_buffer.indirect.count();
+    for (0..num_indirect) |idx| {
+        const offset = idx * self.mesh_buffer.indirect.attrib_size;
         pass.drawIndexedIndirect(self.mesh_buffer.indirect.data, offset);
     }
 }
diff --git a/src/meshlets.zig b/src/meshlets.zig
@@ -0,0 +1,120 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+const Mesh = @import("mesh.zig");
+
+const f32x3 = @Vector(3, f32);
+const f32x4 = @Vector(4, f32);
+
+pub const ClusterDescriptor = struct {
+    offset: u32,
+    num_vertices: u32,
+    num_triangles: u32,
+};
+
+pub const ClusterMesh = struct {
+    positions: []f32x4,
+    normals: []f32x3,
+    tex_coords: []f32x3,
+    tangents: []f32x3,
+    bitangents: []f32x3,
+    indices: []u32,
+    descriptors: []ClusterDescriptor,
+};
+
+pub fn buildClusters(
+    allocator: std.mem.Allocator,
+    mesh: *const Mesh,
+    max_vertices: u32,
+    max_triangles: u32,
+) !ClusterMesh {
+    // Only triangle meshes are supported
+    assert(mesh.indices.len % 3 == 0);
+    // Must have at least one triangle
+    assert(max_vertices >= 3);
+    assert(max_triangles >= 1);
+
+    // 'unused' tracks which vertices have not been used
+    // unused[vtx_idx] = 1 -> not used
+    // unused[vtx_idx] = 0 -> used
+    const unused = try allocator.alloc(u8, mesh.positions.len);
+    defer allocator.free(unused);
+    @memset(unused, 1);
+
+    const indices = try allocator.alloc(u32, mesh.indices.len);
+    var descriptors = std.ArrayList(ClusterDescriptor).init(allocator);
+    defer descriptors.deinit();
+
+    var num_vertices: u32 = 0;
+    var num_triangles: u32 = 0;
+    var cluster_offset: u32 = 0;
+
+    for (0..mesh.indices.len / 3) |i| {
+        const mesh_offset = i * 3;
+        const a = mesh.indices[mesh_offset];
+        const b = mesh.indices[mesh_offset + 1];
+        const c = mesh.indices[mesh_offset + 2];
+
+        // Triangles must be non-degenerate
+        assert(a != b and a != c and b != c);
+        // Vertices must be contained in the mesh
+        const a_valid = a < mesh.positions.len;
+        const b_valid = b < mesh.positions.len;
+        const c_valid = c < mesh.positions.len;
+        assert(a_valid and b_valid and c_valid);
+
+        const num_extra = unused[a] + unused[b] + unused[c];
+        const too_many_vertices = num_vertices + num_extra >= max_vertices;
+        const too_many_triangles = num_triangles >= max_triangles;
+        if (too_many_vertices or too_many_triangles) {
+            // Save cluster info
+            try descriptors.append(.{
+                .offset = cluster_offset,
+                .num_vertices = num_vertices,
+                .num_triangles = num_triangles,
+            });
+
+            // Reset state for next cluster
+            @memset(unused, 1);
+            cluster_offset += num_triangles * 3;
+            num_vertices = 0;
+            num_triangles = 0;
+        }
+
+        // Mark vertices as used
+        unused[a] = 0;
+        unused[b] = 0;
+        unused[c] = 0;
+
+        // Add triangle to cluster
+        const offset = cluster_offset + num_triangles * 3;
+        indices[offset] = a;
+        indices[offset + 1] = b;
+        indices[offset + 2] = c;
+
+        num_vertices += num_extra;
+        num_triangles += 1;
+    }
+
+    // Add final cluster
+    if (num_triangles > 0) {
+        try descriptors.append(.{
+            .offset = cluster_offset,
+            .num_vertices = num_vertices,
+            .num_triangles = num_triangles,
+        });
+    }
+
+    // Total size of mesh should not change
+    assert(mesh.indices.len == cluster_offset + num_triangles * 3);
+
+    return .{
+        .positions = mesh.positions,
+        .normals = mesh.normals,
+        .tex_coords = mesh.tex_coords,
+        .tangents = mesh.tangents,
+        .bitangents = mesh.bitangents,
+        .indices = indices,
+        .descriptors = try descriptors.toOwnedSlice(),
+    };
+}
diff --git a/src/meshlets_adjacency.zig b/src/meshlets_adjacency.zig
@@ -0,0 +1,446 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+const HalfEdgeMap = @import("half_edge_map.zig");
+const Mesh = @import("mesh.zig");
+
+const meshlets = @import("meshlets.zig");
+const ClusterDescriptor = meshlets.ClusterDescriptor;
+const ClusterMesh = meshlets.ClusterMesh;
+
+pub fn buildClusters(
+    allocator: std.mem.Allocator,
+    mesh: *const Mesh,
+    max_vertices: u32,
+    max_triangles: u32,
+) !ClusterMesh {
+    // Only triangle meshes are supported
+    assert(mesh.indices.len % 3 == 0);
+    // Must have at least one triangle
+    assert(max_vertices >= 3);
+    assert(max_triangles >= 1);
+
+    // 'unused' tracks which vertices have not been used
+    // unused[vtx_idx] = 1 -> not used
+    // unused[vtx_idx] = 0 -> used
+    const unused = try allocator.alloc(u8, mesh.positions.len);
+    defer allocator.free(unused);
+    @memset(unused, 1);
+
+    const indices = try allocator.alloc(u32, mesh.indices.len);
+    var descriptors = std.ArrayList(ClusterDescriptor).init(allocator);
+    defer descriptors.deinit();
+
+    var num_vertices: u32 = 0;
+    var num_triangles: u32 = 0;
+    var cluster_offset: u32 = 0;
+
+    var map = try HalfEdgeMap.init(mesh.indices, allocator);
+    var tri: [3]u32 = undefined;
+    var adjacent = std.ArrayList(u32).init(allocator);
+    const usage = try buildUsageMap(mesh, allocator);
+
+    for (0..mesh.indices.len / 3) |_| {
+        // Fetch next triangle
+        if (adjacent.items.len > 0) {
+            var min_extra: u32 = std.math.maxInt(u32);
+            var min_score: u32 = std.math.maxInt(u32);
+            var idx_best: u32 = 0;
+            const num_adjacent = adjacent.items.len / 3;
+            for (0..num_adjacent) |triangle_idx| {
+                const i = @as(u32, @intCast(triangle_idx)) * 3;
+                const adj = [3]u32{
+                    adjacent.items[i],
+                    adjacent.items[i + 1],
+                    adjacent.items[i + 2],
+                };
+
+                var extra: u32 = 1;
+                if (inSlice(u32, indices[cluster_offset..], adj[2])) {
+                    extra = 0;
+                }
+
+                if ((usage[adj[0]] == 1) or (usage[adj[1]] == 1) or (usage[adj[2]] == 1)) {
+                    extra = 0;
+                }
+
+                if (extra > min_extra) {
+                    continue;
+                }
+                const score = usage[adj[0]] + usage[adj[1]] + usage[adj[2]] - 3;
+
+                if ((extra < min_extra) or (score < min_score)) {
+                    min_extra = extra;
+                    min_score = score;
+                    tri = adj;
+                    idx_best = i;
+                }
+            }
+            tri = [3]u32{
+                adjacent.items[idx_best],
+                adjacent.items[idx_best + 1],
+                adjacent.items[idx_best + 2],
+            };
+            _ = adjacent.orderedRemove(idx_best);
+            _ = adjacent.orderedRemove(idx_best);
+            _ = adjacent.orderedRemove(idx_best);
+        } else {
+            tri = map.pop();
+        }
+        const a = tri[0];
+        const b = tri[1];
+        const c = tri[2];
+        usage[a] -= 1;
+        usage[b] -= 1;
+        usage[c] -= 1;
+
+        // Triangles must be non-degenerate
+        assert(a != b and a != c and b != c);
+
+        // Vertices must be contained in the mesh
+        const a_valid = a < mesh.positions.len;
+        const b_valid = b < mesh.positions.len;
+        const c_valid = c < mesh.positions.len;
+        assert(a_valid and b_valid and c_valid);
+
+        // Verify cluster has space
+        const num_extra = unused[a] + unused[b] + unused[c];
+        const too_many_vertices = num_vertices + num_extra >= max_vertices;
+        const too_many_triangles = num_triangles >= max_triangles;
+        if (too_many_vertices or too_many_triangles) {
+            // Save cluster info
+            try descriptors.append(.{
+                .offset = cluster_offset,
+                .num_vertices = num_vertices,
+                .num_triangles = num_triangles,
+            });
+
+            // Reset state for next cluster
+            @memset(unused, 1);
+            cluster_offset += num_triangles * 3;
+            num_vertices = 0;
+            num_triangles = 0;
+
+            // Add all adjacent triangles back to the half-edge map
+            for (0..adjacent.items.len / 3) |i| {
+                const idx = i * 3;
+                const adj = [3]u32{
+                    adjacent.items[idx],
+                    adjacent.items[idx + 1],
+                    adjacent.items[idx + 2],
+                };
+                map.insert(adj);
+            }
+            adjacent.clearAndFree();
+        }
+
+        // Mark vertices as used
+        unused[a] = 0;
+        unused[b] = 0;
+        unused[c] = 0;
+
+        // Add triangle to cluster
+        const offset = cluster_offset + num_triangles * 3;
+        indices[offset] = a;
+        indices[offset + 1] = b;
+        indices[offset + 2] = c;
+
+        num_vertices += num_extra;
+        num_triangles += 1;
+
+        // Get newly adjacent triangles
+        for (0..3) |side| {
+            const triangle = map.adjacentTo(tri, @intCast(side)) orelse {
+                continue;
+            };
+            try adjacent.appendSlice(&triangle);
+            std.debug.assert(map.remove(triangle));
+        }
+
+        // If we're out of adjacent triangles, this cluster is done
+        const no_more_adjacent = adjacent.items.len <= 0;
+        if (no_more_adjacent) {
+            // Save cluster info
+            try descriptors.append(.{
+                .offset = cluster_offset,
+                .num_vertices = num_vertices,
+                .num_triangles = num_triangles,
+            });
+
+            // Reset state for next cluster
+            @memset(unused, 1);
+            cluster_offset += num_triangles * 3;
+            num_vertices = 0;
+            num_triangles = 0;
+            continue;
+        }
+    }
+
+    // Add final cluster
+    if (num_triangles > 0) {
+        try descriptors.append(.{
+            .offset = cluster_offset,
+            .num_vertices = num_vertices,
+            .num_triangles = num_triangles,
+        });
+    }
+
+    // Total size of mesh should not change
+    assert(mesh.indices.len == cluster_offset + num_triangles * 3);
+
+    return .{
+        .positions = mesh.positions,
+        .normals = mesh.normals,
+        .tex_coords = mesh.tex_coords,
+        .tangents = mesh.tangents,
+        .bitangents = mesh.bitangents,
+        .indices = indices,
+        .descriptors = try descriptors.toOwnedSlice(),
+    };
+}
+
+pub fn buildUsageMap(
+    mesh: *const Mesh,
+    allocator: std.mem.Allocator,
+) ![]u32 {
+    const num_vertices = mesh.positions.len;
+    const map = try allocator.alloc(u32, num_vertices);
+    @memset(map, 0);
+    for (mesh.indices) |idx| {
+        map[idx] += 1;
+    }
+    return map;
+}
+
+fn inSlice(comptime T: type, slice: []const T, item: T) bool {
+    for (slice) |iter_item| {
+        if (iter_item == item) {
+            return true;
+        }
+    }
+    return false;
+}
+
+//const ClusterOptions = struct {
+//    max_verts: u32,
+//    max_prims: u32,
+//};
+//
+//const ClusterDescriptor = struct {
+//    offset: u32,
+//    num_prims: u32,
+//    num_verts: u32,
+//};
+//
+//pub fn buildClusters(mesh: *const Mesh, allocator: std.mem.Allocator) !void {
+//    const max_prims = 126;
+//    var offset: u32 = 0;
+//
+//    var map = try HalfEdgeMap.init(mesh.indices, allocator);
+//    var tri: ?[3]u32 = null;
+//
+//    var descriptors = std.ArrayList(ClusterDescriptor).init(allocator);
+//    // this holds offsets and counts for all clusters
+//
+//    const indices: []u32 = try allocator.alloc(u32, mesh.indices.len);
+//    // this holds the actual index data for all clusters
+//    // should be free'd by caller (eventually)
+//
+//    while (map.count() > 0) {
+//        if (tri == null) {
+//            tri = map.pop();
+//        }
+//        const cluster_end = @min(offset + max_prims * 3, mesh.indices.len);
+//        const cluster = indices[offset..cluster_end];
+//
+//        //result = try buildClusterFewest(allocator, &map, usage, tri.?);
+//        const result = try buildClusterSimple(
+//            allocator,
+//            &map,
+//            tri.?,
+//            cluster,
+//        );
+//        offset += result.descriptor.num_prims * 3;
+//        try descriptors.append(result.descriptor);
+//        tri = result.next;
+//    }
+//    std.log.info("offset: {}", .{offset});
+//    std.debug.assert(offset == indices.len);
+//    std.log.info("num clusters: {}", .{descriptors.items.len});
+//}
+//
+//const ClusterResult = struct {
+//    descriptor: ClusterDescriptor,
+//    next: ?[3]u32,
+//};
+//
+//pub const Triangle = struct {
+//    a: u32,
+//    b: u32,
+//    c: u32,
+//
+//    pub fn permute(self: *const Triangle) Triangle {
+//        return .{ .a = self.b, .b = self.c, .c = self.a };
+//    }
+//
+//    pub fn reverse(self: *const Triangle) Triangle {
+//        return .{ .a = self.c, .b = self.b, .c = self.a };
+//    }
+//};
+//
+//fn buildClusterSimple(
+//    allocator: std.mem.Allocator,
+//    map: *HalfEdgeMap,
+//    initial_triangle: [3]u32,
+//    cluster: []u32,
+//) !ClusterResult {
+//    const max_prims = 126;
+//
+//    var vertices = std.AutoHashMap(u32, void).init(allocator);
+//    var adjacent = std.ArrayList(Triangle).init(allocator);
+//    var tri: ?Triangle = .{
+//        .a = initial_triangle[0],
+//        .b = initial_triangle[1],
+//        .c = initial_triangle[2],
+//    };
+//
+//    var idx: u32 = 0;
+//    while (idx < max_prims * 3) {
+//        // Add the most recent triangle to the cluster
+//        cluster[idx] = tri.?.a;
+//        cluster[idx + 1] = tri.?.b;
+//        cluster[idx + 2] = tri.?.c;
+//        //@memcpy(cluster[idx .. idx + 3], &tri.?);
+//        try vertices.put(tri.?.a, {});
+//        try vertices.put(tri.?.b, {});
+//        try vertices.put(tri.?.c, {});
+//        idx += 3;
+//
+//        // Get newly adjacent triangles
+//        for (0..3) |side| {
+//            const triangle = map.adjacentTo(tri.?, @intCast(side)) orelse {
+//                continue;
+//            };
+//            try adjacent.append(triangle);
+//            std.debug.assert(map.remove(triangle));
+//        }
+//        if (adjacent.items.len <= 0) {
+//            tri = null;
+//            break;
+//        }
+//
+//        // Oldest adjacent triangle is up next
+//        tri = [3]u32{
+//            adjacent.items[0].a,
+//            adjacent.items[0].b,
+//            adjacent.items[0].c,
+//        };
+//        _ = adjacent.orderedRemove(0);
+//    }
+//
+//    // Add all adjacent triangles back to the half-edge map
+//    for (adjacent.items) |adj_tri| {
+//        //const adj = [3]u32{ adj_tri.a, adj_tri.b, adj_tri.c };
+//        map.insert(adj_tri);
+//    }
+//    if (tri != null) {
+//        const triangle = .{
+//            .a = tri.?[0],
+//            .b = tri.?[1],
+//            .c = tri.?[2],
+//        };
+//        map.insert(triangle);
+//    }
+//
+//    return .{
+//        .descriptor = .{
+//            .offset = undefined,
+//            .num_prims = idx / 3,
+//            .num_verts = vertices.count(),
+//        },
+//        .next = null, //tri,
+//    };
+//}
+//
+//fn buildClusterFewest(
+//    allocator: std.mem.Allocator,
+//    map: *HalfEdgeMap,
+//    usage: []u32,
+//    initial_triangle: [3]u32,
+//) !ClusterResult {
+//    var cluster = std.ArrayList(u32).init(allocator);
+//    var adjacent = std.ArrayList(u32).init(allocator);
+//    var tri: ?[3]u32 = initial_triangle;
+//    while (cluster.items.len < 126 * 3) {
+//        // Add the most recent triangle to the cluster
+//        try cluster.appendSlice(&tri.?);
+//
+//        // Get newly adjacent triangles
+//        for (0..3) |side| {
+//            const triangle = map.adjacentTo(tri.?, @intCast(side)) orelse {
+//                continue;
+//            };
+//            try adjacent.appendSlice(&triangle);
+//            std.debug.assert(map.remove(triangle));
+//        }
+//        if (adjacent.items.len <= 0) {
+//            tri = null;
+//            break;
+//        }
+//
+//        var min_extra: u32 = std.math.maxInt(u32);
+//        var min_score: u32 = std.math.maxInt(u32);
+//        var idx_best: u32 = 0;
+//        const num_adjacent = adjacent.items.len / 3;
+//        for (0..num_adjacent) |triangle_idx| {
+//            const i = @as(u32, @intCast(triangle_idx)) * 3;
+//            const adj = [3]u32{
+//                adjacent.items[i],
+//                adjacent.items[i + 1],
+//                adjacent.items[i + 2],
+//            };
+//
+//            var extra: u32 = 1;
+//            if (inSlice(u32, cluster.items, adj[2])) {
+//                extra = 0;
+//            }
+//
+//            if ((usage[adj[0]] == 1) or (usage[adj[1]] == 1) or (usage[adj[2]] == 1)) {
+//                extra = 0;
+//            }
+//
+//            if (extra > min_extra) {
+//                continue;
+//            }
+//            const score = usage[adj[0]] + usage[adj[1]] + usage[adj[2]] - 3;
+//
+//            if ((extra < min_extra) or (score < min_score)) {
+//                min_extra = extra;
+//                min_score = score;
+//                tri = adj;
+//                idx_best = i;
+//            }
+//        }
+//        _ = adjacent.orderedRemove(idx_best);
+//        _ = adjacent.orderedRemove(idx_best);
+//        _ = adjacent.orderedRemove(idx_best);
+//        usage[tri.?[0]] -= 1;
+//        usage[tri.?[1]] -= 1;
+//        usage[tri.?[2]] -= 1;
+//    }
+//    const num_adjacent = adjacent.items.len / 3;
+//    for (0..num_adjacent) |triangle_idx| {
+//        const i = @as(u32, @intCast(triangle_idx)) * 3;
+//        const adj = [3]u32{
+//            adjacent.items[i],
+//            adjacent.items[i + 1],
+//            adjacent.items[i + 2],
+//        };
+//        map.insert(adj);
+//    }
+//    std.log.info("cluster size: {}", .{cluster.items.len / 3});
+//    return .{
+//        .cluster = cluster,
+//        .next = tri,
+//    };
+//}
diff --git a/src/shaders/mesh_clusters.wgsl b/src/shaders/mesh_clusters.wgsl
@@ -0,0 +1,204 @@
+struct VertexInput {
+    @location(0) position: vec4<f32>,
+    @location(1) normal: vec3<f32>,
+    @location(2) tex_coord: vec3<f32>,
+    @location(3) tangent: vec3<f32>,
+    @location(4) bitangent: vec3<f32>,
+};
+
+struct VertexOutput {
+    @builtin(position) clip_position: vec4<f32>,
+    @location(0) tex_coord: vec3<f32>,
+    @location(1) pt_fragment: vec3<f32>,
+    @location(2) pt_light: vec3<f32>,
+    @location(3) pt_camera: vec3<f32>,
+    @location(4) @interpolate(flat) instance_id: u32,
+};
+
+@group(0) @binding(0)
+var<storage, read> instances: array<Instance>;
+struct Instance {
+    model_matrix: mat4x4<f32>,
+    material: Material,
+};
+struct Material {
+    albedo: u32,
+    normal: u32,
+    roughness: u32,
+    metalness: u32,
+};
+
+@group(0) @binding(1)
+var<uniform> uniform_data: UniformData;
+struct UniformData {
+    pw_camera: vec3<f32>,
+    view_matrix: mat4x4<f32>,
+    proj_matrix: mat4x4<f32>,
+};
+
+@vertex fn vertex(
+    in: VertexInput,
+    @builtin(instance_index) idx: u32,
+) -> VertexOutput {
+    let model_matrix = instances[0].model_matrix;
+    let camera_matrix = uniform_data.proj_matrix * uniform_data.view_matrix;
+
+    let t = normalize((model_matrix * vec4(in.tangent, 0.0)).xyz);
+    let b = normalize((model_matrix * vec4(in.bitangent, 0.0)).xyz);
+    let n = normalize((model_matrix * vec4(in.normal, 0.0)).xyz);
+    let tangent_matrix = mat3x3<f32>(t, b, n);
+
+    let world_position = model_matrix * in.position;
+    let tangent_position = tangent_matrix * world_position.xyz;
+
+    let light_position = vec3<f32>(20, 80, 20);
+    let tangent_light_position = tangent_matrix * light_position;
+
+    let pt_camera = tangent_matrix * uniform_data.pw_camera;
+
+
+    return VertexOutput(
+        camera_matrix * world_position,
+        in.tex_coord,
+        tangent_position,
+        tangent_light_position,
+        pt_camera,
+        idx,
+    );
+}
+
+
+@group(0) @binding(2) var material_sampler: sampler;
+@group(0) @binding(3) var material_texture: texture_2d_array<f32>;
+
+fn diffuse_lambert(albedo: vec3<f32>) -> vec3<f32> {
+    return albedo * 0.31830988618;
+}
+
+fn distribution_trowbridge_reitz_ggx(
+    n_dot_h: f32,
+    alpha: f32,
+) -> f32 {
+    let alpha_squared = alpha * alpha;
+    let n_dot_h_squared = n_dot_h * n_dot_h;
+    let denom = n_dot_h_squared * (alpha_squared - 1) + 1;
+    let denom_squared = denom * denom;
+    return alpha_squared / (3.14159265359 * denom_squared);
+}
+
+fn geometry_schlick_ggx(
+    n_dot_x: f32,
+    alpha: f32,
+) -> f32 {
+    let k = 0.5 * alpha;
+    let denom = n_dot_x * (1.0 - k) + k;
+    return n_dot_x / denom;
+}
+
+fn geometry_smith(
+    n_dot_l: f32,
+    n_dot_v: f32,
+    alpha: f32,
+) -> f32 {
+    let g1 = geometry_schlick_ggx(n_dot_l, alpha);
+    let g2 = geometry_schlick_ggx(n_dot_v, alpha);
+    return g1 * g2;
+}
+
+fn fresnel_schlick(
+    f0: vec3<f32>,
+    v_dot_h: f32,
+) -> vec3<f32> {
+    return f0 + (1.0 - f0) * pow(1.0 - v_dot_h, 5.0);
+}
+
+fn cook_torrance(
+    n_dot_v: f32,
+    n_dot_l: f32,
+    n_dot_h: f32,
+    roughness: f32,
+) -> f32 {
+    let alpha = roughness * roughness;
+    let distribution = distribution_trowbridge_reitz_ggx(
+        n_dot_h,
+        alpha,
+    );
+    let geometry = geometry_smith(
+        n_dot_l,
+        n_dot_v,
+        alpha,
+    );
+    let denom = 4.0 * n_dot_v * n_dot_l + 0.000001;
+    return (distribution * geometry) / denom;
+}
+
+fn lighting(
+    normal: vec3<f32>,
+    p_fragment: vec3<f32>,
+    p_camera: vec3<f32>,
+    p_light: vec3<f32>,
+    c_light: vec3<f32>,
+    albedo: vec3<f32>,
+    roughness: f32,
+    metalness: f32,
+) -> vec3<f32> {
+    let v = normalize(p_camera - p_fragment);
+    let l = normalize(p_light - p_fragment);
+    let h = normalize(v + l);
+
+    let n_dot_v = max(dot(normal, v), 0.0);
+    let n_dot_l = max(dot(normal, l), 0.0);
+    let n_dot_h = max(dot(normal, h), 0.0);
+    let v_dot_h = max(dot(v, h), 0.0);
+
+    let f0 = mix(vec3<f32>(0.04), albedo, metalness);
+    let k_s = fresnel_schlick(f0, v_dot_h);
+    let k_d = 1.0 - k_s;
+
+    let diffuse = k_d * diffuse_lambert(albedo);
+    let specular = k_s * cook_torrance(n_dot_v, n_dot_l, n_dot_h, roughness);
+    let brdf = (diffuse + specular) * n_dot_l;
+    return brdf * c_light;
+}
+
+@fragment fn fragment(in: VertexOutput) -> @location(0) vec4<f32> {
+    let material = instances[in.instance_id].material;
+    let uv = vec2<f32>(in.tex_coord.x, 1 - in.tex_coord.y);
+    var albedo = textureSample(material_texture, material_sampler, uv, material.albedo).rgb;
+    albedo = pow(albedo, vec3<f32>(2.2));
+    var normal = textureSample(material_texture, material_sampler, uv, material.normal).rgb;
+    normal = normalize(normal * 2.0 - 1.0);
+    var roughness = textureSample(material_texture, material_sampler, uv, material.roughness).r;
+    var metalness = textureSample(material_texture, material_sampler, uv, material.metalness).r;
+
+    let light_color = normalize(vec3<f32>(23.47, 21.31, 20.79));
+    //let light_color = normalize(vec3<f32>(0.9, 0.9, 0.9));
+    var c = lighting(
+        normal,
+        in.pt_fragment,
+        in.pt_camera,
+        in.pt_light,
+        light_color,
+        albedo.rgb,
+        roughness,
+        metalness,
+    );
+
+    let ambient = vec3<f32>(0.03) * albedo;
+    c += ambient;
+
+    c = c / (c + 1.0);
+    c = pow(c, vec3<f32>(1.0/2.2));
+
+    let colors = array<vec3<f32>, 6>(
+        vec3<f32>(1, 1, 0),
+        vec3<f32>(0, 1, 1),
+        vec3<f32>(1, 0, 1),
+        vec3<f32>(1, 0, 0),
+        vec3<f32>(0, 1, 0),
+        vec3<f32>(0, 0, 1),
+    );
+    let color = colors[in.instance_id % 6];
+
+    return vec4<f32>(color, 1.0);
+}

	render-zig A 3D rendering engine written in Zig
	git clone git://git.christianermann.dev/render-zig
	Log \| Files \| Refs

A	src/forsyth_optimize.zig	\|	278	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/half_edge_map.zig	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/main.zig	\|	342	+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M	src/mesh_render_pipeline.zig	\|	5	+++--
A	src/meshlets.zig	\|	120	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/meshlets_adjacency.zig	\|	446	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/shaders/mesh_clusters.wgsl	\|	204	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++