rtx: fix corrupted geometry

The TLAS creation pattern ended up allocating memory in a temporary region. That region was later reused for BLAS data and subsequently corrupted by the TLAS rebuild. Also fixed a memory leak, added a freeze-models command, added model debug code to the shaders, etc.
2024-12-16 22:20:01 +01:00 · 2021-05-08 14:34:42 -07:00 · 2021-05-08 14:34:42 -07:00 · 5ddeb6853d
commit 5ddeb6853d
parent 354eef1c0a
3 changed files with 110 additions and 14 deletions
--- a/ref_vk/TODO.md
+++ b/ref_vk/TODO.md
@ -1,8 +1,14 @@
+## 2021-05-08, E92
+- [x] rtx: weird purple bbox-like glitches on dynamic geometry (tlas vs blas memory corruption/aliasing)
+
 # Next
- [ ] make map/frame lifetime aware allocator and use it everywhere: render, rtx buffers, etc
+- [ ] rtx: improve AS lifetime/management; i.e. pre-cache them, etc
+- [ ] add debug names to all of the buffers
+- [ ] use NVIDIA Nsight to get insight into buffer memory and stuff
+- [ ] rtx: build acceleration structures in a single queue/cmdbuf
 - [ ] studio models: pre-compute buffer sizes and allocate them at once
+- [ ] studio models: fix lighting: should have white texture instead of lightmap
 - [ ] rtx: dynamic models AS caching
- [ ] rtx: weird purple bbox-like glitches on dynamic geometry
 - [ ] rtx: better memory handling
 	- [ ] robust tracking of memory hierarchies: global/static, map, frame
 	- or just do a generic allocator with compaction?
@ -225,3 +231,6 @@

 ## 2021-05-01, E89
 - [x] make a wrapper for descriptor sets/layouts
+
+## 2021-05-03, E90
+- [x] make map/frame lifetime aware allocator and use it everywhere: render, rtx buffers, etc
--- a/ref_vk/shaders/rtx.comp
+++ b/ref_vk/shaders/rtx.comp
@ -150,6 +150,32 @@ layout (push_constant) uniform PC {
 //uint picked_light = 76;//uint(mod(pc.t * 4., emissive_kusochki.num_kusochki));
 int time_off = int(pc.t * 8.);

+float hash(float f) { return fract(sin(f)*53478.4327); } // cheap float hash: fractional part of the scaled sine of f, so the result is in [0, 1)
+
+float printTiledNumber(vec2 p, int n) { // debug helper: returns the value printInt leaves in col for pixel p; presumably n rendered as tiled digits (verify against printInt)
+	if (n == 0) return 0.; // early out: n == 0 always yields 0
+	float t = pc.t; // reads the global `pc` before the local vec2 `pc` declared below shadows it; only the commented-out code uses t
+	float x = floor(p.x / 5. / 2.); // only referenced by the commented-out line below
+	//p.y += 12. * fract(pc.t * (4. + 3. * hash(x)));
+    p = floor(p / 2.); // halve the coordinate resolution
+    vec2 pc = floor(p / vec2(5.,6.)); // index of the 5x6 cell containing p (shadows the global `pc` from here on)
+    vec2 pg = mod(p, vec2(5.,6.)); // position of p inside its 5x6 cell
+    float lx = 1.;
+    float col = 0.;

+#define PUTN(n) printInt(n,pg,pc,lx,col)

+	// float ncol = floor(pc.x / 3.);
+	// float tlen = floor(16. + 32. * hash(ncol));
+	// pc.y = mod(pc.y + floor(t * (6. + 9. * hash(ncol))), tlen);
+	// if (pc.y > tlen * .6) return 0.;

+	pc.y = mod(pc.y, 2.); // wrap the cell index so the pattern repeats every 2 cells vertically
+	pc.x = mod(pc.x, 3.); // ...and every 3 cells horizontally
+	PUTN(n); // printInt is defined outside this hunk; presumably it accumulates glyph coverage into col - TODO confirm
+	return col;
+}
+
 float printText(in vec2 p) {
 #define PIXSZ 4.
    p = floor(p / PIXSZ);
@ -244,15 +270,15 @@ float rand01() {
 	return uintBitsToFloat(0x3f800000 | (rand() & 0x007fffff)) - 1.;
 }

-float hash(float f) { return fract(sin(f)*53478.4327); }
-
 bool shadowed(vec3 pos, vec3 dir, float dist) {
 	rayQueryEXT shadowRayQuery;
 	rayQueryInitializeEXT(shadowRayQuery, tlas,
 		gl_RayFlagsOpaqueEXT | gl_RayFlagsTerminateOnFirstHitEXT,
 		0xff,
 		pos, 0., dir, dist);
-	while(rayQueryProceedEXT(shadowRayQuery)) {}
+	while(rayQueryProceedEXT(shadowRayQuery)) {
+		rayQueryConfirmIntersectionEXT(shadowRayQuery);
+	}
 	return rayQueryGetIntersectionTypeEXT(shadowRayQuery, true) != gl_RayQueryCommittedIntersectionNoneEXT;
 }

@ -274,8 +300,18 @@ void main() {
 	for (int bounce = 0; bounce < pc.bounces; ++bounce) {
 		rayQueryEXT rayQuery;
 		rayQueryInitializeEXT(rayQuery, tlas, gl_RayFlagsOpaqueEXT, 0xff, O, 0., D, L);
-		while(rayQueryProceedEXT(rayQuery)) {}
+		while(rayQueryProceedEXT(rayQuery)) {
+			rayQueryConfirmIntersectionEXT(rayQuery);
+		}
 		const float l = rayQueryGetIntersectionTEXT(rayQuery, true);
+		if (rayQueryGetIntersectionTypeEXT(rayQuery, true) == gl_RayQueryCommittedIntersectionGeneratedEXT) {
+			C += kc * vec3(0., 1., 0.);
+			break;
+		}
+		if (rayQueryGetIntersectionTypeEXT(rayQuery, true) == gl_RayQueryCommittedIntersectionNoneEXT) {
+			C += kc * vec3(0., 0., 0.);
+			break;
+		}
 		if (rayQueryGetIntersectionTypeEXT(rayQuery, true) != gl_RayQueryCommittedIntersectionTriangleEXT) {
 			C += kc * vec3(1., 0., 1.);
 			break;
@ -284,14 +320,20 @@ void main() {
 		vec3 pos = O+D*l;

 		const int instance_kusochki_offset = rayQueryGetIntersectionInstanceCustomIndexEXT(rayQuery, true);
-		//const int instance_index = rayQueryGetIntersectionInstanceIdEXT(rayQuery, true);
+		const int instance_index = rayQueryGetIntersectionInstanceIdEXT(rayQuery, true);
 		const int geom_index = rayQueryGetIntersectionGeometryIndexEXT(rayQuery, true);
 		const int kusok_index = instance_kusochki_offset + geom_index;
 		const Kusok kusok = kusochki[kusok_index];
 		//const uint leaf = kusochki[kusok_index].leaf-1;

 		//C = fract(pos / LIGHT_GRID_CELL_SIZE);	break;
-		//C = vec3(hash(float(geom_index)), hash(float(geom_index)+15.43), hash(float(geom_index)+34.));
+		//C = vec3(hash(float(geom_index)), hash(float(geom_index)+15.43), hash(float(geom_index)+34.)); break;
+
+		// C = vec3(hash(float(instance_index)), hash(float(instance_index)+15.43), hash(float(instance_index)+34.)) + .1 * fract(pos/LIGHT_GRID_CELL_SIZE);
+		// vec2 pix = vec2(1.,-1.) * vec2(gl_GlobalInvocationID.xy) + vec2(0., imageSize(image).y);
+		// C = mix(C*.5, vec3(0., 1., 0.), printTiledNumber(pix, instance_index));
+		// break;
+
 		//C = vec3(hash(float(leaf)), hash(float(leaf)+15.43), hash(float(leaf)+34.));
 		//C = vec3(hash(float(leaf)), float(kusok.num_surface_lights) / 63., float(kusok.is_emissive));
 		//break;
--- a/ref_vk/vk_rtx.c
+++ b/ref_vk/vk_rtx.c
@ -166,6 +166,7 @@ static struct {
 	vk_image_t frames[2];

 	qboolean reload_pipeline;
+	qboolean freeze_models;

 	// HACK: we don't have a way to properly destroy all models and their Vulkan objects on shutdown.
 	// This makes validation layers unhappy. Remember created objects here and destroy them manually.
@ -229,7 +230,7 @@ static qboolean createOrUpdateAccelerationStructure(VkCommandBuffer cmdbuf, cons
 			"AS max_prims=%u, n_geoms=%u, build size: %d, scratch size: %d\n", max_prims, args->n_geoms, build_size.accelerationStructureSize, build_size.buildScratchSize);
 	}

-	if (MAX_SCRATCH_BUFFER - g_rtx.frame.scratch_offset < scratch_buffer_size) {
+	if (MAX_SCRATCH_BUFFER < g_rtx.frame.scratch_offset + scratch_buffer_size) {
 		gEngine.Con_Printf(S_ERROR "Scratch buffer overflow: left %u bytes, but need %u\n",
 			MAX_SCRATCH_BUFFER - g_rtx.frame.scratch_offset,
 			scratch_buffer_size);
@ -253,11 +254,17 @@ static qboolean createOrUpdateAccelerationStructure(VkCommandBuffer cmdbuf, cons
 		}

 		XVK_CHECK(vkCreateAccelerationStructureKHR(vk_core.device, &asci, NULL, args->accel));
+
+		// gEngine.Con_Reportf("AS=%p, n_geoms=%u, build: %#x %d %#x\n", *args->accel, args->n_geoms, buffer_offset, build_size.accelerationStructureSize, buffer_offset + build_size.accelerationStructureSize);
 	}

 	build_info.dstAccelerationStructure = *args->accel;
 	build_info.scratchData.deviceAddress = g_rtx.scratch_buffer_addr + g_rtx.frame.scratch_offset;
+	uint32_t scratch_offset_initial = g_rtx.frame.scratch_offset;
 	g_rtx.frame.scratch_offset += scratch_buffer_size;
+	g_rtx.frame.scratch_offset = ALIGN_UP(g_rtx.frame.scratch_offset, vk_core.physical_device.properties_accel.minAccelerationStructureScratchOffsetAlignment);
+
+	//gEngine.Con_Reportf("AS=%p, n_geoms=%u, scratch: %#x %d %#x\n", *args->accel, args->n_geoms, scratch_offset_initial, scratch_buffer_size, scratch_offset_initial + scratch_buffer_size);

 	vkCmdBuildAccelerationStructuresKHR(cmdbuf, 1, &build_info, args->build_ranges);
 	return true;
@ -297,6 +304,9 @@ void VK_RayFrameBegin( void )
 {
 	ASSERT(vk_core.rtx);

+	if (g_rtx.freeze_models)
+		return;
+
 	// FIXME we depend on the fact that only a single frame can be in flight
 	// currently framectl waits for the queue to complete before returning
 	// so we can be sure here that previous frame is complete and we're free to
@ -305,17 +315,26 @@ void VK_RayFrameBegin( void )
 		vk_ray_model_t *model = g_rtx.frame.models + i;
 		if (!model->dynamic)
 			continue;
+		if (model->accel == NULL)
+			continue;

 		// TODO cache and reuse
 		for (int j = 0; j < ARRAYSIZE(g_rtx.blases); ++j) {
 			if (g_rtx.blases[j] == model->accel) {
+				//gEngine.Con_Reportf("FrameBegin: frame model %d destroying AS=%p blas_index=%d\n", i, model->accel, j);
 				vkDestroyAccelerationStructureKHR(vk_core.device, g_rtx.blases[j], NULL);
 				g_rtx.blases[j] = VK_NULL_HANDLE;
 				model->accel = VK_NULL_HANDLE;
+				break;
 			}
 		}
 	}

+	if (g_rtx.tlas != VK_NULL_HANDLE) {
+		vkDestroyAccelerationStructureKHR(vk_core.device, g_rtx.tlas, NULL);
+		g_rtx.tlas = VK_NULL_HANDLE;
+	}
+
 	g_rtx.frame.scratch_offset = 0;
 	g_rtx.frame.num_models = 0;
 	g_rtx.frame.num_lighttextures = 0;
@ -448,6 +467,7 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
 		for (int i = 0; i < g_rtx.frame.num_models; ++i) {
 			const vk_ray_model_t* const model = g_rtx.frame.models + i;
 			ASSERT(model->accel != VK_NULL_HANDLE);
+			//gEngine.Con_Reportf("  %d: AS=%p\n", i, model->accel);
 			inst[i] = (VkAccelerationStructureInstanceKHR){
 				.instanceCustomIndex = model->kusochki_offset,
 				.mask = 0xff,
@ -492,7 +512,7 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
 					},
 			},
 		};
-		const uint32_t tl_max_prim_counts[ARRAYSIZE(tl_geom)] = { MAX_ACCELS };
+		const uint32_t tl_max_prim_counts[ARRAYSIZE(tl_geom)] = { g_rtx.frame.num_models };
 		const VkAccelerationStructureBuildRangeInfoKHR tl_build_range = {
 			.primitiveCount = g_rtx.frame.num_models,
 		};
@ -503,7 +523,8 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
 			.build_ranges = tl_build_ranges,
 			.n_geoms = ARRAYSIZE(tl_geom),
 			.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR,
-			.dynamic = true,
+			// Was `.dynamic = true`, but we can't really rebuild the TLAS because instance count changes are not allowed.
+			.dynamic = false,
 			.accel = &g_rtx.tlas,
 		};
 		if (!createOrUpdateAccelerationStructure(cmdbuf, &asrgs)) {
@ -794,6 +815,10 @@ static void reloadPipeline( void ) {
 	g_rtx.reload_pipeline = true;
 }

+static void freezeModels( void ) { // handler for the vk_rtx_freeze console command: flips g_rtx.freeze_models on every call
+	g_rtx.freeze_models = !g_rtx.freeze_models;
+}
+
 qboolean VK_RayInit( void )
 {
 	ASSERT(vk_core.rtx);
@ -896,8 +921,10 @@ qboolean VK_RayInit( void )
 		}
 	}

-	if (vk_core.debug)
+	if (vk_core.debug) {
 		gEngine.Cmd_AddCommand("vk_rtx_reload", reloadPipeline, "Reload RTX shader");
+		gEngine.Cmd_AddCommand("vk_rtx_freeze", freezeModels, "Freeze models, do not update/add/delete models from to-draw list");
+	}

 	return true;
 }
@ -940,14 +967,18 @@ qboolean VK_RayModelInit( vk_ray_model_init_t args ) {

 	ASSERT(vk_core.rtx);

+	if (g_rtx.freeze_models) // frozen: build nothing; a qboolean function must still return a value, so succeed only if the model already has a BLAS
+		return args.model->rtx.blas != VK_NULL_HANDLE;
+
 	if (kusochki_count_offset == AllocFailed) {
 		gEngine.Con_Printf(S_ERROR "Maximum number of kusochki exceeded on model %s\n", args.model->debug_name);
 		return false;
 	}

-	geoms = Mem_Malloc(vk_core.pool, args.model->num_geometries * sizeof(*geoms));
+	// FIXME don't touch allocator each frame many times pls
+	geoms = Mem_Calloc(vk_core.pool, args.model->num_geometries * sizeof(*geoms));
 	geom_max_prim_counts = Mem_Malloc(vk_core.pool, args.model->num_geometries * sizeof(*geom_max_prim_counts));
-	geom_build_ranges = Mem_Malloc(vk_core.pool, args.model->num_geometries * sizeof(*geom_build_ranges));
+	geom_build_ranges = Mem_Calloc(vk_core.pool, args.model->num_geometries * sizeof(*geom_build_ranges));
 	geom_build_ranges_ptr = Mem_Malloc(vk_core.pool, args.model->num_geometries * sizeof(*geom_build_ranges));

 	kusochki = (vk_kusok_data_t*)(g_rtx.kusochki_buffer.mapped) + kusochki_count_offset;
@ -980,6 +1011,10 @@ qboolean VK_RayModelInit( vk_ray_model_init_t args ) {
 					},
 			};

+		// gEngine.Con_Printf("  g%d: v(%#x %d %#x) V%d i(%#x %d %#x) I%d\n", i,
+		// 	vertex_offset*sizeof(vk_vertex_t), mg->vertex_count * sizeof(vk_vertex_t), (vertex_offset + mg->vertex_count) * sizeof(vk_vertex_t), mg->vertex_count,
+		// 	index_offset*sizeof(uint16_t), mg->element_count * sizeof(uint16_t), (index_offset + mg->element_count) * sizeof(uint16_t), mg->element_count);
+
 		geom_build_ranges[i] = (VkAccelerationStructureBuildRangeInfoKHR) {
 			.primitiveCount = prim_count,
 		};
@ -1040,6 +1075,7 @@ qboolean VK_RayModelInit( vk_ray_model_init_t args ) {
 		g_rtx.frame.scratch_offset = 0;
 	}

+	Mem_Free(geom_build_ranges_ptr);
 	Mem_Free(geom_build_ranges);
 	Mem_Free(geom_max_prim_counts);
 	Mem_Free(geoms);
@ -1053,6 +1089,8 @@ qboolean VK_RayModelInit( vk_ray_model_init_t args ) {
 			}
 		}

+		// gEngine.Con_Reportf("Model %s generated AS=%p blas_index=%d\n", args.model->debug_name, args.model->rtx.blas, blas_index);
+
 		if (blas_index == ARRAYSIZE(g_rtx.blases))
 			gEngine.Con_Printf(S_WARN "Too many BLASes created :(\n");
 	}
@ -1063,6 +1101,8 @@ qboolean VK_RayModelInit( vk_ray_model_init_t args ) {
 }

 void VK_RayModelDestroy( struct vk_render_model_s *model ) {
+	ASSERT(!g_rtx.freeze_models);
+
 	ASSERT(vk_core.rtx);
 	if (model->rtx.blas != VK_NULL_HANDLE) {
 		int blas_index;
@ -1075,6 +1115,8 @@ void VK_RayModelDestroy( struct vk_render_model_s *model ) {
 		if (blas_index == ARRAYSIZE(g_rtx.blases))
 			gEngine.Con_Printf(S_WARN "Model BLAS was missing\n");

+		// gEngine.Con_Reportf("Model %s destroying AS=%p blas_index=%d\n", model->debug_name, model->rtx.blas, blas_index);
+
 		vkDestroyAccelerationStructureKHR(vk_core.device, model->rtx.blas, NULL);
 		model->rtx.blas = VK_NULL_HANDLE;
 	}
@ -1085,6 +1127,9 @@ void VK_RayFrameAddModel( const struct vk_render_model_s *model, const matrix3x4

 	ASSERT(g_rtx.frame.num_models <= ARRAYSIZE(g_rtx.frame.models));

+	if (g_rtx.freeze_models)
+		return;
+
 	if (g_rtx.frame.num_models == ARRAYSIZE(g_rtx.frame.models)) {
 		gEngine.Con_Printf(S_ERROR "Ran out of AccelerationStructure slots\n");
 		return;