Merge pull request #739 from w23/stream-E374

Synchronization stuff started on stream E374

Addresses a bunch of synchronization hazards reported by validation. Not all of them: the remaining ones require more involved state tracking, which is a bigger effort.
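
Most of the fixed hazards have the same shape: an image layout transition or a buffer upload whose barrier did not explicitly cover the previous access. The general pattern of the fix looks roughly like the sketch below (illustration only, not code from this PR; the function name, stage-mask parameters, and access masks are made up for the example):

```c
// Illustrative sketch: cover the previous write with srcAccessMask before
// transitioning the image for the next write, instead of leaving it at 0.
#include <stddef.h>
#include <vulkan/vulkan.h>

static void transitionImageForWrite(VkCommandBuffer cmdbuf, VkImage image,
	VkPipelineStageFlags src_stages, VkPipelineStageFlags dst_stages)
{
	const VkImageMemoryBarrier barrier = {
		.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
		.image = image,
		// Make any previous write visible so the layout transition does not
		// race with it -- this is the write-after-write hazard that
		// synchronization validation complains about.
		.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
		.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
		// Old contents are discarded before the next write.
		.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
		.newLayout = VK_IMAGE_LAYOUT_GENERAL,
		.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
		.subresourceRange = (VkImageSubresourceRange) {
			.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
			.baseMipLevel = 0,
			.levelCount = 1,
			.baseArrayLayer = 0,
			.layerCount = 1,
		},
	};
	vkCmdPipelineBarrier(cmdbuf, src_stages, dst_stages, 0,
		0, NULL, 0, NULL, 1, &barrier);
}
```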

Also contains a few minor tweaks:
- stop calling SDL_GL functions when the context is not GL, silencing errors
- increase the acceleration structures buffer size -- with a recent Mesa update BLASes seem to have gotten somewhat bigger
Ivan Avdeev 2024-05-07 07:54:00 -07:00 committed by GitHub
commit f5eb2dada8
12 changed files with 182 additions and 29 deletions

View File

@ -487,6 +487,9 @@ GL_UpdateSwapInterval
void GL_UpdateSwapInterval( void )
{
#if SDL_VERSION_ATLEAST( 2, 0, 0 )
if (glw_state.context_type != REF_GL)
return;
// disable VSync while level is loading
if( cls.state < ca_active )
{

View File

@ -1,3 +1,9 @@
# cvars
## `rt_force_disable`
On GPUs that support ray tracing, forcefully disables it as if it weren't supported at all, i.e. no RT extensions or modules are initialized. Occasionally useful for testing.
Note: this cvar is read early in `R_VkInit()`, which runs before `autoexec.cfg`, `config.cfg`, etc. are read, so setting it there will not work.
`video.cfg` and `vk.cfg` are read before Vk initialization, so this cvar should go into one of those.
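
For example, a minimal `vk.cfg` could look like this (illustrative; the value just follows the usual boolean cvar convention):

```
// vk.cfg -- read before Vulkan init, early enough for rt_force_disable to take effect
rt_force_disable "1"
```
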
# Frame structure wrt calls from the engine
- (eng) SCR_UpdateScreen()
- (eng) V_PreRender()

View File

@ -1,6 +1,33 @@
## Next
- [ ] Render graph
- [ ] performance profiling and comparison
## 2024-04-12 E374
- [x] ~~`-vknort` arg to force-disable RT at init time~~ -- reverted on 2024-04-29
## 2024-03-21 E372: agonizing over agenda
### Player-visible essentials and blockers. Big projects.
- [ ] Light clusters, sampling, and performance -- 90fps HDR on a Steam Deck
- [ ] Transparency, refractions: glass, water, etc
- [ ] Moar and moar correct bounces
- [ ] Denoiser
- [ ] Decals
- [ ] Volumetrics and fog
- [ ] HDR and tonemapping
### Invisible blockers -- foundation/systems stuff
- [ ] Render graph and resource tracking -- track textures, buffers+regions ownership and usage, automatic barriers, etc.
- [ ] Modules and dependencies tracking
- [ ] Integrate rendertests into CI
### Small things
- [ ] Material patching refactoring: do not load any patched textures before they are referenced by the engine itself.
Only load patched textures for the textures that are in fact used by something.
### Nice-to-have
- [ ] Split Vulkan+RT from xash specifics, start preparing it to be a standalone thing.
- [ ] clang-format for it
# Previously
## 2024-02-05 E373
- [x] Skybox for traditional renderer

View File

@ -1,8 +1,7 @@
#include "ray_resources.h"
#include "vk_core.h"
#include "vk_image.h"
#include "shaders/ray_interop.h" // FIXME temp for type validation
#include "vk_common.h"
#include <stdlib.h>
@ -23,23 +22,28 @@ void R_VkResourcesPrepareDescriptorsValues(VkCommandBuffer cmdbuf, vk_resources_
const qboolean write = i >= args.write_begin;
if (res->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
ASSERT(image_barriers_count < COUNTOF(image_barriers));
if (write) {
// No reads are happening
ASSERT(res->read.pipelines == 0);
//ASSERT(res->read.pipelines == 0);
res->write = (ray_resource_state_t) {
src_stage_mask |= res->read.pipelines | res->write.pipelines;
const ray_resource_state_t new_state = {
.pipelines = args.pipeline,
.access_mask = VK_ACCESS_SHADER_WRITE_BIT,
.image_layout = VK_IMAGE_LAYOUT_GENERAL,
.pipelines = args.pipeline,
};
image_barriers[image_barriers_count++] = (VkImageMemoryBarrier) {
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.image = src_value->image_object->image,
.srcAccessMask = 0,
.dstAccessMask = res->write.access_mask,
// FIXME MEMORY_WRITE is needed to silence write-after-write layout-transition validation hazard
.srcAccessMask = res->read.access_mask | res->write.access_mask | VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = new_state.access_mask,
.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = res->write.image_layout,
.newLayout = new_state.image_layout,
.subresourceRange = (VkImageSubresourceRange) {
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.baseMipLevel = 0,
@ -49,18 +53,23 @@ void R_VkResourcesPrepareDescriptorsValues(VkCommandBuffer cmdbuf, vk_resources_
},
};
// Mark that read would need a transition
res->read = (ray_resource_state_t){0};
res->write = new_state;
} else {
// Write happened
ASSERT(res->write.pipelines != 0);
// No barrier was issued
if (!(res->read.pipelines & args.pipeline)) {
res->read.access_mask = VK_ACCESS_SHADER_READ_BIT;
res->read.pipelines |= args.pipeline;
res->read.image_layout = VK_IMAGE_LAYOUT_GENERAL;
src_stage_mask |= res->write.pipelines;
res->read = (ray_resource_state_t) {
.pipelines = res->read.pipelines | args.pipeline,
.access_mask = VK_ACCESS_SHADER_READ_BIT,
.image_layout = VK_IMAGE_LAYOUT_GENERAL,
};
image_barriers[image_barriers_count++] = (VkImageMemoryBarrier) {
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.image = src_value->image_object->image,

View File

@ -36,7 +36,7 @@ extern ref_globals_t *gpGlobals;
struct { \
TYPE items[MAX_SIZE]; \
int count; \
} NAME
} NAME = {0}
#define BOUNDED_ARRAY_APPEND(var, item) \
do { \

View File

@ -41,6 +41,8 @@
#include <string.h>
#include <errno.h>
#define LOG_MODULE core
#define NULLINST_FUNCS(X) \
X(vkEnumerateInstanceVersion) \
X(vkCreateInstance) \
@ -189,7 +191,7 @@ static qboolean createInstance( void )
.pEngineName = "xash3d-fwgs",
};
BOUNDED_ARRAY(validation_features, VkValidationFeatureEnableEXT, 8) = {0};
BOUNDED_ARRAY(validation_features, VkValidationFeatureEnableEXT, 8);
BOUNDED_ARRAY_APPEND(validation_features, VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT);
BOUNDED_ARRAY_APPEND(validation_features, VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT);
@ -518,8 +520,11 @@ static qboolean createDevice( void ) {
is_target_device_found = true;
}
if (candidate_device->ray_tracing && !CVAR_TO_BOOL(rt_force_disable)) {
vk_core.rtx = true;
if (candidate_device->ray_tracing) {
const qboolean force_disabled = CVAR_TO_BOOL(rt_force_disable);
if (force_disabled)
WARN("GPU[%d] supports ray tracing, but rt_force_disable is set, force-disabling ray tracing support", i);
vk_core.rtx = !force_disabled;
}
VkPhysicalDeviceAccelerationStructureFeaturesKHR accel_feature = {

View File

@ -3,7 +3,6 @@
#include "vk_overlay.h"
#include "vk_scene.h"
#include "vk_render.h"
#include "vk_rtx.h"
#include "vk_cvar.h"
#include "vk_devmem.h"
#include "vk_swapchain.h"
@ -142,12 +141,50 @@ static VkRenderPass createRenderPass( VkFormat depth_format, qboolean ray_tracin
.pDepthStencilAttachment = &depth_attachment,
};
BOUNDED_ARRAY(dependencies, VkSubpassDependency, 2);
if (vk_core.rtx) {
const VkSubpassDependency color = {
.srcSubpass = VK_SUBPASS_EXTERNAL,
.dstSubpass = 0,
.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT,
};
BOUNDED_ARRAY_APPEND(dependencies, color);
} else {
const VkSubpassDependency color = {
.srcSubpass = VK_SUBPASS_EXTERNAL,
.dstSubpass = 0,
.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
.srcAccessMask = 0,
.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT,
};
BOUNDED_ARRAY_APPEND(dependencies, color);
}
const VkSubpassDependency depth = {
.srcSubpass = VK_SUBPASS_EXTERNAL,
.dstSubpass = 0,
.srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
.dstStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
.dependencyFlags = 0,
};
BOUNDED_ARRAY_APPEND(dependencies, depth);
const VkRenderPassCreateInfo rpci = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
.attachmentCount = ARRAYSIZE(attachments),
.pAttachments = attachments,
.subpassCount = 1,
.pSubpasses = &subdesc,
.dependencyCount = dependencies.count,
.pDependencies = dependencies.items,
};
XVK_CHECK(vkCreateRenderPass(vk_core.device, &rpci, NULL, &render_pass));
@ -340,8 +377,8 @@ static void submit( vk_combuf_t* combuf, qboolean wait, qboolean draw ) {
};
// TODO for RT renderer we only touch framebuffer at the very end of rendering/cmdbuf.
// Can we postpone waiting for the framebuffer semaphore until we actually need it?
BOUNDED_ARRAY(waitophores, VkSemaphore, 2) = {0};
BOUNDED_ARRAY(signalphores, VkSemaphore, 2) = {0};
BOUNDED_ARRAY(waitophores, VkSemaphore, 2);
BOUNDED_ARRAY(signalphores, VkSemaphore, 2);
if (draw) {
BOUNDED_ARRAY_APPEND(waitophores, frame->sem_framebuffer_ready);

View File

@ -3,6 +3,7 @@
#include "vk_common.h"
#define LIST_LOG_MODULES(X) \
X(core) \
X(misc) \
X(tex) \
X(brush) \

View File

@ -41,7 +41,8 @@ struct vk_combuf_s;
qboolean createOrUpdateAccelerationStructure(struct vk_combuf_s *combuf, const as_build_args_t *args);
#define MAX_SCRATCH_BUFFER (32*1024*1024)
#define MAX_ACCELS_BUFFER (128*1024*1024)
// FIXME compute this by lazily allocating #define MAX_ACCELS_BUFFER (128*1024*1024)
#define MAX_ACCELS_BUFFER (256*1024*1024)
typedef struct {
// Geometry metadata. Lifetime is similar to geometry lifetime itself.

View File

@ -641,24 +641,46 @@ static uint32_t writeDlightsToUBO( void )
return ubo_lights_offset;
}
/*
static void debugBarrier( VkCommandBuffer cmdbuf, VkBuffer buf) {
const VkBufferMemoryBarrier bmb[] = { {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
.buffer = buf,
.offset = 0,
.size = VK_WHOLE_SIZE,
} };
vkCmdPipelineBarrier(cmdbuf,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, 0, NULL, ARRAYSIZE(bmb), bmb, 0, NULL);
}
*/
void VK_Render_FIXME_Barrier( VkCommandBuffer cmdbuf ) {
const VkBuffer geom_buffer = R_GeometryBuffer_Get();
// FIXME
//debugBarrier(cmdbuf, geom_buffer);
// FIXME: this should be automatic and dynamically depend on actual usage, resolving this with render graph
{
const VkBufferMemoryBarrier bmb[] = { {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
//.dstAccessMask = VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR, // FIXME
.dstAccessMask = VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT , // FIXME
.dstAccessMask
= VK_ACCESS_INDEX_READ_BIT
| VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT
| (vk_core.rtx ? ( VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR | VK_ACCESS_SHADER_READ_BIT) : 0),
.buffer = geom_buffer,
.offset = 0, // FIXME
.size = VK_WHOLE_SIZE, // FIXME
.offset = 0,
.size = VK_WHOLE_SIZE,
} };
vkCmdPipelineBarrier(cmdbuf,
VK_PIPELINE_STAGE_TRANSFER_BIT,
//VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
//VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | (vk_core.rtx
? VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR
| VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR
| VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
: 0),
0, 0, NULL, ARRAYSIZE(bmb), bmb, 0, NULL);
}
}

View File

@ -378,7 +378,8 @@ static void performTracing( vk_combuf_t *combuf, const perform_tracing_args_t* a
if (!res->name[0] || !res->image.image || res->source_index_plus_1 > 0)
continue;
res->resource.read = res->resource.write = (ray_resource_state_t){0};
//res->resource.read = res->resource.write = (ray_resource_state_t){0};
res->resource.write = (ray_resource_state_t){0};
}
DEBUG_BEGIN(cmdbuf, "yay tracing");

View File

@ -5,10 +5,12 @@
#include "profiler.h"
#include "r_speeds.h"
#include "vk_combuf.h"
#include "vk_logs.h"
#include <memory.h>
#define MODULE_NAME "staging"
#define LOG_MODULE staging
#define DEFAULT_STAGING_SIZE (128*1024*1024)
#define MAX_STAGING_ALLOCS (2048)
@ -198,6 +200,45 @@ static void commitBuffers(vk_combuf_t *combuf) {
// - upload once per buffer
// - join adjacent regions
BOUNDED_ARRAY(barriers, VkBufferMemoryBarrier, 4);
for (int i = 0; i < g_staging.buffers.count; i++) {
const VkBuffer dst_buf = g_staging.buffers.dest[i];
for (int j = 0;; ++j) {
if (j == COUNTOF(barriers.items)) {
ERR("Ran out of buffer barrier slots, oh no");
break;
}
// Insert last
if (j == barriers.count) {
barriers.count++;
barriers.items[j] = (VkBufferMemoryBarrier){
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.buffer = dst_buf,
.offset = 0,
.size = VK_WHOLE_SIZE,
};
break;
}
// Already inserted
if (barriers.items[j].buffer == dst_buf)
break;
}
}
if (barriers.count) {
vkCmdPipelineBarrier(cmdbuf,
// FIXME this should be more concrete. Will need to pass buffer "state" around.
// For now it works, but makes validation unhappy.
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0, 0, NULL, barriers.count, barriers.items, 0, NULL);
}
VkBuffer prev_buffer = VK_NULL_HANDLE;
int first_copy = 0;
for (int i = 0; i < g_staging.buffers.count; i++) {