Merge remote-tracking branch 'upstream/master' into int-flags

Levi
2021-01-10 22:09:56 -07:00
912 changed files with 91129 additions and 25508 deletions

src/video_core/CMakeLists.txt View File

@@ -5,8 +5,27 @@ add_library(video_core STATIC
buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
cdma_pusher.cpp
cdma_pusher.h
command_classes/codecs/codec.cpp
command_classes/codecs/codec.h
command_classes/codecs/h264.cpp
command_classes/codecs/h264.h
command_classes/codecs/vp9.cpp
command_classes/codecs/vp9.h
command_classes/codecs/vp9_types.h
command_classes/host1x.cpp
command_classes/host1x.h
command_classes/nvdec.cpp
command_classes/nvdec.h
command_classes/nvdec_common.h
command_classes/sync_manager.cpp
command_classes/sync_manager.h
command_classes/vic.cpp
command_classes/vic.h
compatible_formats.cpp
compatible_formats.h
delayed_destruction_ring.h
dirty_flags.cpp
dirty_flags.h
dma_pusher.cpp
@@ -29,6 +48,7 @@ add_library(video_core STATIC
engines/shader_bytecode.h
engines/shader_header.h
engines/shader_type.h
framebuffer_config.h
macro/macro.cpp
macro/macro.h
macro/macro_hle.cpp
@@ -40,10 +60,6 @@ add_library(video_core STATIC
fence_manager.h
gpu.cpp
gpu.h
gpu_asynch.cpp
gpu_asynch.h
gpu_synch.cpp
gpu_synch.h
gpu_thread.cpp
gpu_thread.h
guest_driver.cpp
@@ -66,14 +82,10 @@ add_library(video_core STATIC
renderer_opengl/gl_device.h
renderer_opengl/gl_fence_manager.cpp
renderer_opengl/gl_fence_manager.h
renderer_opengl/gl_framebuffer_cache.cpp
renderer_opengl/gl_framebuffer_cache.h
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer.h
renderer_opengl/gl_resource_manager.cpp
renderer_opengl/gl_resource_manager.h
renderer_opengl/gl_sampler_cache.cpp
renderer_opengl/gl_sampler_cache.h
renderer_opengl/gl_shader_cache.cpp
renderer_opengl/gl_shader_cache.h
renderer_opengl/gl_shader_decompiler.cpp
@@ -95,10 +107,62 @@ add_library(video_core STATIC
renderer_opengl/maxwell_to_gl.h
renderer_opengl/renderer_opengl.cpp
renderer_opengl/renderer_opengl.h
renderer_opengl/utils.cpp
renderer_opengl/utils.h
sampler_cache.cpp
sampler_cache.h
renderer_opengl/util_shaders.cpp
renderer_opengl/util_shaders.h
renderer_vulkan/blit_image.cpp
renderer_vulkan/blit_image.h
renderer_vulkan/fixed_pipeline_state.cpp
renderer_vulkan/fixed_pipeline_state.h
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp
renderer_vulkan/vk_blit_screen.h
renderer_vulkan/vk_buffer_cache.cpp
renderer_vulkan/vk_buffer_cache.h
renderer_vulkan/vk_command_pool.cpp
renderer_vulkan/vk_command_pool.h
renderer_vulkan/vk_compute_pass.cpp
renderer_vulkan/vk_compute_pass.h
renderer_vulkan/vk_compute_pipeline.cpp
renderer_vulkan/vk_compute_pipeline.h
renderer_vulkan/vk_descriptor_pool.cpp
renderer_vulkan/vk_descriptor_pool.h
renderer_vulkan/vk_fence_manager.cpp
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_memory_manager.cpp
renderer_vulkan/vk_memory_manager.h
renderer_vulkan/vk_pipeline_cache.cpp
renderer_vulkan/vk_pipeline_cache.h
renderer_vulkan/vk_query_cache.cpp
renderer_vulkan/vk_query_cache.h
renderer_vulkan/vk_rasterizer.cpp
renderer_vulkan/vk_rasterizer.h
renderer_vulkan/vk_resource_pool.cpp
renderer_vulkan/vk_resource_pool.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_shader_decompiler.cpp
renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_shader_util.cpp
renderer_vulkan/vk_shader_util.h
renderer_vulkan/vk_staging_buffer_pool.cpp
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
renderer_vulkan/vk_texture_cache.h
renderer_vulkan/vk_update_descriptor.cpp
renderer_vulkan/vk_update_descriptor.h
shader_cache.h
shader_notify.cpp
shader_notify.h
@@ -155,109 +219,71 @@ add_library(video_core STATIC
shader/transform_feedback.h
surface.cpp
surface.h
texture_cache/accelerated_swizzle.cpp
texture_cache/accelerated_swizzle.h
texture_cache/decode_bc4.cpp
texture_cache/decode_bc4.h
texture_cache/descriptor_table.h
texture_cache/formatter.cpp
texture_cache/formatter.h
texture_cache/format_lookup_table.cpp
texture_cache/format_lookup_table.h
texture_cache/surface_base.cpp
texture_cache/surface_base.h
texture_cache/surface_params.cpp
texture_cache/surface_params.h
texture_cache/surface_view.cpp
texture_cache/surface_view.h
texture_cache/image_base.cpp
texture_cache/image_base.h
texture_cache/image_info.cpp
texture_cache/image_info.h
texture_cache/image_view_base.cpp
texture_cache/image_view_base.h
texture_cache/image_view_info.cpp
texture_cache/image_view_info.h
texture_cache/render_targets.h
texture_cache/samples_helper.h
texture_cache/slot_vector.h
texture_cache/texture_cache.h
texture_cache/types.h
texture_cache/util.cpp
texture_cache/util.h
textures/astc.cpp
textures/astc.h
textures/convert.cpp
textures/convert.h
textures/decoders.cpp
textures/decoders.h
textures/texture.cpp
textures/texture.h
video_core.cpp
video_core.h
vulkan_common/vulkan_debug_callback.cpp
vulkan_common/vulkan_debug_callback.h
vulkan_common/vulkan_device.cpp
vulkan_common/vulkan_device.h
vulkan_common/vulkan_instance.cpp
vulkan_common/vulkan_instance.h
vulkan_common/vulkan_library.cpp
vulkan_common/vulkan_library.h
vulkan_common/vulkan_surface.cpp
vulkan_common/vulkan_surface.h
vulkan_common/vulkan_wrapper.cpp
vulkan_common/vulkan_wrapper.h
vulkan_common/nsight_aftermath_tracker.cpp
vulkan_common/nsight_aftermath_tracker.h
)
if (ENABLE_VULKAN)
target_sources(video_core PRIVATE
renderer_vulkan/fixed_pipeline_state.cpp
renderer_vulkan/fixed_pipeline_state.h
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/nsight_aftermath_tracker.cpp
renderer_vulkan/nsight_aftermath_tracker.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp
renderer_vulkan/vk_blit_screen.h
renderer_vulkan/vk_buffer_cache.cpp
renderer_vulkan/vk_buffer_cache.h
renderer_vulkan/vk_command_pool.cpp
renderer_vulkan/vk_command_pool.h
renderer_vulkan/vk_compute_pass.cpp
renderer_vulkan/vk_compute_pass.h
renderer_vulkan/vk_compute_pipeline.cpp
renderer_vulkan/vk_compute_pipeline.h
renderer_vulkan/vk_descriptor_pool.cpp
renderer_vulkan/vk_descriptor_pool.h
renderer_vulkan/vk_device.cpp
renderer_vulkan/vk_device.h
renderer_vulkan/vk_fence_manager.cpp
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_image.cpp
renderer_vulkan/vk_image.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_memory_manager.cpp
renderer_vulkan/vk_memory_manager.h
renderer_vulkan/vk_pipeline_cache.cpp
renderer_vulkan/vk_pipeline_cache.h
renderer_vulkan/vk_query_cache.cpp
renderer_vulkan/vk_query_cache.h
renderer_vulkan/vk_rasterizer.cpp
renderer_vulkan/vk_rasterizer.h
renderer_vulkan/vk_renderpass_cache.cpp
renderer_vulkan/vk_renderpass_cache.h
renderer_vulkan/vk_resource_pool.cpp
renderer_vulkan/vk_resource_pool.h
renderer_vulkan/vk_sampler_cache.cpp
renderer_vulkan/vk_sampler_cache.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_shader_decompiler.cpp
renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_shader_util.cpp
renderer_vulkan/vk_shader_util.h
renderer_vulkan/vk_staging_buffer_pool.cpp
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
renderer_vulkan/vk_texture_cache.h
renderer_vulkan/vk_update_descriptor.cpp
renderer_vulkan/vk_update_descriptor.h
renderer_vulkan/wrapper.cpp
renderer_vulkan/wrapper.h
)
endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
target_link_libraries(video_core PRIVATE glad xbyak)
if (MSVC)
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
else()
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
endif()
add_dependencies(video_core host_shaders)
target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
if (ENABLE_VULKAN)
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_compile_definitions(video_core PRIVATE HAS_VULKAN)
target_link_libraries(video_core PRIVATE sirit)
endif()
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_link_libraries(video_core PRIVATE sirit)
if (ENABLE_NSIGHT_AFTERMATH)
if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK})
@@ -271,7 +297,27 @@ if (ENABLE_NSIGHT_AFTERMATH)
endif()
if (MSVC)
target_compile_options(video_core PRIVATE /we4267)
target_compile_options(video_core PRIVATE
/we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data
/we4456 # Declaration of 'identifier' hides previous local declaration
/we4457 # Declaration of 'identifier' hides function parameter
/we4458 # Declaration of 'identifier' hides class member
/we4459 # Declaration of 'identifier' hides global declaration
/we4715 # 'function' : not all control paths return a value
)
else()
target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion -Werror=switch)
target_compile_options(video_core PRIVATE
-Werror=conversion
-Wno-error=sign-conversion
-Werror=pessimizing-move
-Werror=redundant-move
-Werror=shadow
-Werror=switch
-Werror=type-limits
-Werror=unused-variable
$<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable>
)
endif()
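The warning promotions above turn a family of silent bugs into build failures. As a minimal hypothetical sketch (not code from this commit), /we4456 on MSVC and -Werror=shadow on GCC/Clang both reject the hidden accumulator below, and -Werror=unused-variable catches it from a second angle:
#include <vector>
int Sum(const std::vector<int>& values) {
    int total = 0;
    for (const int value : values) {
        int total = value; // a new 'total' shadows the accumulator: shadow + unused-variable
    }
    return total; // would always be 0 if this were allowed to compile
}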

src/video_core/buffer_cache/buffer_block.h View File

@@ -4,34 +4,29 @@
#pragma once
#include <unordered_set>
#include <utility>
#include "common/alignment.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace VideoCommon {
class BufferBlock {
public:
bool Overlaps(VAddr start, VAddr end) const {
[[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
return (cpu_addr < end) && (cpu_addr_end > start);
}
bool IsInside(VAddr other_start, VAddr other_end) const {
[[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
return cpu_addr <= other_start && other_end <= cpu_addr_end;
}
std::size_t Offset(VAddr in_addr) const {
[[nodiscard]] std::size_t Offset(VAddr in_addr) const {
return static_cast<std::size_t>(in_addr - cpu_addr);
}
VAddr CpuAddr() const {
[[nodiscard]] VAddr CpuAddr() const {
return cpu_addr;
}
VAddr CpuAddrEnd() const {
[[nodiscard]] VAddr CpuAddrEnd() const {
return cpu_addr_end;
}
@@ -40,11 +35,11 @@ public:
cpu_addr_end = new_addr + size;
}
std::size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return size;
}
u64 Epoch() const {
[[nodiscard]] u64 Epoch() const {
return epoch;
}
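For reference, Overlaps above is the standard half-open interval intersection test: [a, b) and [c, d) intersect exactly when a < d and c < b, so ranges that merely touch at an endpoint do not count. A self-contained sketch of the same predicate (illustrative only):
#include <cstdint>
constexpr bool Overlaps(std::uint64_t a_begin, std::uint64_t a_end,
                        std::uint64_t b_begin, std::uint64_t b_end) {
    return a_begin < b_end && b_begin < a_end;
}
static_assert(Overlaps(0, 10, 9, 20));   // one byte shared
static_assert(!Overlaps(0, 10, 10, 20)); // touching endpoints do not overlap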

src/video_core/buffer_cache/buffer_cache.h View File

@@ -118,20 +118,17 @@ public:
/// Prepares the buffer cache for data uploading
/// @param max_size Maximum number of bytes that will be uploaded
bool Map(std::size_t max_size) {
void Map(std::size_t max_size) {
std::lock_guard lock{mutex};
bool invalidated;
std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
buffer_offset = buffer_offset_base;
return invalidated;
}
/// Finishes the upload stream
void Unmap() {
std::lock_guard lock{mutex};
stream_buffer->Unmap(buffer_offset - buffer_offset_base);
stream_buffer.Unmap(buffer_offset - buffer_offset_base);
}
/// Function called at the end of each frame, intended for deferred operations
@@ -261,9 +258,9 @@ public:
protected:
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
std::unique_ptr<StreamBuffer> stream_buffer_)
StreamBuffer& stream_buffer_)
: rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {}
stream_buffer{stream_buffer_} {}
~BufferCache() = default;
@@ -441,7 +438,7 @@ private:
buffer_ptr += size;
buffer_offset += size;
return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
}
void AlignBuffer(std::size_t alignment) {
@@ -545,7 +542,7 @@ private:
bool IsRegionWritten(VAddr start, VAddr end) const {
const u64 page_end = end >> WRITE_PAGE_BIT;
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
if (written_pages.count(page_start) > 0) {
if (written_pages.contains(page_start)) {
return true;
}
}
@@ -567,9 +564,7 @@ private:
VideoCore::RasterizerInterface& rasterizer;
Tegra::MemoryManager& gpu_memory;
Core::Memory::Memory& cpu_memory;
std::unique_ptr<StreamBuffer> stream_buffer;
BufferType stream_buffer_handle;
StreamBuffer& stream_buffer;
u8* buffer_ptr = nullptr;
u64 buffer_offset = 0;
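The written-page tracking queried by IsRegionWritten above reduces to a page-granular set membership test, with contains() as the C++20 spelling of count(page) > 0. A standalone sketch, with the WRITE_PAGE_BIT value assumed purely for illustration:
#include <cstdint>
#include <unordered_set>
constexpr std::uint64_t WRITE_PAGE_BIT = 11; // 2 KiB pages, assumed for this sketch
bool IsRegionWritten(const std::unordered_set<std::uint64_t>& written_pages,
                     std::uint64_t start, std::uint64_t end) {
    const std::uint64_t page_end = end >> WRITE_PAGE_BIT;
    for (std::uint64_t page = start >> WRITE_PAGE_BIT; page <= page_end; ++page) {
        if (written_pages.contains(page)) { // C++20, equivalent to count(page) > 0
            return true;
        }
    }
    return false;
}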

src/video_core/buffer_cache/map_interval.h View File

@@ -84,9 +84,10 @@ private:
void FillFreeList(Chunk& chunk);
std::vector<MapInterval*> free_list;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
Chunk first_chunk;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
};
} // namespace VideoCommon
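The member reorder above matters because non-static data members are initialized in declaration order, so an initializer should only refer to members declared before it. In this diff only an address is taken, which is benign, but the reorder keeps the initializers dependency-ordered. A hypothetical illustration of the underlying rule:
struct Bad {
    int copy = value; // runs first and reads 'value' before its initializer below
    int value = 42;
};
struct Good {
    int value = 42;
    int copy = value; // declaration order now matches the data dependency
};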

src/video_core/cdma_pusher.cpp View File

@@ -0,0 +1,170 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "command_classes/host1x.h"
#include "command_classes/nvdec.h"
#include "command_classes/vic.h"
#include "common/bit_util.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
CDmaPusher::CDmaPusher(GPU& gpu_)
: gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)),
vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
host1x_processor(std::make_unique<Host1x>(gpu)),
sync_manager(std::make_unique<SyncptIncrManager>(gpu)) {}
CDmaPusher::~CDmaPusher() = default;
void CDmaPusher::Push(ChCommandHeaderList&& entries) {
cdma_queue.push(std::move(entries));
}
void CDmaPusher::DispatchCalls() {
while (!cdma_queue.empty()) {
Step();
}
}
void CDmaPusher::Step() {
const auto entries{cdma_queue.front()};
cdma_queue.pop();
std::vector<u32> values(entries.size());
std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
for (const u32 value : values) {
if (mask != 0) {
const u32 lbs = Common::CountTrailingZeroes32(mask);
mask &= ~(1U << lbs);
ExecuteCommand(static_cast<u32>(offset + lbs), value);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(static_cast<u32>(offset), value);
if (incrementing) {
++offset;
}
continue;
}
const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value & 0x3f;
offset = (value >> 16) & 0xfff;
current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value & 0xffff;
offset = (value >> 16) & 0xfff;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value & 0xffff;
offset = (value >> 16) & 0xfff;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value & 0xfff;
offset = (value >> 16) & 0xfff;
ExecuteCommand(static_cast<u32>(offset), data);
break;
}
default:
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
break;
}
}
}
void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
switch (current_class) {
case ChClassId::NvDec:
ThiStateWrite(nvdec_thi_state, state_offset, {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
static_cast<u32>(nvdec_thi_state.method_0));
nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0),
{data});
break;
default:
break;
}
break;
case ChClassId::GraphicsVic:
ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
static_cast<u32>(vic_thi_state.method_0), data);
vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data});
break;
default:
break;
}
break;
case ChClassId::Host1x:
// This device is mainly for syncpoint synchronization
LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data});
break;
default:
UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
break;
}
}
void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset,
const std::vector<u32>& arguments) {
u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size());
}
} // namespace Tegra
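Step above consumes raw 32-bit submission words: the mode sits in the top nibble, the register offset in bits 16 through 27, and the count, mask, or immediate payload in the low 16 bits. A standalone decode of one made-up example word:
#include <cstdint>
#include <cstdio>
int main() {
    const std::uint32_t word = 0x20100040; // assumed example value
    const unsigned mode = (word >> 28) & 0xf;     // 2 -> NonIncrementing
    const unsigned offset = (word >> 16) & 0xfff; // 0x010
    const unsigned count = word & 0xffff;         // 64 register writes follow
    std::printf("mode=%u offset=0x%03x count=%u\n", mode, offset, count);
}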

src/video_core/cdma_pusher.h View File

@@ -0,0 +1,136 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include <queue>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Nvdec;
class Vic;
class Host1x;
enum class ChSubmissionMode : u32 {
SetClass = 0,
Incrementing = 1,
NonIncrementing = 2,
Mask = 3,
Immediate = 4,
Restart = 5,
Gather = 6,
};
enum class ChClassId : u32 {
NoClass = 0x0,
Host1x = 0x1,
VideoEncodeMpeg = 0x20,
VideoEncodeNvEnc = 0x21,
VideoStreamingVi = 0x30,
VideoStreamingIsp = 0x32,
VideoStreamingIspB = 0x34,
VideoStreamingViI2c = 0x36,
GraphicsVic = 0x5d,
Graphics3D = 0x60,
GraphicsGpu = 0x61,
Tsec = 0xe0,
TsecB = 0xe1,
NvJpg = 0xc0,
NvDec = 0xf0
};
enum class ChMethod : u32 {
Empty = 0,
SetMethod = 0x10,
SetData = 0x11,
};
union ChCommandHeader {
u32 raw;
BitField<0, 16, u32> value;
BitField<16, 12, ChMethod> method_offset;
BitField<28, 4, ChSubmissionMode> submission_mode;
};
static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
struct ChCommand {
ChClassId class_id{};
int method_offset{};
std::vector<u32> arguments;
};
using ChCommandHeaderList = std::vector<ChCommandHeader>;
using ChCommandList = std::vector<ChCommand>;
struct ThiRegisters {
u32_le increment_syncpt{};
INSERT_PADDING_WORDS(1);
u32_le increment_syncpt_error{};
u32_le ctx_switch_incremement_syncpt{};
INSERT_PADDING_WORDS(4);
u32_le ctx_switch{};
INSERT_PADDING_WORDS(1);
u32_le ctx_syncpt_eof{};
INSERT_PADDING_WORDS(5);
u32_le method_0{};
u32_le method_1{};
INSERT_PADDING_WORDS(12);
u32_le int_status{};
u32_le int_mask{};
};
enum class ThiMethod : u32 {
IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
};
class CDmaPusher {
public:
explicit CDmaPusher(GPU& gpu_);
~CDmaPusher();
/// Push NVDEC command buffer entries into queue
void Push(ChCommandHeaderList&& entries);
/// Process queued command buffer entries
void DispatchCalls();
/// Process one queue element
void Step();
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 state_offset, u32 data);
private:
/// Writes the argument values to the ThiRegisters member at the specified offset
void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments);
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
std::unique_ptr<Tegra::Vic> vic_processor;
std::unique_ptr<Tegra::Host1x> host1x_processor;
std::unique_ptr<SyncptIncrManager> sync_manager;
ChClassId current_class{};
ThiRegisters vic_thi_state{};
ThiRegisters nvdec_thi_state{};
s32 count{};
s32 offset{};
s32 mask{};
bool incrementing{};
// Queue of command lists to be processed
std::queue<ChCommandHeaderList> cdma_queue;
};
} // namespace Tegra
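ThiMethod above derives each method ID from the register layout itself via offsetof, so the enum cannot drift from ThiRegisters. Counting words through the struct (named fields plus the padding runs) puts method_0 at word 16; a compile-time cross-check in the same vein could look like this sketch:
static_assert(static_cast<u32>(ThiMethod::IncSyncpt) == 0);
static_assert(static_cast<u32>(ThiMethod::SetMethod0) == 16);
static_assert(static_cast<u32>(ThiMethod::SetMethod1) == 17);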

src/video_core/command_classes/codecs/codec.cpp View File

@@ -0,0 +1,129 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <fstream>
#include <vector>
#include "common/assert.h"
#include "video_core/command_classes/codecs/codec.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
extern "C" {
#include <libavutil/opt.h>
}
namespace Tegra {
void AVFrameDeleter(AVFrame* ptr) {
av_frame_unref(ptr);
av_free(ptr);
}
Codec::Codec(GPU& gpu_)
: gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
Codec::~Codec() {
if (!initialized) {
return;
}
// Free libav memory
AVFrame* av_frame{nullptr};
avcodec_send_packet(av_codec_ctx, nullptr);
av_frame = av_frame_alloc();
avcodec_receive_frame(av_codec_ctx, av_frame);
avcodec_flush_buffers(av_codec_ctx);
av_frame_unref(av_frame);
av_free(av_frame);
avcodec_close(av_codec_ctx);
}
void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec);
current_codec = codec;
}
void Codec::StateWrite(u32 offset, u64 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
std::memcpy(state_offset, &arguments, sizeof(u64));
}
void Codec::Decode() {
bool is_first_frame = false;
if (!initialized) {
if (current_codec == NvdecCommon::VideoCodec::H264) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
} else {
LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec);
return;
}
av_codec_ctx = avcodec_alloc_context3(av_codec);
av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
// TODO(ameerj): libavcodec gpu hw acceleration
const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
if (av_error < 0) {
LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
avcodec_close(av_codec_ctx);
return;
}
initialized = true;
is_first_frame = true;
}
bool vp9_hidden_frame = false;
AVPacket packet{};
av_init_packet(&packet);
std::vector<u8> frame_data;
if (current_codec == NvdecCommon::VideoCodec::H264) {
frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
frame_data = vp9_decoder->ComposeFrameHeader(state);
vp9_hidden_frame = vp9_decoder->WasFrameHidden();
}
packet.data = frame_data.data();
packet.size = static_cast<int>(frame_data.size());
avcodec_send_packet(av_codec_ctx, &packet);
if (!vp9_hidden_frame) {
// Only receive/store visible frames
AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
avcodec_receive_frame(av_codec_ctx, frame.get());
av_frames.push(std::move(frame));
// Limit queue to 10 frames. Workaround for ZLA decode and queue spam
if (av_frames.size() > 10) {
av_frames.pop();
}
}
}
AVFramePtr Codec::GetCurrentFrame() {
// Sometimes VIC will request more frames than have been decoded.
// In this case, return a nullptr and don't overwrite previous frame data
if (av_frames.empty()) {
return AVFramePtr{nullptr, AVFrameDeleter};
}
AVFramePtr frame = std::move(av_frames.front());
av_frames.pop();
return frame;
}
NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
return current_codec;
}
} // namespace Tegra
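Codec::Decode above follows FFmpeg's send/receive decoding model: push one compressed packet in, then drain whatever frames the decoder produces. A minimal standalone shape of that exchange (error handling elided; av_init_packet matches the FFmpeg 4.x era usage in this file, and all names here are illustrative):
#include <cstdint>
extern "C" {
#include <libavcodec/avcodec.h>
}
bool DecodeOne(AVCodecContext* ctx, std::uint8_t* data, int size, AVFrame* out) {
    AVPacket packet{};
    av_init_packet(&packet);
    packet.data = data;
    packet.size = size;
    if (avcodec_send_packet(ctx, &packet) < 0) {
        return false; // decoder rejected the packet
    }
    return avcodec_receive_frame(ctx, out) == 0; // 0 means one frame was produced
}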

src/video_core/command_classes/codecs/codec.h View File

@@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <queue>
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
extern "C" {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <libavcodec/avcodec.h>
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
namespace Tegra {
class GPU;
struct VicRegisters;
void AVFrameDeleter(AVFrame* ptr);
using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
namespace Decoder {
class H264;
class VP9;
} // namespace Decoder
class Codec {
public:
explicit Codec(GPU& gpu);
~Codec();
/// Sets NVDEC video stream codec
void SetTargetCodec(NvdecCommon::VideoCodec codec);
/// Populate NvdecRegisters state with argument value at the provided offset
void StateWrite(u32 offset, u64 arguments);
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
/// Returns next decoded frame
[[nodiscard]] AVFramePtr GetCurrentFrame();
/// Returns the value of current_codec
[[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;
private:
bool initialized{};
NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
AVCodec* av_codec{nullptr};
AVCodecContext* av_codec_ctx{nullptr};
GPU& gpu;
std::unique_ptr<Decoder::H264> h264_decoder;
std::unique_ptr<Decoder::VP9> vp9_decoder;
NvdecCommon::NvdecRegisters state{};
std::queue<AVFramePtr> av_frames{};
};
} // namespace Tegra

src/video_core/command_classes/codecs/h264.cpp View File

@@ -0,0 +1,293 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <array>
#include "common/bit_util.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// ZigZag LUTs from libavcodec.
constexpr std::array<u8, 64> zig_zag_direct{
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48,
41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
};
constexpr std::array<u8, 16> zig_zag_scan{
0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
};
} // Anonymous namespace
H264::H264(GPU& gpu_) : gpu(gpu_) {}
H264::~H264() = default;
const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state,
bool is_first_frame) {
H264DecoderContext context{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff);
if (!is_first_frame && frame_number != 0) {
frame.resize(context.frame_data_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
} else {
/// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(7, 5);
writer.WriteU(100, 8);
writer.WriteU(0, 8);
writer.WriteU(31, 8);
writer.WriteUe(0);
const auto chroma_format_idc =
static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3);
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
}
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
writer.WriteBit(false); // Scaling matrix present flag
const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3);
writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf));
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt);
} else if (order_cnt_type == 1) {
writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units /
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
writer.WriteUe(16);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
if (!context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0);
}
writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
writer.End();
// H264 PPS
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(8, 5);
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
writer.WriteBit(false);
writer.WriteUe(0);
writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0);
writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2);
s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f);
pic_init_qp = (pic_init_qp << 26) >> 26;
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f);
chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset);
writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0);
writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0);
writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0);
writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true);
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
const auto matrix_x4 =
std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end());
writer.WriteScalingList(matrix_x4, index * 16, 16);
}
if (context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(),
context.scaling_matrix_8.end());
writer.WriteScalingList(matrix_x8, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f);
chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
frame.resize(encoded_header.size() + context.frame_data_size);
std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
frame.data() + encoded_header.size(),
context.frame_data_size);
}
return frame;
}
H264BitWriter::H264BitWriter() = default;
H264BitWriter::~H264BitWriter() = default;
void H264BitWriter::WriteU(s32 value, s32 value_sz) {
WriteBits(value, value_sz);
}
void H264BitWriter::WriteSe(s32 value) {
WriteExpGolombCodedInt(value);
}
void H264BitWriter::WriteUe(u32 value) {
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::End() {
WriteBit(true);
Flush();
}
void H264BitWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
std::vector<u8> scan(count);
if (count == 16) {
std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
} else {
std::memcpy(scan.data(), zig_zag_direct.data(), scan.size());
}
u8 last_scale = 8;
for (s32 index = 0; index < count; index++) {
const u8 value = list[start + scan[index]];
const s32 delta_scale = static_cast<s32>(value - last_scale);
WriteSe(delta_scale);
last_scale = value;
}
}
std::vector<u8>& H264BitWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& H264BitWriter::GetByteArray() const {
return byte_array;
}
void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
s32 value_pos = 0;
s32 remaining = bit_count;
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free_bits = GetFreeBufferBits();
if (copy_size > free_bits) {
copy_size = free_bits;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
const s32 sign = value <= 0 ? 0 : 1;
if (value < 0) {
value = -value;
}
value = (value << 1) - sign;
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1));
WriteBits(1, size);
value -= (1U << (size - 1)) - 1;
WriteBits(static_cast<s32>(value), size - 1);
}
s32 H264BitWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void H264BitWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
} // namespace Tegra::Decoder
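WriteExpGolombCodedUInt above emits the ue(v) code from clause 9.1 of the H.264 spec: for a value v, write bitlength(v+1) - 1 zeros, then v+1 itself, most significant bit first. An equivalent standalone derivation for cross-checking (C++20 <bit>; valid for v below 2^32 - 1):
#include <bit>
#include <cstdint>
#include <string>
std::string ExpGolomb(std::uint32_t v) {
    const int size = 32 - std::countl_zero(v + 1); // bit length of v+1
    std::string bits(size - 1, '0');               // size-1 leading zeros
    for (int i = size - 1; i >= 0; --i) {
        bits += ((v + 1) >> i) & 1 ? '1' : '0';    // v+1 in size bits, MSB first
    }
    return bits; // v=0 -> "1", v=4 -> "00101"
}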

src/video_core/command_classes/codecs/h264.h View File

@@ -0,0 +1,118 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
namespace Decoder {
class H264BitWriter {
public:
H264BitWriter();
~H264BitWriter();
/// The following Write methods are based on clause 9.1 in the H.264 specification.
/// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
void WriteU(s32 value, s32 value_sz);
void WriteSe(s32 value);
void WriteUe(u32 value);
/// Finalize the bitstream
void End();
/// Appends a bit to the stream, with a value equal to the state parameter
void WriteBit(bool state);
/// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
/// Writes the scaling matrices of the stream
void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
/// Return the bitstream as a vector.
[[nodiscard]] std::vector<u8>& GetByteArray();
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
void WriteBits(s32 value, s32 bit_count);
void WriteExpGolombCodedInt(s32 value);
void WriteExpGolombCodedUInt(u32 value);
[[nodiscard]] s32 GetFreeBufferBits();
void Flush();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class H264 {
public:
explicit H264(GPU& gpu);
~H264();
/// Compose the H264 header of the frame for FFmpeg decoding
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false);
private:
struct H264ParameterSet {
u32 log2_max_pic_order_cnt{};
u32 delta_pic_order_always_zero_flag{};
u32 frame_mbs_only_flag{};
u32 pic_width_in_mbs{};
u32 pic_height_in_map_units{};
INSERT_PADDING_WORDS(1);
u32 entropy_coding_mode_flag{};
u32 bottom_field_pic_order_flag{};
u32 num_refidx_l0_default_active{};
u32 num_refidx_l1_default_active{};
u32 deblocking_filter_control_flag{};
u32 redundant_pic_count_flag{};
u32 transform_8x8_mode_flag{};
INSERT_PADDING_WORDS(9);
u64 flags{};
u32 frame_number{};
u32 frame_number2{};
};
static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
struct H264DecoderContext {
INSERT_PADDING_BYTES(0x48);
u32 frame_data_size{};
INSERT_PADDING_BYTES(0xc);
H264ParameterSet h264_parameter_set{};
INSERT_PADDING_BYTES(0x100);
std::array<u8, 0x60> scaling_matrix_4;
std::array<u8, 0x80> scaling_matrix_8;
};
static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
std::vector<u8> frame;
GPU& gpu;
};
} // namespace Decoder
} // namespace Tegra

src/video_core/command_classes/codecs/vp9.cpp View File

@@ -0,0 +1,989 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring> // for std::memcpy
#include <numeric>
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// Default compressed header probabilities once frame context resets
constexpr Vp9EntropyProbs default_probs{
.y_mode_prob{
65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78,
173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29,
},
.partition_prob{
199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0,
174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0,
177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0,
222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0,
},
.coef_probs{
195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32,
40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47,
69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31,
102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29,
156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31,
191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33,
41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26,
59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34,
114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34,
149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42,
214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28,
89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33,
108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38,
124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37,
141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47,
229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57,
83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39,
99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47,
127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52,
157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61,
125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29,
31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45,
23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24,
29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23,
57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29,
202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57,
33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21,
30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20,
32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22,
57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32,
212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25,
71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42,
64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30,
65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30,
65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34,
225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84,
75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28,
73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29,
62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32,
61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41,
7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18,
43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27,
23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23,
18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23,
15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20,
19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52,
35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28,
28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20,
20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20,
13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24,
211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28,
71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29,
57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27,
47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26,
26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28,
233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97,
85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30,
65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29,
40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26,
16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31,
17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16,
50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20,
40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17,
30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18,
10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17,
36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61,
85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30,
72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27,
55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22,
15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23,
181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23,
62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19,
55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16,
49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17,
13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23,
197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137,
111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91,
104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102,
84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79,
43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6,
},
.switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144},
.inter_mode_prob{
2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94,
66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0,
},
.intra_inter_prob{9, 102, 187, 225},
.comp_inter_prob{9, 102, 187, 225, 0},
.single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247},
.comp_ref_prob{50, 126, 123, 221, 226},
.tx_32x32_prob{3, 136, 37, 5, 52, 13},
.tx_16x16_prob{20, 152, 15, 101},
.tx_8x8_prob{100, 66},
.skip_probs{192, 128, 64},
.joints{32, 64, 96},
.sign{128, 128},
.classes{
224, 144, 192, 168, 192, 176, 192, 198, 198, 245,
216, 128, 176, 160, 176, 176, 192, 198, 198, 208,
},
.class_0{216, 208},
.prob_bits{
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
},
.class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64},
.fr{64, 96, 64, 64, 96, 64},
.class_0_hp{160, 160},
.high_precision{128, 128},
};
constexpr std::array<s32, 256> norm_lut{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
constexpr std::array<s32, 254> map_lut{
20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124,
125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177,
178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194,
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 19,
};
// 6.2.14 Tile size calculation
[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 min_log2 = 0;
while ((64 << min_log2) < sb64_cols) {
min_log2++;
}
return min_log2;
}
[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 max_log2 = 1;
while ((sb64_cols >> max_log2) >= 4) {
max_log2++;
}
return max_log2 - 1;
}
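// Worked example for the two helpers above (illustrative, 1920-wide frame
// assumed): sb64_cols = (1920 + 63) / 64 = 30. For the minimum, 64 << 0 is
// already >= 30, so no tile-column bits are required. For the maximum,
// 30 >> 1 = 15 and 30 >> 2 = 7 are both >= 4 while 30 >> 3 = 3 is not, so the
// loop stops at 3 and the result is 3 - 1 = 2, i.e. at most four tile columns.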
// Recenters probability. Based on section 6.3.6 of VP9 Specification
[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) {
if (new_prob > old_prob * 2) {
return new_prob;
}
if (new_prob >= old_prob) {
return (new_prob - old_prob) * 2;
}
return (old_prob - new_prob) * 2 - 1;
}
// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) {
new_prob--;
old_prob--;
std::size_t index{};
if (old_prob * 2 <= 0xff) {
index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1));
} else {
index = static_cast<std::size_t>(
std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1));
}
return map_lut[index];
}
} // Anonymous namespace
VP9::VP9(GPU& gpu_) : gpu{gpu_} {}
VP9::~VP9() = default;
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
WriteProbabilityDelta(writer, new_prob, old_prob);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); ++offset) {
WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) {
WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]);
WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]);
WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]);
}
}
void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const int delta = RemapProbability(new_prob, old_prob);
EncodeTermSubExp(writer, delta);
}
void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) {
if (WriteLessThan(writer, value, 16)) {
writer.Write(value, 4);
} else if (WriteLessThan(writer, value, 32)) {
writer.Write(value - 16, 4);
} else if (WriteLessThan(writer, value, 64)) {
writer.Write(value - 32, 5);
} else {
value -= 64;
constexpr s32 size = 8;
const s32 mask = (1 << size) - 191;
const s32 delta = value - mask;
if (delta < 0) {
writer.Write(value, size - 1);
} else {
writer.Write(delta / 2 + mask, size - 1);
writer.Write(delta & 1, 1);
}
}
}
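// Illustrative traces through EncodeTermSubExp (each WriteLessThan emits one
// selector bit first): value 10 falls in the first bucket and is written in 4
// bits. Value 100 takes three selectors, becomes 36 after the -= 64, and with
// mask = (1 << 8) - 191 = 65 the delta 36 - 65 is negative, so 36 goes out in
// 7 bits. Value 200 becomes 136, delta = 71, so 71 / 2 + 65 = 100 is written
// in 7 bits followed by the low delta bit (1).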
bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
const bool is_lt = value < test;
writer.Write(!is_lt);
return is_lt;
}
void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob) {
constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3;
const auto needs_update = [&](u32 base_index) {
return !std::equal(new_prob.begin() + base_index,
new_prob.begin() + base_index + block_bytes,
old_prob.begin() + base_index);
};
for (u32 block_index = 0; block_index < 4; block_index++) {
const u32 base_index = block_index * block_bytes;
const bool update = needs_update(base_index);
writer.Write(update);
if (update) {
u32 index = base_index;
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 6; k++) {
for (s32 l = 0; l < 6; l++) {
if (k != 0 || l < 3) {
WriteProbabilityUpdate(writer, new_prob[index + 0],
old_prob[index + 0]);
WriteProbabilityUpdate(writer, new_prob[index + 1],
old_prob[index + 1]);
WriteProbabilityUpdate(writer, new_prob[index + 2],
old_prob[index + 2]);
}
index += 3;
}
}
}
}
}
if (block_index == static_cast<u32>(tx_mode)) {
break;
}
}
}
void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
writer.Write(new_prob >> 1, 7);
}
}
Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
PictureInfo picture_info{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
Vp9PictureInfo vp9_info = picture_info.Convert();
InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current. It may be worthwhile to track the updates done here
// to avoid buffering frame data needed for reference frame updating in the header composition.
std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64));
return vp9_info;
}
void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
EntropyProbs entropy{};
gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
entropy.Convert(dst);
}
Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) {
Vp9FrameContainer current_frame{};
{
gpu.SyncGuestHost();
current_frame.info = GetVp9PictureInfo(state);
current_frame.bit_stream.resize(current_frame.info.bitstream_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
current_frame.info.bitstream_size);
}
// Buffer two frames, saving the last show frame info
if (!next_next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_next_frame.info;
current_frame.bit_stream = std::move(next_next_frame.bit_stream);
next_next_frame = std::move(temp);
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp2{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_frame.info;
current_frame.bit_stream = std::move(next_frame.bit_stream);
next_frame = std::move(temp2);
} else {
next_frame.info = current_frame.info;
next_frame.bit_stream = std::move(current_frame.bit_stream);
}
} else {
next_next_frame.info = current_frame.info;
next_next_frame.bit_stream = std::move(current_frame.bit_stream);
}
return current_frame;
}
std::vector<u8> VP9::ComposeCompressedHeader() {
VpxRangeEncoder writer{};
const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
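// write_tx_mode() in the spec: modes 0-2 are coded directly in two bits, while
// Allow32X32 (3) and TxModeSelect (4) share the two-bit prefix 3 followed by a
// select bit.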
if (!current_frame_info.lossless) {
if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
writer.Write(3, 2);
writer.Write(current_frame_info.transform_mode == 4);
} else {
writer.Write(current_frame_info.transform_mode, 2);
}
}
if (current_frame_info.transform_mode == 4) {
// tx_mode_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob,
prev_frame_probs.tx_8x8_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob,
prev_frame_probs.tx_16x16_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob,
prev_frame_probs.tx_32x32_prob);
if (update_probs) {
prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob;
prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob;
prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob;
}
}
// read_coef_probs() in the spec
WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode,
current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs);
// read_skip_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs,
prev_frame_probs.skip_probs);
if (update_probs) {
prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs;
prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs;
}
if (!current_frame_info.intra_only) {
// read_inter_probs() in the spec
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob,
prev_frame_probs.inter_mode_prob);
if (current_frame_info.interp_filter == 4) {
// read_interp_filter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob,
prev_frame_probs.switchable_interp_prob);
if (update_probs) {
prev_frame_probs.switchable_interp_prob =
current_frame_info.entropy.switchable_interp_prob;
}
}
// read_is_inter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob,
prev_frame_probs.intra_inter_prob);
// frame_reference_mode() in the spec
if ((current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[2] & 1) ||
(current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[3] & 1)) {
if (current_frame_info.reference_mode >= 1) {
writer.Write(1, 1);
writer.Write(current_frame_info.reference_mode == 2);
} else {
writer.Write(0, 1);
}
}
// frame_reference_mode_probs() in the spec
if (current_frame_info.reference_mode == 2) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob,
prev_frame_probs.comp_inter_prob);
if (update_probs) {
prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob;
}
}
if (current_frame_info.reference_mode != 1) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob,
prev_frame_probs.single_ref_prob);
if (update_probs) {
prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob;
}
}
if (current_frame_info.reference_mode != 0) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob,
prev_frame_probs.comp_ref_prob);
if (update_probs) {
prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob;
}
}
// read_y_mode_probs
for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size();
++index) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index],
prev_frame_probs.y_mode_prob[index]);
}
// read_partition_probs
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob,
prev_frame_probs.partition_prob);
// mv_probs
for (s32 i = 0; i < 3; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i],
prev_frame_probs.joints[i]);
}
if (update_probs) {
prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob;
prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob;
prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob;
prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob;
prev_frame_probs.joints = current_frame_info.entropy.joints;
}
for (s32 i = 0; i < 2; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i],
prev_frame_probs.sign[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index],
prev_frame_probs.classes[index]);
}
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i],
prev_frame_probs.class_0[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
prev_frame_probs.prob_bits[index]);
}
}
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 3; k++) {
const int index = i * 2 * 3 + j * 3 + k;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
prev_frame_probs.class_0_fr[index]);
}
}
for (s32 j = 0; j < 3; j++) {
const int index = i * 3 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
prev_frame_probs.fr[index]);
}
}
if (current_frame_info.allow_high_precision_mv) {
for (s32 index = 0; index < 2; index++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
prev_frame_probs.class_0_hp[index]);
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
prev_frame_probs.high_precision[index]);
}
}
// save previous probs
if (update_probs) {
prev_frame_probs.sign = current_frame_info.entropy.sign;
prev_frame_probs.classes = current_frame_info.entropy.classes;
prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
prev_frame_probs.fr = current_frame_info.entropy.fr;
prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
}
}
writer.End();
return writer.GetBuffer();
}
VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
VpxBitStreamWriter uncomp_writer{};
uncomp_writer.WriteU(2, 2); // Frame marker.
uncomp_writer.WriteU(0, 2); // Profile.
uncomp_writer.WriteBit(false); // Show existing frame.
uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // Frame type (0 = key frame).
uncomp_writer.WriteBit(current_frame_info.show_frame); // Show frame.
uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // Error resilient mode.
if (current_frame_info.is_key_frame) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(0, 3); // Color space.
uncomp_writer.WriteU(0, 1); // Color range.
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
// Reset context
prev_frame_probs = default_probs;
swap_next_golden = false;
loop_filter_ref_deltas.fill(0);
loop_filter_mode_deltas.fill(0);
// Allow frame offsets to stabilize before checking for golden frames.
grace_period = 4;
// On key frames, all frame slots are set to the current frame,
// so the value of the selected slot doesn't really matter.
frame_ctxs.fill({current_frame_number, false, default_probs});
// intra only, meaning the frame can be recreated with no other references
current_frame_info.intra_only = true;
} else {
if (!current_frame_info.show_frame) {
uncomp_writer.WriteBit(current_frame_info.intra_only);
if (!current_frame_info.last_frame_was_key) {
swap_next_golden = !swap_next_golden;
}
} else {
current_frame_info.intra_only = false;
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteU(0, 2); // Reset frame context.
}
// Last, Golden, Altref frames
std::array<s32, 3> ref_frame_index{0, 1, 2};
// When the next frame is hidden, the altref and golden references are swapped.
if (swap_next_golden) {
ref_frame_index = std::array<s32, 3>{0, 2, 1};
}
// update Last Frame
u64 refresh_frame_flags = 1;
// The golden frame may refresh; detected when the next frame's reference offsets change.
bool golden_refresh = false;
if (grace_period <= 0) {
for (s32 index = 1; index < 3; ++index) {
if (current_frame_info.frame_offsets[index] !=
next_frame.info.frame_offsets[index]) {
current_frame_info.refresh_frame[index] = true;
golden_refresh = true;
grace_period = 3;
}
}
}
if (current_frame_info.show_frame &&
(!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
// Update golden frame
refresh_frame_flags = swap_next_golden ? 2 : 4;
}
if (!current_frame_info.show_frame) {
// Update altref
refresh_frame_flags = swap_next_golden ? 2 : 4;
} else if (golden_refresh) {
refresh_frame_flags = 3;
}
if (current_frame_info.intra_only) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
} else {
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
for (s32 index = 1; index < 4; index++) {
uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
}
uncomp_writer.WriteBit(true); // Frame size with refs.
uncomp_writer.WriteBit(false); // Render and frame size different.
uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
if (current_frame_info.interp_filter != 4) {
uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
}
}
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteBit(true); // Refresh frame context (TODO: where does this info come from?).
uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
}
int frame_ctx_idx = 0;
if (!current_frame_info.show_frame) {
frame_ctx_idx = 1;
}
uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
prev_frame_probs =
frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
uncomp_writer.WriteU(current_frame_info.first_level, 6);
uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled);
if (current_frame_info.mode_ref_delta_enabled) {
// check if ref deltas are different, update accordingly
std::array<bool, 4> update_loop_filter_ref_deltas;
std::array<bool, 2> update_loop_filter_mode_deltas;
bool loop_filter_delta_update = false;
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
const s8 old_deltas = loop_filter_ref_deltas[index];
const s8 new_deltas = current_frame_info.ref_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_ref_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
const s8 old_deltas = loop_filter_mode_deltas[index];
const s8 new_deltas = current_frame_info.mode_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_mode_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
uncomp_writer.WriteBit(loop_filter_delta_update);
if (loop_filter_delta_update) {
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]);
if (update_loop_filter_ref_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6);
}
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]);
if (update_loop_filter_mode_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6);
}
}
// save new deltas
loop_filter_ref_deltas = current_frame_info.ref_deltas;
loop_filter_mode_deltas = current_frame_info.mode_deltas;
}
}
uncomp_writer.WriteU(current_frame_info.base_q_index, 8);
uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2;
const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1;
// If it's less than the maximum, we need to add an extra 0 on the bitstream
// to indicate that it should stop reading.
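// Example: min = 0 and log2_tile_cols = 2 gives diff = 2 and mask = 0b11, so
// below the maximum this writes 0b110 (two increment bits and the stop bit).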
if (current_frame_info.log2_tile_cols < max_tile_cols_log2) {
uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1);
} else {
uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff);
}
const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0;
uncomp_writer.WriteBit(tile_rows_log2_is_nonzero);
if (tile_rows_log2_is_nonzero) {
uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1);
}
return uncomp_writer;
}
const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) {
std::vector<u8> bitstream;
{
Vp9FrameContainer curr_frame = GetCurrentFrame(state);
current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream);
}
// The uncompressed header routine sets PrevProb parameters needed for the compressed header
auto uncomp_writer = ComposeUncompressedHeader();
std::vector<u8> compressed_header = ComposeCompressedHeader();
uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16);
uncomp_writer.Flush();
std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
// Write headers and frame to buffer
frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(),
compressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(),
bitstream.data(), bitstream.size());
// keep track of frame number
current_frame_number++;
grace_period--;
// don't display hidden frames
hidden = !current_frame_info.show_frame;
return frame;
}
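// The encoder is primed with a single zero bit; this appears to correspond to the
// marker bit the VP9 bool decoder expects to read as 0 when it initializes.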
VpxRangeEncoder::VpxRangeEncoder() {
Write(false);
}
VpxRangeEncoder::~VpxRangeEncoder() = default;
void VpxRangeEncoder::Write(s32 value, s32 value_size) {
for (s32 bit = value_size - 1; bit >= 0; bit--) {
Write(((value >> bit) & 1) != 0);
}
}
void VpxRangeEncoder::Write(bool bit) {
Write(bit, half_probability);
}
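// Standard VP8/VP9 boolean range coder: the range is split in proportion to
// probability (out of 256), low/range are renormalized via norm_lut, and the
// carry loop below rewrites any already-emitted 0xff bytes when the low value
// overflows.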
void VpxRangeEncoder::Write(bool bit, s32 probability) {
u32 local_range = range;
const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8);
local_range = split;
if (bit) {
low_value += split;
local_range = range - split;
}
s32 shift = norm_lut[local_range];
local_range <<= shift;
count += shift;
if (count >= 0) {
const s32 offset = shift - count;
if (((low_value << (offset - 1)) >> 31) != 0) {
const s32 current_pos = static_cast<s32>(base_stream.GetPosition());
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
while (PeekByte() == 0xff) {
base_stream.WriteByte(0);
base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos);
}
base_stream.WriteByte(static_cast<u8>((PeekByte() + 1)));
base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin);
}
base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset))));
low_value <<= offset;
shift = count;
low_value &= 0xffffff;
count -= 8;
}
low_value <<= shift;
range = local_range;
}
void VpxRangeEncoder::End() {
for (std::size_t index = 0; index < 32; ++index) {
Write(false);
}
}
u8 VpxRangeEncoder::PeekByte() {
const u8 value = base_stream.ReadByte();
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
return value;
}
VpxBitStreamWriter::VpxBitStreamWriter() = default;
VpxBitStreamWriter::~VpxBitStreamWriter() = default;
void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
WriteBits(value, value_size);
}
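// Signed values are coded sign-magnitude, matching su(n) in the VP9 spec: the
// magnitude in value_size bits followed by a single sign bit.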
void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
const bool sign = value < 0;
if (sign) {
value = -value;
}
WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1);
}
void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
const bool delta_coded = value != 0;
WriteBit(delta_coded);
if (delta_coded) {
WriteBits(value, 4);
}
}
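// Packs bits MSB-first into the 8-bit accumulator, flushing a byte to byte_array
// whenever the accumulator fills.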
void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
s32 value_pos = 0;
s32 remaining = static_cast<s32>(bit_count);
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free = GetFreeBufferBits();
if (copy_size > free) {
copy_size = free;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void VpxBitStreamWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
s32 VpxBitStreamWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void VpxBitStreamWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
return byte_array;
}
} // namespace Tegra::Decoder

View File

@ -0,0 +1,197 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <vector>
#include "common/common_types.h"
#include "common/stream.h"
#include "video_core/command_classes/codecs/vp9_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
enum class FrameType { KeyFrame = 0, InterFrame = 1 };
namespace Decoder {
/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
class VpxRangeEncoder {
public:
VpxRangeEncoder();
~VpxRangeEncoder();
VpxRangeEncoder(const VpxRangeEncoder&) = delete;
VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete;
VpxRangeEncoder(VpxRangeEncoder&&) = default;
VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default;
/// Writes the rightmost value_size bits from value into the stream
void Write(s32 value, s32 value_size);
/// Writes a single bit with half probability
void Write(bool bit);
/// Writes a bit to the base_stream encoded with probability
void Write(bool bit, s32 probability);
/// Signal the end of the bitstream
void End();
[[nodiscard]] std::vector<u8>& GetBuffer() {
return base_stream.GetBuffer();
}
[[nodiscard]] const std::vector<u8>& GetBuffer() const {
return base_stream.GetBuffer();
}
private:
u8 PeekByte();
Common::Stream base_stream{};
u32 low_value{};
u32 range{0xff};
s32 count{-24};
s32 half_probability{128};
};
class VpxBitStreamWriter {
public:
VpxBitStreamWriter();
~VpxBitStreamWriter();
VpxBitStreamWriter(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter(VpxBitStreamWriter&&) = default;
VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default;
/// Write an unsigned integer value
void WriteU(u32 value, u32 value_size);
/// Write a signed integer value
void WriteS(s32 value, u32 value_size);
/// Based on 6.2.10 of VP9 Spec, writes a delta coded value
void WriteDeltaQ(u32 value);
/// Write a single bit.
void WriteBit(bool state);
/// Pushes current buffer into byte_array, resets buffer
void Flush();
/// Returns byte_array
[[nodiscard]] std::vector<u8>& GetByteArray();
/// Returns const byte_array
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
/// Write bit_count bits from value into buffer
void WriteBits(u32 value, u32 bit_count);
/// Gets next available position in buffer, invokes Flush() if buffer is full
s32 GetFreeBufferBits();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class VP9 {
public:
explicit VP9(GPU& gpu_);
~VP9();
VP9(const VP9&) = delete;
VP9& operator=(const VP9&) = delete;
VP9(VP9&&) = default;
VP9& operator=(VP9&&) = delete;
/// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
/// documentation
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state);
/// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const {
return hidden;
}
private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Generates compressed header probability updates in the bitstream writer
/// If probs are not equal, WriteProbabilityDelta is invoked
void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Generates compressed header probability deltas in the bitstream writer
void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Inverse of 6.3.4 Decode term subexp
void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
/// Writes if the value is less than the test value
bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
/// Writes probability updates for the Coef probabilities
void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob);
/// Write probabilities for 4-byte aligned structures
template <typename T, std::size_t N>
void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Returns VP9 information from NVDEC provided offset and size
[[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
[[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
/// Use NVDEC provided information to compose the headers for the current frame
[[nodiscard]] std::vector<u8> ComposeCompressedHeader();
[[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
GPU& gpu;
std::vector<u8> frame;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
bool hidden = false;
s64 current_frame_number = -2; // since we buffer 2 frames
s32 grace_period = 6; // frame offsets need to stabilize
std::array<FrameContexts, 4> frame_ctxs{};
Vp9FrameContainer next_frame{};
Vp9FrameContainer next_next_frame{};
bool swap_next_golden{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
s32 diff_update_probability = 252;
s32 frame_sync_code = 0x498342;
};
} // namespace Decoder
} // namespace Tegra

View File

@ -0,0 +1,302 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstring>
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
namespace Decoder {
struct Vp9FrameDimensions {
s16 width{};
s16 height{};
s16 luma_pitch{};
s16 chroma_pitch{};
};
static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9FrameDimensions is an invalid size");
enum FrameFlags : u32 {
IsKeyFrame = 1 << 0,
LastFrameIsKeyFrame = 1 << 1,
FrameSizeChanged = 1 << 2,
ErrorResilientMode = 1 << 3,
LastShowFrame = 1 << 4,
IntraOnly = 1 << 5,
};
enum class TxSize {
Tx4x4 = 0, // 4x4 transform
Tx8x8 = 1, // 8x8 transform
Tx16x16 = 2, // 16x16 transform
Tx32x32 = 3, // 32x32 transform
TxSizes = 4
};
enum class TxMode {
Only4X4 = 0, // Only 4x4 transform used
Allow8X8 = 1, // Allow block transform size up to 8x8
Allow16X16 = 2, // Allow block transform size up to 16x16
Allow32X32 = 3, // Allow block transform size up to 32x32
TxModeSelect = 4, // Transform specified for each block
TxModes = 5
};
struct Segmentation {
u8 enabled{};
u8 update_map{};
u8 temporal_update{};
u8 abs_delta{};
std::array<u32, 8> feature_mask{};
std::array<std::array<s16, 4>, 8> feature_data{};
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
struct LoopFilter {
u8 mode_ref_delta_enabled{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
};
static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
struct Vp9EntropyProbs {
std::array<u8, 36> y_mode_prob{};
std::array<u8, 64> partition_prob{};
std::array<u8, 1728> coef_probs{};
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 3> skip_probs{};
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 20> classes{};
std::array<u8, 2> class_0{};
std::array<u8, 20> prob_bits{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
};
static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size");
struct Vp9PictureInfo {
bool is_key_frame{};
bool intra_only{};
bool last_frame_was_key{};
bool frame_size_changed{};
bool error_resilient_mode{};
bool last_frame_shown{};
bool show_frame{};
std::array<s8, 4> ref_frame_sign_bias{};
s32 base_q_index{};
s32 y_dc_delta_q{};
s32 uv_dc_delta_q{};
s32 uv_ac_delta_q{};
bool lossless{};
s32 transform_mode{};
bool allow_high_precision_mv{};
s32 interp_filter{};
s32 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
s32 log2_tile_cols{};
s32 log2_tile_rows{};
bool segment_enabled{};
bool segment_map_update{};
bool segment_map_temporal_update{};
s32 segment_abs_delta{};
std::array<u32, 8> segment_feature_enable{};
std::array<std::array<s16, 4>, 8> segment_feature_data{};
bool mode_ref_delta_enabled{};
bool use_prev_in_find_mv_refs{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
Vp9EntropyProbs entropy{};
Vp9FrameDimensions frame_size{};
u8 first_level{};
u8 sharpness_level{};
u32 bitstream_size{};
std::array<u64, 4> frame_offsets{};
std::array<bool, 4> refresh_frame{};
};
struct Vp9FrameContainer {
Vp9PictureInfo info{};
std::vector<u8> bit_stream;
};
struct PictureInfo {
INSERT_PADDING_WORDS(12);
u32 bitstream_size{};
INSERT_PADDING_WORDS(5);
Vp9FrameDimensions last_frame_size{};
Vp9FrameDimensions golden_frame_size{};
Vp9FrameDimensions alt_frame_size{};
Vp9FrameDimensions current_frame_size{};
u32 vp9_flags{};
std::array<s8, 4> ref_frame_sign_bias{};
u8 first_level{};
u8 sharpness_level{};
u8 base_q_index{};
u8 y_dc_delta_q{};
u8 uv_ac_delta_q{};
u8 uv_dc_delta_q{};
u8 lossless{};
u8 tx_mode{};
u8 allow_high_precision_mv{};
u8 interp_filter{};
u8 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
u8 log2_tile_cols{};
u8 log2_tile_rows{};
Segmentation segmentation{};
LoopFilter loop_filter{};
INSERT_PADDING_BYTES(5);
u32 surface_params{};
INSERT_PADDING_WORDS(3);
[[nodiscard]] Vp9PictureInfo Convert() const {
return {
.is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
.intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
.last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
.frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
.error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
.last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
.ref_frame_sign_bias = ref_frame_sign_bias,
.base_q_index = base_q_index,
.y_dc_delta_q = y_dc_delta_q,
.uv_dc_delta_q = uv_dc_delta_q,
.uv_ac_delta_q = uv_ac_delta_q,
.lossless = lossless != 0,
.transform_mode = tx_mode,
.allow_high_precision_mv = allow_high_precision_mv != 0,
.interp_filter = interp_filter,
.reference_mode = reference_mode,
.comp_fixed_ref = comp_fixed_ref,
.comp_var_ref = comp_var_ref,
.log2_tile_cols = log2_tile_cols,
.log2_tile_rows = log2_tile_rows,
.segment_enabled = segmentation.enabled != 0,
.segment_map_update = segmentation.update_map != 0,
.segment_map_temporal_update = segmentation.temporal_update != 0,
.segment_abs_delta = segmentation.abs_delta,
.segment_feature_enable = segmentation.feature_mask,
.segment_feature_data = segmentation.feature_data,
.mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
.use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
                            (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
                            (vp9_flags & FrameFlags::IntraOnly) == 0 &&
                            (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
                            (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
.ref_deltas = loop_filter.ref_deltas,
.mode_deltas = loop_filter.mode_deltas,
.frame_size = current_frame_size,
.first_level = first_level,
.sharpness_level = sharpness_level,
.bitstream_size = bitstream_size,
};
}
};
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
INSERT_PADDING_BYTES(1024);
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
INSERT_PADDING_BYTES(80);
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> y_mode_prob_e8{};
std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
INSERT_PADDING_BYTES(64);
std::array<u8, 64> partition_prob{};
INSERT_PADDING_BYTES(10);
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 3> skip_probs{};
INSERT_PADDING_BYTES(1);
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 2> class_0{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
std::array<u8, 20> classes{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 20> pred_bits{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
INSERT_PADDING_BYTES(17);
std::array<u8, 2304> coef_probs{};
void Convert(Vp9EntropyProbs& fc) {
fc.inter_mode_prob = inter_mode_prob;
fc.intra_inter_prob = intra_inter_prob;
fc.tx_8x8_prob = tx_8x8_prob;
fc.tx_16x16_prob = tx_16x16_prob;
fc.tx_32x32_prob = tx_32x32_prob;
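// y_mode_prob arrives split: entries 0-7 of each of the four block-size contexts
// sit in y_mode_prob_e0e7 and entry 8 in y_mode_prob_e8; recombine them into the
// flat 36-entry array.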
for (std::size_t i = 0; i < 4; i++) {
for (std::size_t j = 0; j < 9; j++) {
fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
}
}
fc.partition_prob = partition_prob;
fc.switchable_interp_prob = switchable_interp_prob;
fc.comp_inter_prob = comp_inter_prob;
fc.skip_probs = skip_probs;
fc.joints = joints;
fc.sign = sign;
fc.class_0 = class_0;
fc.fr = fr;
fc.class_0_hp = class_0_hp;
fc.high_precision = high_precision;
fc.classes = classes;
fc.class_0_fr = class_0_fr;
fc.prob_bits = pred_bits;
fc.single_ref_prob = single_ref_prob;
fc.comp_ref_prob = comp_ref_prob;
// Skip every fourth element, as it goes unused
for (std::size_t i = 0; i < coef_probs.size(); i += 4) {
const std::size_t j = i - i / 4;
fc.coef_probs[j] = coef_probs[i];
fc.coef_probs[j + 1] = coef_probs[i + 1];
fc.coef_probs[j + 2] = coef_probs[i + 2];
}
}
};
static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
enum class Ref { Last, Golden, AltRef };
struct RefPoolElement {
s64 frame{};
Ref ref{};
bool refresh{};
};
struct FrameContexts {
s64 from{};
bool adapted{};
Vp9EntropyProbs probs{};
};
} // namespace Decoder
} // namespace Tegra

View File

@ -0,0 +1,30 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/host1x.h"
#include "video_core/gpu.h"
Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
Tegra::Host1x::~Host1x() = default;
void Tegra::Host1x::ProcessMethod(Method method, u32 argument) {
switch (method) {
case Method::LoadSyncptPayload32:
syncpoint_value = argument;
break;
case Method::WaitSyncpt:
case Method::WaitSyncpt32:
Execute(argument);
break;
default:
UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
break;
}
}
void Tegra::Host1x::Execute(u32 data) {
gpu.WaitFence(data, syncpoint_value);
}

View File

@ -0,0 +1,37 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
class Nvdec;
class Host1x {
public:
enum class Method : u32 {
WaitSyncpt = 0x8,
LoadSyncptPayload32 = 0x4e,
WaitSyncpt32 = 0x50,
};
explicit Host1x(GPU& gpu);
~Host1x();
/// Writes the method into the state, invoking Execute() when a wait method is encountered
void ProcessMethod(Method method, u32 argument);
private:
/// For Host1x, Execute() waits on the syncpoint value previously written into the state
void Execute(u32 data);
u32 syncpoint_value{};
GPU& gpu;
};
} // namespace Tegra

View File

@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/gpu.h"
namespace Tegra {
Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
Nvdec::~Nvdec() = default;
void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
if (method == Method::SetVideoCodec) {
codec->StateWrite(static_cast<u32>(method), arguments[0]);
} else {
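// Anything other than the codec id holds a GPU address, which NVDEC appears to
// receive shifted right by 8 bits; restore the full address before storing it.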
codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
}
switch (method) {
case Method::SetVideoCodec:
codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
break;
case Method::Execute:
Execute();
break;
}
}
AVFramePtr Nvdec::GetFrame() {
return codec->GetCurrentFrame();
}
void Nvdec::Execute() {
switch (codec->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::Vp9:
codec->Decode();
break;
default:
UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
break;
}
}
} // namespace Tegra

View File

@ -0,0 +1,38 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/common_types.h"
#include "video_core/command_classes/codecs/codec.h"
namespace Tegra {
class GPU;
class Nvdec {
public:
enum class Method : u32 {
SetVideoCodec = 0x80,
Execute = 0xc0,
};
explicit Nvdec(GPU& gpu);
~Nvdec();
/// Writes the method into the state, invoking Execute() when the Execute method is encountered
void ProcessMethod(Method method, const std::vector<u32>& arguments);
/// Return most recently decoded frame
[[nodiscard]] AVFramePtr GetFrame();
private:
/// Invoke codec to decode a frame
void Execute();
GPU& gpu;
std::unique_ptr<Codec> codec;
};
} // namespace Tegra

View File

@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra::NvdecCommon {
struct NvdecRegisters {
INSERT_PADDING_WORDS(256);
u64 set_codec_id{};
INSERT_PADDING_WORDS(254);
u64 set_platform_id{};
u64 picture_info_offset{};
u64 frame_bitstream_offset{};
u64 frame_number{};
u64 h264_slice_data_offsets{};
u64 h264_mv_dump_offset{};
INSERT_PADDING_WORDS(6);
u64 frame_stats_offset{};
u64 h264_last_surface_luma_offset{};
u64 h264_last_surface_chroma_offset{};
std::array<u64, 17> surface_luma_offset{};
std::array<u64, 17> surface_chroma_offset{};
INSERT_PADDING_WORDS(132);
u64 vp9_entropy_probs_offset{};
u64 vp9_backward_updates_offset{};
u64 vp9_last_frame_segmap_offset{};
u64 vp9_curr_frame_segmap_offset{};
INSERT_PADDING_WORDS(2);
u64 vp9_last_frame_mvs_offset{};
u64 vp9_curr_frame_mvs_offset{};
INSERT_PADDING_WORDS(2);
};
static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
enum class VideoCodec : u32 {
None = 0x0,
H264 = 0x3,
Vp8 = 0x5,
H265 = 0x7,
Vp9 = 0x9,
};
} // namespace Tegra::NvdecCommon

View File

@ -0,0 +1,60 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <algorithm>
#include "sync_manager.h"
#include "video_core/gpu.h"
namespace Tegra {
SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
SyncptIncrManager::~SyncptIncrManager() = default;
void SyncptIncrManager::Increment(u32 id) {
increments.emplace_back(0, 0, id, true);
IncrementAllDone();
}
u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
const u32 handle = current_id++;
increments.emplace_back(handle, class_id, id);
return handle;
}
void SyncptIncrManager::SignalDone(u32 handle) {
const auto done_incr =
std::find_if(increments.begin(), increments.end(),
[handle](const SyncptIncr& incr) { return incr.id == handle; });
if (done_incr != increments.cend()) {
done_incr->complete = true;
}
IncrementAllDone();
}
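// Syncpoints are incremented strictly in submission order: the scan stops at the
// first incomplete entry, so a later completion waits for earlier ones.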
void SyncptIncrManager::IncrementAllDone() {
std::size_t done_count = 0;
for (; done_count < increments.size(); ++done_count) {
if (!increments[done_count].complete) {
break;
}
gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
}
increments.erase(increments.begin(), increments.begin() + done_count);
}
} // namespace Tegra

View File

@ -0,0 +1,64 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
namespace Tegra {
class GPU;
struct SyncptIncr {
u32 id;
u32 class_id;
u32 syncpt_id;
bool complete;
SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
: id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
};
class SyncptIncrManager {
public:
explicit SyncptIncrManager(GPU& gpu);
~SyncptIncrManager();
/// Adds a completed increment for the given syncpoint id, then increments all done
void Increment(u32 id);
/// Returns a handle to increment later
u32 IncrementWhenDone(u32 class_id, u32 id);
/// Marks the increment with the given handle as complete, then increments all done
void SignalDone(u32 handle);
/// Increment all sequential pending increments that are already done.
void IncrementAllDone();
private:
std::vector<SyncptIncr> increments;
std::mutex increment_lock;
u32 current_id{};
GPU& gpu;
};
} // namespace Tegra

View File

@ -0,0 +1,175 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/command_classes/vic.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"
extern "C" {
#include <libswscale/swscale.h>
}
namespace Tegra {
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
Vic::~Vic() = default;
void Vic::VicStateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
VicStateWrite(static_cast<u32>(method), arguments[0]);
const u64 arg = static_cast<u64>(arguments[0]) << 8;
switch (method) {
case Method::Execute:
Execute();
break;
case Method::SetConfigStructOffset:
config_struct_address = arg;
break;
case Method::SetOutputSurfaceLumaOffset:
output_surface_luma_address = arg;
break;
case Method::SetOutputSurfaceChromaUOffset:
output_surface_chroma_u_address = arg;
break;
case Method::SetOutputSurfaceChromaVOffset:
output_surface_chroma_v_address = arg;
break;
default:
break;
}
}
void Vic::Execute() {
if (output_surface_luma_address == 0) {
LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}",
vic_state.output_surface.luma_offset);
return;
}
const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
const auto* frame = frame_ptr.get();
if (!frame || frame->width == 0 || frame->height == 0) {
return;
}
const VideoPixelFormat pixel_format =
static_cast<VideoPixelFormat>(config.pixel_format.Value());
switch (pixel_format) {
case VideoPixelFormat::BGRA8:
case VideoPixelFormat::RGBA8: {
LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
if (scaler_ctx == nullptr || frame->width != scaler_width ||
frame->height != scaler_height) {
const AVPixelFormat target_format =
(pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
sws_freeContext(scaler_ctx);
scaler_ctx = nullptr;
// FFmpeg returns all frames in YUV420; convert to the expected format.
scaler_ctx =
sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
frame->height, target_format, 0, nullptr, nullptr, nullptr);
scaler_width = frame->width;
scaler_height = frame->height;
}
// Get Converted frame
const std::size_t linear_size = frame->width * frame->height * 4;
using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
const int converted_stride{frame->width * 4};
u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
&converted_frame_buf_addr, &converted_stride);
const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
if (blk_kind != 0) {
// swizzle pitch linear to block linear
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
block_height, 0);
std::vector<u8> swizzled_data(size);
Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
frame->width, 4, swizzled_data.data(),
converted_frame_buffer.get(), block_height, 0, 0);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
gpu.Maxwell3D().OnMemoryWrite();
} else {
// send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size);
gpu.Maxwell3D().OnMemoryWrite();
}
break;
}
case VideoPixelFormat::Yuv420: {
LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
const std::size_t surface_width = config.surface_width_minus1 + 1;
const std::size_t surface_height = config.surface_height_minus1 + 1;
const std::size_t half_width = surface_width / 2;
const std::size_t half_height = surface_height / 2;
const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
const auto* luma_ptr = frame->data[0];
const auto* chroma_b_ptr = frame->data[1];
const auto* chroma_r_ptr = frame->data[2];
const auto stride = frame->linesize[0];
const auto half_stride = frame->linesize[1];
std::vector<u8> luma_buffer(aligned_width * surface_height);
std::vector<u8> chroma_buffer(aligned_width * half_height);
// Populate luma buffer
for (std::size_t y = 0; y < surface_height - 1; ++y) {
const std::size_t src = y * stride;
const std::size_t dst = y * aligned_width;
for (std::size_t offset = 0; offset < surface_width; ++offset) {
luma_buffer[dst + offset] = luma_ptr[src + offset];
}
}
gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
luma_buffer.size());
// Populate the chroma buffer from both half-resolution planes, interleaving U and
// V (NV12-style semi-planar layout).
for (std::size_t y = 0; y < half_height; ++y) {
const std::size_t src = y * half_stride;
const std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < half_width; ++x) {
chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
}
}
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size());
gpu.Maxwell3D().OnMemoryWrite();
break;
}
default:
UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
break;
}
}
} // namespace Tegra

View File

@ -0,0 +1,110 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
struct PlaneOffsets {
u32 luma_offset{};
u32 chroma_u_offset{};
u32 chroma_v_offset{};
};
struct VicRegisters {
INSERT_PADDING_WORDS(64);
u32 nop{};
INSERT_PADDING_WORDS(15);
u32 pm_trigger{};
INSERT_PADDING_WORDS(47);
u32 set_application_id{};
u32 set_watchdog_timer{};
INSERT_PADDING_WORDS(17);
u32 context_save_area{};
u32 context_switch{};
INSERT_PADDING_WORDS(43);
u32 execute{};
INSERT_PADDING_WORDS(63);
std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
u32 picture_index{};
u32 control_params{};
u32 config_struct_offset{};
u32 filter_struct_offset{};
u32 palette_offset{};
u32 hist_offset{};
u32 context_id{};
u32 fce_ucode_size{};
PlaneOffsets output_surface{};
u32 fce_ucode_offset{};
INSERT_PADDING_WORDS(4);
std::array<u32, 8> slot_context_id{};
INSERT_PADDING_WORDS(16);
};
static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
class Vic {
public:
enum class Method : u32 {
Execute = 0xc0,
SetControlParams = 0x1c1,
SetConfigStructOffset = 0x1c2,
SetOutputSurfaceLumaOffset = 0x1c8,
SetOutputSurfaceChromaUOffset = 0x1c9,
SetOutputSurfaceChromaVOffset = 0x1ca
};
explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
~Vic();
/// Write to the device state.
void ProcessMethod(Method method, const std::vector<u32>& arguments);
private:
void Execute();
void VicStateWrite(u32 offset, u32 arguments);
VicRegisters vic_state{};
enum class VideoPixelFormat : u64_le {
RGBA8 = 0x1f,
BGRA8 = 0x20,
Yuv420 = 0x44,
};
union VicConfig {
u64_le raw{};
BitField<0, 7, u64_le> pixel_format;
BitField<7, 2, u64_le> chroma_loc_horiz;
BitField<9, 2, u64_le> chroma_loc_vert;
BitField<11, 4, u64_le> block_linear_kind;
BitField<15, 4, u64_le> block_linear_height_log2;
BitField<19, 3, u64_le> reserved0;
BitField<22, 10, u64_le> reserved1;
BitField<32, 14, u64_le> surface_width_minus1;
BitField<46, 14, u64_le> surface_height_minus1;
};
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
GPUVAddr config_struct_address{};
GPUVAddr output_surface_luma_address{};
GPUVAddr output_surface_chroma_u_address{};
GPUVAddr output_surface_chroma_v_address{};
SwsContext* scaler_ctx{};
s32 scaler_width{};
s32 scaler_height{};
};
} // namespace Tegra

View File

@ -3,33 +3,33 @@
// Refer to the license.txt file included.
#include <array>
#include <bitset>
#include <cstddef>
#include "common/common_types.h"
#include "video_core/compatible_formats.h"
#include "video_core/surface.h"
namespace VideoCore::Surface {
namespace {
using Table = std::array<std::array<u64, 2>, MaxPixelFormat>;
// Compatibility table taken from Table 3.X.2 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
constexpr std::array VIEW_CLASS_128_BITS = {
constexpr std::array VIEW_CLASS_128_BITS{
PixelFormat::R32G32B32A32_FLOAT,
PixelFormat::R32G32B32A32_UINT,
PixelFormat::R32G32B32A32_SINT,
};
constexpr std::array VIEW_CLASS_96_BITS = {
constexpr std::array VIEW_CLASS_96_BITS{
PixelFormat::R32G32B32_FLOAT,
};
// Missing formats:
// PixelFormat::RGB32UI,
// PixelFormat::RGB32I,
constexpr std::array VIEW_CLASS_64_BITS = {
constexpr std::array VIEW_CLASS_64_BITS{
PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT,
PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
@ -38,7 +38,7 @@ constexpr std::array VIEW_CLASS_64_BITS = {
// TODO: How should we handle 48 bits?
constexpr std::array VIEW_CLASS_32_BITS = {
constexpr std::array VIEW_CLASS_32_BITS{
PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT,
PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT,
PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM,
@ -50,43 +50,105 @@ constexpr std::array VIEW_CLASS_32_BITS = {
// TODO: How should we handle 24 bits?
constexpr std::array VIEW_CLASS_16_BITS = {
constexpr std::array VIEW_CLASS_16_BITS{
PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT,
PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM,
PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT,
};
constexpr std::array VIEW_CLASS_8_BITS = {
constexpr std::array VIEW_CLASS_8_BITS{
PixelFormat::R8_UINT,
PixelFormat::R8_UNORM,
PixelFormat::R8_SINT,
PixelFormat::R8_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC1_RED = {
constexpr std::array VIEW_CLASS_RGTC1_RED{
PixelFormat::BC4_UNORM,
PixelFormat::BC4_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC2_RG = {
constexpr std::array VIEW_CLASS_RGTC2_RG{
PixelFormat::BC5_UNORM,
PixelFormat::BC5_SNORM,
};
constexpr std::array VIEW_CLASS_BPTC_UNORM = {
constexpr std::array VIEW_CLASS_BPTC_UNORM{
PixelFormat::BC7_UNORM,
PixelFormat::BC7_SRGB,
};
constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
constexpr std::array VIEW_CLASS_BPTC_FLOAT{
PixelFormat::BC6H_SFLOAT,
PixelFormat::BC6H_UFLOAT,
};
constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{
PixelFormat::ASTC_2D_4X4_UNORM,
PixelFormat::ASTC_2D_4X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{
PixelFormat::ASTC_2D_5X4_UNORM,
PixelFormat::ASTC_2D_5X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{
PixelFormat::ASTC_2D_5X5_UNORM,
PixelFormat::ASTC_2D_5X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{
PixelFormat::ASTC_2D_6X5_UNORM,
PixelFormat::ASTC_2D_6X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{
PixelFormat::ASTC_2D_6X6_UNORM,
PixelFormat::ASTC_2D_6X6_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{
PixelFormat::ASTC_2D_8X5_UNORM,
PixelFormat::ASTC_2D_8X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{
PixelFormat::ASTC_2D_8X8_UNORM,
PixelFormat::ASTC_2D_8X8_SRGB,
};
// Missing formats:
// PixelFormat::ASTC_2D_10X5_UNORM
// PixelFormat::ASTC_2D_10X5_SRGB
// Missing formats:
// PixelFormat::ASTC_2D_10X6_UNORM
// PixelFormat::ASTC_2D_10X6_SRGB
constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{
PixelFormat::ASTC_2D_10X8_UNORM,
PixelFormat::ASTC_2D_10X8_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{
PixelFormat::ASTC_2D_10X10_UNORM,
PixelFormat::ASTC_2D_10X10_SRGB,
};
// Missing formats
// ASTC_2D_12X10_UNORM,
// ASTC_2D_12X10_SRGB,
constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{
PixelFormat::ASTC_2D_12X12_UNORM,
PixelFormat::ASTC_2D_12X12_SRGB,
};
// Compatibility table taken from Table 4.X.1 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
constexpr std::array COPY_CLASS_128_BITS = {
constexpr std::array COPY_CLASS_128_BITS{
PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT,
PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM,
PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM,
@ -97,7 +159,7 @@ constexpr std::array COPY_CLASS_128_BITS = {
// PixelFormat::RGBA32I
// COMPRESSED_RG_RGTC2
constexpr std::array COPY_CLASS_64_BITS = {
constexpr std::array COPY_CLASS_64_BITS{
PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT,
@ -110,32 +172,36 @@ constexpr std::array COPY_CLASS_64_BITS = {
// COMPRESSED_RGBA_S3TC_DXT1_EXT
// COMPRESSED_SIGNED_RED_RGTC1
void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) {
compatiblity[format_a][format_b] = true;
compatiblity[format_b][format_a] = true;
constexpr void Enable(Table& table, size_t format_a, size_t format_b) {
table[format_a][format_b / 64] |= u64(1) << (format_b % 64);
table[format_b][format_a / 64] |= u64(1) << (format_a % 64);
}
void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) {
Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
}
template <typename Range>
void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
constexpr void EnableRange(Table& table, const Range& range) {
for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
for (auto it_b = it_a; it_b != range.end(); ++it_b) {
Enable(compatibility, *it_a, *it_b);
Enable(table, *it_a, *it_b);
}
}
}
} // Anonymous namespace
constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) {
const size_t a = static_cast<size_t>(format_a);
const size_t b = static_cast<size_t>(format_b);
return ((table[a][b / 64] >> (b % 64)) & 1) != 0;
}
FormatCompatibility::FormatCompatibility() {
constexpr Table MakeViewTable() {
Table view{};
for (size_t i = 0; i < MaxPixelFormat; ++i) {
// Identity is allowed
Enable(view, i, i);
}
EnableRange(view, VIEW_CLASS_128_BITS);
EnableRange(view, VIEW_CLASS_96_BITS);
EnableRange(view, VIEW_CLASS_64_BITS);
@ -146,10 +212,39 @@ FormatCompatibility::FormatCompatibility() {
EnableRange(view, VIEW_CLASS_RGTC2_RG);
EnableRange(view, VIEW_CLASS_BPTC_UNORM);
EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA);
return view;
}
copy = view;
constexpr Table MakeCopyTable() {
Table copy = MakeViewTable();
EnableRange(copy, COPY_CLASS_128_BITS);
EnableRange(copy, COPY_CLASS_64_BITS);
return copy;
}
} // Anonymous namespace
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views) {
if (broken_views) {
// If format views are broken, only accept formats that are identical.
return format_a == format_b;
}
static constexpr Table TABLE = MakeViewTable();
return IsSupported(TABLE, format_a, format_b);
}
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) {
static constexpr Table TABLE = MakeCopyTable();
return IsSupported(TABLE, format_a, format_b);
}
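// A minimal usage sketch (illustrative only; the format pairs come from the
// classes defined above):
//   IsViewCompatible(PixelFormat::ASTC_2D_6X5_UNORM,
//                    PixelFormat::ASTC_2D_6X5_SRGB,
//                    /*broken_views=*/false);         // true, same view class
//   IsCopyCompatible(PixelFormat::BC2_UNORM,
//                    PixelFormat::R32G32B32A32_UINT); // true, 128-bit copy class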
} // namespace VideoCore::Surface

View File

@ -4,31 +4,12 @@
#pragma once
#include <array>
#include <bitset>
#include <cstddef>
#include "video_core/surface.h"
namespace VideoCore::Surface {
class FormatCompatibility {
public:
using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views);
explicit FormatCompatibility();
bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
}
bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
}
private:
Table view;
Table copy;
};
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b);
} // namespace VideoCore::Surface

View File

@ -0,0 +1,32 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <utility>
#include <vector>
namespace VideoCommon {
/// Container to push objects to be destroyed a few ticks in the future
template <typename T, size_t TICKS_TO_DESTROY>
class DelayedDestructionRing {
public:
void Tick() {
index = (index + 1) % TICKS_TO_DESTROY;
elements[index].clear();
}
void Push(T&& object) {
elements[index].push_back(std::move(object));
}
private:
size_t index = 0;
std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
};
} // namespace VideoCommon
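// A minimal usage sketch, assuming a hypothetical Buffer resource type; an
// object pushed into the ring stays alive for TICKS_TO_DESTROY calls to Tick():
//   VideoCommon::DelayedDestructionRing<Buffer, 8> ring;
//   ring.Push(std::move(old_buffer)); // deferred, not destroyed yet
//   ring.Tick(); // called once per tick; the 8th call frees old_buffer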

View File

@ -9,13 +9,16 @@
#include "video_core/dirty_flags.h"
#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32))
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
namespace VideoCommon::Dirty {
using Tegra::Engines::Maxwell3D;
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
static constexpr std::size_t num_per_rt = NUM(rt[0]);
static constexpr std::size_t begin = OFF(rt);
static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@ -23,6 +26,10 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt);
}
FillBlock(tables[1], begin, num, RenderTargets);
FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets);
tables[0][OFF(rt_control)] = RenderTargets;
tables[1][OFF(rt_control)] = RenderTargetControl;
static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
for (std::size_t i = 0; i < std::size(zeta_flags); ++i) {

View File

@ -16,7 +16,10 @@ namespace VideoCommon::Dirty {
enum : u8 {
NullEntry = 0,
Descriptors,
RenderTargets,
RenderTargetControl,
ColorBuffer0,
ColorBuffer1,
ColorBuffer2,

View File

@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/cityhash.h"
#include "common/microprofile.h"
#include "core/core.h"
#include "core/memory.h"
@ -12,7 +13,7 @@
namespace Tegra {
DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {}
DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {}
DmaPusher::~DmaPusher() = default;
@ -45,32 +46,41 @@ bool DmaPusher::Step() {
return false;
}
const CommandList& command_list{dma_pushbuffer.front()};
ASSERT_OR_EXECUTE(!command_list.empty(), {
// Somehow the command_list is empty. To avoid a crash,
// ignore it and assume its size is 0.
CommandList& command_list{dma_pushbuffer.front()};
ASSERT_OR_EXECUTE(
command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
// Somehow the command_list is empty. To avoid a crash,
// ignore it and assume its size is 0.
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
return true;
});
if (command_list.prefetch_command_list.size()) {
// Prefetched command list from nvdrv, used for things like synchronization
command_headers = std::move(command_list.prefetch_command_list);
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
return true;
});
const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
const GPUVAddr dma_get = command_list_header.addr;
} else {
const CommandListHeader command_list_header{
command_list.command_lists[dma_pushbuffer_subindex++]};
const GPUVAddr dma_get = command_list_header.addr;
if (dma_pushbuffer_subindex >= command_list.size()) {
// We've gone through the current list, remove it from the queue
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
// We've gone through the current list, remove it from the queue
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
}
if (command_list_header.size == 0) {
return true;
}
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
command_list_header.size * sizeof(u32));
}
if (command_list_header.size == 0) {
return true;
}
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
command_list_header.size * sizeof(u32));
for (std::size_t index = 0; index < command_headers.size();) {
const CommandHeader& command_header = command_headers[index];
@ -142,7 +152,12 @@ void DmaPusher::SetState(const CommandHeader& command_header) {
void DmaPusher::CallMethod(u32 argument) const {
if (dma_state.method < non_puller_methods) {
gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count});
gpu.CallMethod(GPU::MethodCall{
dma_state.method,
argument,
dma_state.subchannel,
dma_state.method_count,
});
} else {
subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
dma_state.is_last_call);

View File

@ -18,6 +18,8 @@ class System;
namespace Tegra {
class GPU;
enum class SubmissionMode : u32 {
IncreasingOld = 0,
Increasing = 1,
@ -27,6 +29,31 @@ enum class SubmissionMode : u32 {
IncreaseOnce = 5
};
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in the docs. Here we do not multiply by 4.
// As a result, the values you see in the docs are 4 times the values used here.
enum class BufferMethods : u32 {
BindObject = 0x0,
Nop = 0x2,
SemaphoreAddressHigh = 0x4,
SemaphoreAddressLow = 0x5,
SemaphoreSequence = 0x6,
SemaphoreTrigger = 0x7,
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
FenceValue = 0x1C,
FenceAction = 0x1D,
WaitForInterrupt = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
NonPullerMethods = 0x40,
};
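// Worked example of the note above: SemaphoreSequence is 0x6 here but appears
// as 0x6 * 4 == 0x18 in 4-byte-addressed listings; likewise Yield (0x20)
// shows up as 0x80.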
struct CommandListHeader {
union {
u64 raw;
@ -49,9 +76,23 @@ union CommandHeader {
static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
class GPU;
inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
CommandHeader result{};
result.method.Assign(static_cast<u32>(method));
result.arg_count.Assign(arg_count);
result.mode.Assign(mode);
return result;
}
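// Illustrative use when assembling a prefetch command list (the chosen method
// and mode are examples, not taken from a real call site):
//   const CommandHeader header = BuildCommandHeader(
//       BufferMethods::SemaphoreSequence, /*arg_count=*/1,
//       SubmissionMode::Increasing);
// The header would then be followed by one 32-bit word carrying the argument.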
using CommandList = std::vector<Tegra::CommandListHeader>;
struct CommandList final {
CommandList() = default;
explicit CommandList(std::size_t size) : command_lists(size) {}
explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_)
: prefetch_command_list{std::move(prefetch_command_list_)} {}
std::vector<CommandListHeader> command_lists;
std::vector<CommandHeader> prefetch_command_list;
};
/**
* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
@ -60,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>;
* See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
* details on this implementation.
*/
class DmaPusher {
class DmaPusher final {
public:
explicit DmaPusher(Core::System& system, GPU& gpu);
explicit DmaPusher(Core::System& system_, GPU& gpu_);
~DmaPusher();
void Push(CommandList&& entries) {
@ -71,7 +112,7 @@ public:
void DispatchCalls();
void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) {
void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
subchannels[subchannel_id] = engine;
}
@ -104,7 +145,7 @@ private:
bool ib_enable{true}; ///< IB mode enabled
std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{};
std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
GPU& gpu;
Core::System& system;

View File

@ -11,16 +11,16 @@
namespace Tegra::Engines::Upload {
State::State(MemoryManager& memory_manager, Registers& regs)
: regs{regs}, memory_manager{memory_manager} {}
State::State(MemoryManager& memory_manager_, Registers& regs_)
: regs{regs_}, memory_manager{memory_manager_} {}
State::~State() = default;
void State::ProcessExec(const bool is_linear) {
void State::ProcessExec(const bool is_linear_) {
write_offset = 0;
copy_size = regs.line_length_in * regs.line_count;
inner_buffer.resize(copy_size);
this->is_linear = is_linear;
is_linear = is_linear_;
}
void State::ProcessData(const u32 data, const bool is_last_call) {

View File

@ -54,10 +54,10 @@ struct Registers {
class State {
public:
State(MemoryManager& memory_manager, Registers& regs);
explicit State(MemoryManager& memory_manager_, Registers& regs_);
~State();
void ProcessExec(bool is_linear);
void ProcessExec(bool is_linear_);
void ProcessData(u32 data, bool is_last_call);
private:

View File

@ -10,7 +10,11 @@
namespace Tegra::Engines {
Fermi2D::Fermi2D() = default;
Fermi2D::Fermi2D() {
// Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1;
regs.dst.depth = 1;
}
Fermi2D::~Fermi2D() = default;
@ -21,79 +25,43 @@ void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Fermi2D register, increase the size of the Regs structure");
regs.reg_array[method] = method_argument;
switch (method) {
// Trigger the surface copy on the last register write. The last register is blit_src_y,
// which is 64-bit, so trigger on the second 32-bit write.
case FERMI2D_REG_INDEX(blit_src_y) + 1: {
HandleSurfaceCopy();
break;
}
if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) {
Blit();
}
}
void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
for (u32 i = 0; i < amount; ++i) {
CallMethod(method, base_start[i], methods_pending - i <= 1);
}
}
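// For illustration: with amount == 3 and methods_pending == 3, the iterations
// pass is_last_call = false, false, true, since 3 - i <= 1 only holds for the
// final i == 2.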
static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) {
const u32 line_a = src_2 - src_1;
const u32 line_b = dst_2 - dst_1;
const u32 excess = std::max<s32>(0, line_a - src_line + src_1);
return {line_b - (excess * line_b) / line_a, excess};
}
void Fermi2D::Blit() {
LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
regs.src.Address(), regs.dst.Address());
void Fermi2D::HandleSurfaceCopy() {
LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
static_cast<u32>(regs.operation));
UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
// TODO(Subv): Only raw copies are implemented.
ASSERT(regs.operation == Operation::SrcCopy);
const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)};
const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)};
u32 src_blit_x2, src_blit_y2;
if (regs.blit_control.origin == Origin::Corner) {
src_blit_x2 =
static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32);
src_blit_y2 =
static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32);
} else {
src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
}
u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width;
u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height;
const auto [new_dst_w, src_excess_x] =
DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width);
const auto [new_dst_h, src_excess_y] =
DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height);
dst_blit_x2 = new_dst_w + regs.blit_dst_x;
src_blit_x2 = src_blit_x2 - src_excess_x;
dst_blit_y2 = new_dst_h + regs.blit_dst_y;
src_blit_y2 = src_blit_y2 - src_excess_y;
const auto [new_src_w, dst_excess_x] =
DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width);
const auto [new_src_h, dst_excess_y] =
DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height);
src_blit_x2 = new_src_w + src_blit_x1;
dst_blit_x2 = dst_blit_x2 - dst_excess_x;
src_blit_y2 = new_src_h + src_blit_y1;
dst_blit_y2 = dst_blit_y2 - dst_excess_y;
const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2,
dst_blit_y2};
const Config copy_config{
const auto& args = regs.pixels_from_memory;
const Config config{
.operation = regs.operation,
.filter = regs.blit_control.filter,
.src_rect = src_rect,
.dst_rect = dst_rect,
.filter = args.sample_mode.filter,
.dst_x0 = args.dst_x0,
.dst_y0 = args.dst_y0,
.dst_x1 = args.dst_x0 + args.dst_width,
.dst_y1 = args.dst_y0 + args.dst_height,
.src_x0 = static_cast<s32>(args.src_x0 >> 32),
.src_y0 = static_cast<s32>(args.src_y0 >> 32),
.src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
.src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
};
if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) {
if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) {
UNIMPLEMENTED();
}
}
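// The source coordinates above are 32.32 fixed point. For illustration, a blit
// that squeezes a 128-pixel-wide source into a 64-pixel destination would use
// du_dx == (s64{128} << 32) / 64 == s64{2} << 32, so (assuming a zero
// fractional part in src_x0)
// src_x1 == (du_dx * 64 + src_x0) >> 32 == (src_x0 >> 32) + 128.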

View File

@ -53,8 +53,8 @@ public:
};
enum class Filter : u32 {
PointSample = 0, // Nearest
Linear = 1,
Point = 0,
Bilinear = 1,
};
enum class Operation : u32 {
@ -67,88 +67,235 @@ public:
BlendPremult = 6,
};
struct Regs {
static constexpr std::size_t NUM_REGS = 0x258;
enum class MemoryLayout : u32 {
BlockLinear = 0,
Pitch = 1,
};
struct Surface {
RenderTargetFormat format;
BitField<0, 1, u32> linear;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 depth;
u32 layer;
u32 pitch;
u32 width;
u32 height;
u32 address_high;
u32 address_low;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
u32 BlockWidth() const {
return block_width.Value();
}
u32 BlockHeight() const {
return block_height.Value();
}
u32 BlockDepth() const {
return block_depth.Value();
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
enum class CpuIndexWrap : u32 {
Wrap = 0,
NoWrap = 1,
};
struct Surface {
RenderTargetFormat format;
MemoryLayout linear;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 depth;
u32 layer;
u32 pitch;
u32 width;
u32 height;
u32 addr_upper;
u32 addr_lower;
[[nodiscard]] constexpr GPUVAddr Address() const noexcept {
return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
enum class SectorPromotion : u32 {
NoPromotion = 0,
PromoteTo2V = 1,
PromoteTo2H = 2,
PromoteTo4 = 3,
};
enum class NumTpcs : u32 {
All = 0,
One = 1,
};
enum class RenderEnableMode : u32 {
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
enum class ColorKeyFormat : u32 {
A16R56G6B5 = 0,
A1R5G55B5 = 1,
A8R8G8B8 = 2,
A2R10G10B10 = 3,
Y8 = 4,
Y16 = 5,
Y32 = 6,
};
union Beta4 {
BitField<0, 8, u32> b;
BitField<8, 8, u32> g;
BitField<16, 8, u32> r;
BitField<24, 8, u32> a;
};
struct Point {
u32 x;
u32 y;
};
enum class PatternSelect : u32 {
MonoChrome8x8 = 0,
MonoChrome64x1 = 1,
MonoChrome1x64 = 2,
Color = 3,
};
enum class NotifyType : u32 {
WriteOnly = 0,
WriteThenAwaken = 1,
};
enum class MonochromePatternColorFormat : u32 {
A8X8R8G6B5 = 0,
A1R5G5B5 = 1,
A8R8G8B8 = 2,
A8Y8 = 3,
A8X8Y16 = 4,
Y32 = 5,
};
enum class MonochromePatternFormat : u32 {
CGA6_M1 = 0,
LE_M1 = 1,
};
union Regs {
static constexpr std::size_t NUM_REGS = 0x258;
struct {
u32 object;
INSERT_UNION_PADDING_WORDS(0x3F);
u32 no_operation;
NotifyType notify;
INSERT_UNION_PADDING_WORDS(0x2);
u32 wait_for_idle;
INSERT_UNION_PADDING_WORDS(0xB);
u32 pm_trigger;
INSERT_UNION_PADDING_WORDS(0xF);
u32 context_dma_notify;
u32 dst_context_dma;
u32 src_context_dma;
u32 semaphore_context_dma;
INSERT_UNION_PADDING_WORDS(0x1C);
Surface dst;
CpuIndexWrap pixels_from_cpu_index_wrap;
u32 kind2d_check_enable;
Surface src;
SectorPromotion pixels_from_memory_sector_promotion;
INSERT_UNION_PADDING_WORDS(0x1);
NumTpcs num_tpcs;
u32 render_enable_addr_upper;
u32 render_enable_addr_lower;
RenderEnableMode render_enable_mode;
INSERT_UNION_PADDING_WORDS(0x4);
u32 clip_x0;
u32 clip_y0;
u32 clip_width;
u32 clip_height;
BitField<0, 1, u32> clip_enable;
BitField<0, 3, ColorKeyFormat> color_key_format;
u32 color_key;
BitField<0, 1, u32> color_key_enable;
BitField<0, 8, u32> rop;
u32 beta1;
Beta4 beta4;
Operation operation;
union {
BitField<0, 6, u32> x;
BitField<8, 6, u32> y;
} pattern_offset;
BitField<0, 2, PatternSelect> pattern_select;
INSERT_UNION_PADDING_WORDS(0xC);
struct {
INSERT_UNION_PADDING_WORDS(0x80);
Surface dst;
INSERT_UNION_PADDING_WORDS(2);
Surface src;
INSERT_UNION_PADDING_WORDS(0x15);
Operation operation;
INSERT_UNION_PADDING_WORDS(0x177);
BitField<0, 3, MonochromePatternColorFormat> color_format;
BitField<0, 1, MonochromePatternFormat> format;
u32 color0;
u32 color1;
u32 pattern0;
u32 pattern1;
} monochrome_pattern;
struct {
std::array<u32, 0x40> X8R8G8B8;
std::array<u32, 0x20> R5G6B5;
std::array<u32, 0x20> X1R5G5B5;
std::array<u32, 0x10> Y8;
} color_pattern;
INSERT_UNION_PADDING_WORDS(0x10);
struct {
u32 prim_mode;
u32 prim_color_format;
u32 prim_color;
u32 line_tie_break_bits;
INSERT_UNION_PADDING_WORDS(0x14);
u32 prim_point_xy;
INSERT_UNION_PADDING_WORDS(0x7);
std::array<Point, 0x40> prim_point;
} render_solid;
struct {
u32 data_type;
u32 color_format;
u32 index_format;
u32 mono_format;
u32 wrap;
u32 color0;
u32 color1;
u32 mono_opacity;
INSERT_UNION_PADDING_WORDS(0x6);
u32 src_width;
u32 src_height;
u32 dx_du_frac;
u32 dx_du_int;
u32 dx_dv_frac;
u32 dy_dv_int;
u32 dst_x0_frac;
u32 dst_x0_int;
u32 dst_y0_frac;
u32 dst_y0_int;
u32 data;
} pixels_from_cpu;
INSERT_UNION_PADDING_WORDS(0x3);
u32 big_endian_control;
INSERT_UNION_PADDING_WORDS(0x3);
struct {
BitField<0, 3, u32> block_shape;
BitField<0, 5, u32> corral_size;
BitField<0, 1, u32> safe_overlap;
union {
u32 raw;
BitField<0, 1, Origin> origin;
BitField<4, 1, Filter> filter;
} blit_control;
} sample_mode;
INSERT_UNION_PADDING_WORDS(0x8);
u32 blit_dst_x;
u32 blit_dst_y;
u32 blit_dst_width;
u32 blit_dst_height;
u64 blit_du_dx;
u64 blit_dv_dy;
u64 blit_src_x;
u64 blit_src_y;
INSERT_UNION_PADDING_WORDS(0x21);
};
std::array<u32, NUM_REGS> reg_array;
s32 dst_x0;
s32 dst_y0;
s32 dst_width;
s32 dst_height;
s64 du_dx;
s64 dv_dy;
s64 src_x0;
s64 src_y0;
} pixels_from_memory;
};
std::array<u32, NUM_REGS> reg_array;
} regs{};
struct Config {
Operation operation{};
Filter filter{};
Common::Rectangle<u32> src_rect;
Common::Rectangle<u32> dst_rect;
Operation operation;
Filter filter;
s32 dst_x0;
s32 dst_y0;
s32 dst_x1;
s32 dst_y1;
s32 src_x0;
s32 src_y0;
s32 src_x1;
s32 src_y1;
};
private:
@ -156,25 +303,49 @@ private:
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
void HandleSurfaceCopy();
void Blit();
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4, \
static_assert(offsetof(Fermi2D::Regs, field_name) == position, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(dst, 0x80);
ASSERT_REG_POSITION(src, 0x8C);
ASSERT_REG_POSITION(operation, 0xAB);
ASSERT_REG_POSITION(blit_control, 0x223);
ASSERT_REG_POSITION(blit_dst_x, 0x22c);
ASSERT_REG_POSITION(blit_dst_y, 0x22d);
ASSERT_REG_POSITION(blit_dst_width, 0x22e);
ASSERT_REG_POSITION(blit_dst_height, 0x22f);
ASSERT_REG_POSITION(blit_du_dx, 0x230);
ASSERT_REG_POSITION(blit_dv_dy, 0x232);
ASSERT_REG_POSITION(blit_src_x, 0x234);
ASSERT_REG_POSITION(blit_src_y, 0x236);
ASSERT_REG_POSITION(object, 0x0);
ASSERT_REG_POSITION(no_operation, 0x100);
ASSERT_REG_POSITION(notify, 0x104);
ASSERT_REG_POSITION(wait_for_idle, 0x110);
ASSERT_REG_POSITION(pm_trigger, 0x140);
ASSERT_REG_POSITION(context_dma_notify, 0x180);
ASSERT_REG_POSITION(dst_context_dma, 0x184);
ASSERT_REG_POSITION(src_context_dma, 0x188);
ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
ASSERT_REG_POSITION(dst, 0x200);
ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
ASSERT_REG_POSITION(src, 0x230);
ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
ASSERT_REG_POSITION(num_tpcs, 0x260);
ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
ASSERT_REG_POSITION(clip_x0, 0x280);
ASSERT_REG_POSITION(clip_y0, 0x284);
ASSERT_REG_POSITION(clip_width, 0x288);
ASSERT_REG_POSITION(clip_height, 0x28c);
ASSERT_REG_POSITION(clip_enable, 0x290);
ASSERT_REG_POSITION(color_key_format, 0x294);
ASSERT_REG_POSITION(color_key, 0x298);
ASSERT_REG_POSITION(rop, 0x2A0);
ASSERT_REG_POSITION(beta1, 0x2A4);
ASSERT_REG_POSITION(beta4, 0x2A8);
ASSERT_REG_POSITION(operation, 0x2AC);
ASSERT_REG_POSITION(pattern_offset, 0x2B0);
ASSERT_REG_POSITION(pattern_select, 0x2B4);
ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
ASSERT_REG_POSITION(color_pattern, 0x300);
ASSERT_REG_POSITION(render_solid, 0x580);
ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
ASSERT_REG_POSITION(big_endian_control, 0x870);
ASSERT_REG_POSITION(pixels_from_memory, 0x880);
#undef ASSERT_REG_POSITION

View File

@ -58,24 +58,6 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun
}
}
Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value();
ASSERT(cbuf_mask[regs.tex_cb_index]);
const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index];
ASSERT(texinfo.Address() != 0);
const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle);
ASSERT(address < texinfo.Address() + texinfo.size);
const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)};
return GetTextureInfo(tex_handle);
}
Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const {
return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
}
u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
ASSERT(stage == ShaderType::Compute);
const auto& buffer = launch_description.const_buffer_config[const_buffer];
@ -98,9 +80,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}

View File

@ -209,11 +209,6 @@ public:
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
Texture::FullTextureInfo GetTexture(std::size_t offset) const;
/// Given a texture handle, returns the TSC and TIC entries.
Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;

View File

@ -14,8 +14,8 @@
namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
: system{system}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
: system{system_}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::~KeplerMemory() = default;

View File

@ -35,7 +35,7 @@ namespace Tegra::Engines {
class KeplerMemory final : public EngineInterface {
public:
KeplerMemory(Core::System& system, MemoryManager& memory_manager);
explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
~KeplerMemory();
/// Write the value to the register identified by method.

View File

@ -2,7 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cinttypes>
#include <cstring>
#include <optional>
#include "common/assert.h"
@ -124,6 +123,116 @@ void Maxwell3D::InitializeRegisterDefaults() {
mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}
void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
macro_params.insert(macro_params.end(), base_start, base_start + amount);
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
}
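// For illustration: macro methods come in register/argument pairs, so a write
// to an even method such as 0xE24 starts (or selects) a macro, while a write
// to the odd 0xE25 would hit its ARGS register and trips the assert above if
// no macro is executing.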
u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
// Keep track of the register value in shadow_state when requested.
const auto control = shadow_state.shadow_ram_control;
if (control == Regs::ShadowRamControl::Track ||
control == Regs::ShadowRamControl::TrackWithFilter) {
shadow_state.reg_array[method] = argument;
return argument;
}
if (control == Regs::ShadowRamControl::Replay) {
return shadow_state.reg_array[method];
}
return argument;
}
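// Sketch of the three behaviours, assuming a register that previously recorded
// 0x5 while tracking was enabled:
//   Track / TrackWithFilter: a new write of 0x7 is recorded and 0x7 is used.
//   Replay:                  the write of 0x7 is ignored and 0x5 is used.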
void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
if (regs.reg_array[method] == argument) {
return;
}
regs.reg_array[method] = argument;
for (const auto& table : dirty.tables) {
dirty.flags[table[method]] = true;
}
}
void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
bool is_last_call) {
switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle):
return rasterizer->WaitForIdle();
case MAXWELL3D_REG_INDEX(shadow_ram_control):
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
return;
case MAXWELL3D_REG_INDEX(macros.data):
return macro_engine->AddCode(regs.macros.upload_address, argument);
case MAXWELL3D_REG_INDEX(macros.bind):
return ProcessMacroBind(argument);
case MAXWELL3D_REG_INDEX(firmware[4]):
return ProcessFirmwareCall4();
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
return StartCBData(method);
case MAXWELL3D_REG_INDEX(cb_bind[0]):
return ProcessCBBind(0);
case MAXWELL3D_REG_INDEX(cb_bind[1]):
return ProcessCBBind(1);
case MAXWELL3D_REG_INDEX(cb_bind[2]):
return ProcessCBBind(2);
case MAXWELL3D_REG_INDEX(cb_bind[3]):
return ProcessCBBind(3);
case MAXWELL3D_REG_INDEX(cb_bind[4]):
return ProcessCBBind(4);
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
return DrawArrays();
case MAXWELL3D_REG_INDEX(clear_buffers):
return ProcessClearBuffers();
case MAXWELL3D_REG_INDEX(query.query_get):
return ProcessQueryGet();
case MAXWELL3D_REG_INDEX(condition.mode):
return ProcessQueryCondition();
case MAXWELL3D_REG_INDEX(counter_reset):
return ProcessCounterReset();
case MAXWELL3D_REG_INDEX(sync_info):
return ProcessSyncPoint();
case MAXWELL3D_REG_INDEX(exec_upload):
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call);
if (is_last_call) {
OnMemoryWrite();
}
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
return rasterizer->FragmentBarrier();
case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
return rasterizer->TiledCacheBarrier();
}
}
void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
// Reset the current macro.
executing_macro = 0;
@ -157,142 +266,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
// Methods after 0xE00 are special; they are actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
// We're trying to execute a macro
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
macro_params.push_back(method_argument);
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
ProcessMacro(method, &method_argument, 1, is_last_call);
return;
}
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Maxwell3D register, increase the size of the Regs structure");
u32 arg = method_argument;
// Keep track of the register value in shadow_state when requested.
if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track ||
shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) {
shadow_state.reg_array[method] = arg;
} else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) {
arg = shadow_state.reg_array[method];
}
if (regs.reg_array[method] != arg) {
regs.reg_array[method] = arg;
for (const auto& table : dirty.tables) {
dirty.flags[table[method]] = true;
}
}
switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle): {
rasterizer->WaitForIdle();
break;
}
case MAXWELL3D_REG_INDEX(shadow_ram_control): {
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument);
break;
}
case MAXWELL3D_REG_INDEX(macros.data): {
macro_engine->AddCode(regs.macros.upload_address, arg);
break;
}
case MAXWELL3D_REG_INDEX(macros.bind): {
ProcessMacroBind(arg);
break;
}
case MAXWELL3D_REG_INDEX(firmware[4]): {
ProcessFirmwareCall4();
break;
}
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
StartCBData(method);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[0]): {
ProcessCBBind(0);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[1]): {
ProcessCBBind(1);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[2]): {
ProcessCBBind(2);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[3]): {
ProcessCBBind(3);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[4]): {
ProcessCBBind(4);
break;
}
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): {
DrawArrays();
break;
}
case MAXWELL3D_REG_INDEX(clear_buffers): {
ProcessClearBuffers();
break;
}
case MAXWELL3D_REG_INDEX(query.query_get): {
ProcessQueryGet();
break;
}
case MAXWELL3D_REG_INDEX(condition.mode): {
ProcessQueryCondition();
break;
}
case MAXWELL3D_REG_INDEX(counter_reset): {
ProcessCounterReset();
break;
}
case MAXWELL3D_REG_INDEX(sync_info): {
ProcessSyncPoint();
break;
}
case MAXWELL3D_REG_INDEX(exec_upload): {
upload_state.ProcessExec(regs.exec_upload.linear != 0);
break;
}
case MAXWELL3D_REG_INDEX(data_upload): {
upload_state.ProcessData(arg, is_last_call);
if (is_last_call) {
OnMemoryWrite();
}
break;
}
default:
break;
}
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
ProcessMethodCall(method, argument, method_argument, is_last_call);
}
void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
@ -300,23 +283,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
// Methods after 0xE00 are special; they are actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
// We're trying to execute a macro
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
for (std::size_t i = 0; i < amount; i++) {
macro_params.push_back(base_start[i]);
}
// Call the macro when there are no more parameters in the command buffer
if (amount == methods_pending) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
ProcessMacro(method, base_start, amount, amount == methods_pending);
return;
}
switch (method) {
@ -335,15 +302,14 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
ProcessCBMultiData(method, base_start, amount);
break;
}
default: {
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
}
break;
}
}
@ -396,7 +362,7 @@ void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) {
}
void Maxwell3D::FlushMMEInlineDraw() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
@ -541,8 +507,7 @@ void Maxwell3D::ProcessCounterReset() {
rasterizer->ResetCounter(QueryType::SamplesPassed);
break;
default:
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
static_cast<int>(regs.counter_reset));
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.counter_reset);
break;
}
}
@ -557,7 +522,7 @@ void Maxwell3D::ProcessSyncPoint() {
}
void Maxwell3D::DrawArrays() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
@ -595,12 +560,12 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
return 0;
case Regs::QuerySelect::SamplesPassed:
// Deferred.
rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed,
system.GPU().GetTicks());
return std::nullopt;
default:
LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
regs.query.query_get.select.Value());
return 1;
}
}
@ -677,7 +642,7 @@ void Maxwell3D::FinishCBData() {
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};
Texture::TICEntry tic_entry;
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
@ -686,43 +651,19 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
}
Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};
Texture::TSCEntry tsc_entry;
memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
return tsc_entry;
}
Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const {
return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
}
Texture::FullTextureInfo Maxwell3D::GetStageTexture(ShaderType stage, std::size_t offset) const {
const auto stage_index = static_cast<std::size_t>(stage);
const auto& shader = state.shader_stages[stage_index];
const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
const GPUVAddr tex_info_address =
tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
return GetTextureInfo(tex_handle);
}
u32 Maxwell3D::GetRegisterValue(u32 method) const {
ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
return regs.reg_array[method];
}
void Maxwell3D::ProcessClearBuffers() {
ASSERT(regs.clear_buffers.R == regs.clear_buffers.G &&
regs.clear_buffers.R == regs.clear_buffers.B &&
regs.clear_buffers.R == regs.clear_buffers.A);
rasterizer->Clear();
}
@ -730,9 +671,7 @@ u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offse
ASSERT(stage != ShaderType::Compute);
const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
const auto& buffer = shader_stage.const_buffers[const_buffer];
u32 result;
std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32));
return result;
return memory_manager.Read<u32>(buffer.address + offset);
}
SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
@ -750,9 +689,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}

View File

@ -438,16 +438,6 @@ public:
DecrWrapOGL = 0x8508,
};
enum class MemoryLayout : u32 {
Linear = 0,
BlockLinear = 1,
};
enum class InvMemoryLayout : u32 {
BlockLinear = 0,
Linear = 1,
};
enum class CounterReset : u32 {
SampleCnt = 0x01,
Unk02 = 0x02,
@ -589,21 +579,31 @@ public:
NegativeW = 7,
};
enum class SamplerIndex : u32 {
Independently = 0,
ViaHeaderIndex = 1,
};
struct TileMode {
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
BitField<12, 1, u32> is_pitch_linear;
BitField<16, 1, u32> is_3d;
};
};
static_assert(sizeof(TileMode) == 4);
struct RenderTargetConfig {
u32 address_high;
u32 address_low;
u32 width;
u32 height;
Tegra::RenderTargetFormat format;
TileMode tile_mode;
union {
BitField<0, 3, u32> block_width;
BitField<4, 3, u32> block_height;
BitField<8, 3, u32> block_depth;
BitField<12, 1, InvMemoryLayout> type;
BitField<16, 1, u32> is_3d;
} memory_layout;
union {
BitField<0, 16, u32> layers;
BitField<0, 16, u32> depth;
BitField<16, 1, u32> volume;
};
u32 layer_stride;
@ -755,7 +755,11 @@ public:
u32 data_upload;
INSERT_UNION_PADDING_WORDS(0x44);
INSERT_UNION_PADDING_WORDS(0x16);
u32 force_early_fragment_tests;
INSERT_UNION_PADDING_WORDS(0x2D);
struct {
union {
@ -828,7 +832,11 @@ public:
u32 patch_vertices;
INSERT_UNION_PADDING_WORDS(0xC);
INSERT_UNION_PADDING_WORDS(0x4);
u32 fragment_barrier;
INSERT_UNION_PADDING_WORDS(0x7);
std::array<ScissorTest, NumViewports> scissor_test;
@ -838,7 +846,15 @@ public:
u32 stencil_back_mask;
u32 stencil_back_func_mask;
INSERT_UNION_PADDING_WORDS(0xC);
INSERT_UNION_PADDING_WORDS(0x5);
u32 invalidate_texture_data_cache;
INSERT_UNION_PADDING_WORDS(0x1);
u32 tiled_cache_barrier;
INSERT_UNION_PADDING_WORDS(0x4);
u32 color_mask_common;
@ -862,12 +878,7 @@ public:
u32 address_high;
u32 address_low;
Tegra::DepthFormat format;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
BitField<20, 1, InvMemoryLayout> type;
} memory_layout;
TileMode tile_mode;
u32 layer_stride;
GPUVAddr Address() const {
@ -876,7 +887,18 @@ public:
}
} zeta;
INSERT_UNION_PADDING_WORDS(0x41);
struct {
union {
BitField<0, 16, u32> x;
BitField<16, 16, u32> width;
};
union {
BitField<0, 16, u32> y;
BitField<16, 16, u32> height;
};
} render_area;
INSERT_UNION_PADDING_WORDS(0x3F);
union {
BitField<0, 4, u32> stencil;
@ -917,7 +939,7 @@ public:
BitField<25, 3, u32> map_7;
};
u32 GetMap(std::size_t index) const {
u32 Map(std::size_t index) const {
const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3,
map_4, map_5, map_6, map_7};
ASSERT(index < maps.size());
@ -930,11 +952,13 @@ public:
u32 zeta_width;
u32 zeta_height;
union {
BitField<0, 16, u32> zeta_layers;
BitField<0, 16, u32> zeta_depth;
BitField<16, 1, u32> zeta_volume;
};
INSERT_UNION_PADDING_WORDS(0x26);
SamplerIndex sampler_index;
INSERT_UNION_PADDING_WORDS(0x25);
u32 depth_test_enable;
@ -960,6 +984,7 @@ public:
float b;
float a;
} blend_color;
INSERT_UNION_PADDING_WORDS(0x4);
struct {
@ -997,7 +1022,12 @@ public:
float line_width_smooth;
float line_width_aliased;
INSERT_UNION_PADDING_WORDS(0x1F);
INSERT_UNION_PADDING_WORDS(0x1B);
u32 invalidate_sampler_cache_no_wfi;
u32 invalidate_texture_header_cache_no_wfi;
INSERT_UNION_PADDING_WORDS(0x2);
u32 vb_element_base;
u32 vb_base_instance;
@ -1041,13 +1071,13 @@ public:
} condition;
struct {
u32 tsc_address_high;
u32 tsc_address_low;
u32 tsc_limit;
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr TSCAddress() const {
return static_cast<GPUVAddr>(
(static_cast<GPUVAddr>(tsc_address_high) << 32) | tsc_address_low);
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tsc;
@ -1058,13 +1088,13 @@ public:
u32 line_smooth_enable;
struct {
u32 tic_address_high;
u32 tic_address_low;
u32 tic_limit;
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr TICAddress() const {
return static_cast<GPUVAddr>(
(static_cast<GPUVAddr>(tic_address_high) << 32) | tic_address_low);
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tic;
@ -1393,12 +1423,6 @@ public:
void FlushMMEInlineDraw();
/// Given a texture handle, returns the TSC and TIC entries.
Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
/// Returns the texture information for a specific texture in a specific shader stage.
Texture::FullTextureInfo GetStageTexture(ShaderType stage, std::size_t offset) const;
u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
@ -1461,38 +1485,13 @@ public:
private:
void InitializeRegisterDefaults();
Core::System& system;
MemoryManager& memory_manager;
void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call);
VideoCore::RasterizerInterface* rasterizer = nullptr;
u32 ProcessShadowRam(u32 method, u32 argument);
/// Start offsets of each macro in macro_memory
std::array<u32, 0x80> macro_positions = {};
void ProcessDirtyRegisters(u32 method, u32 argument);
std::array<bool, Regs::NUM_REGS> mme_inline{};
/// Macro method that is currently being executed / being fed parameters.
u32 executing_macro = 0;
/// Parameters that have been submitted to the macro call so far.
std::vector<u32> macro_params;
/// Interpreter for the macro codes uploaded to the GPU.
std::unique_ptr<MacroEngine> macro_engine;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct {
std::array<std::array<u32, 0x4000>, 16> buffer;
u32 current{null_cb_data};
u32 id{null_cb_data};
u32 start_pos{};
u32 counter{};
} cb_data_state;
Upload::State upload_state;
bool execute_on{true};
std::array<u8, Regs::NUM_REGS> dirty_pointers{};
void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
@ -1502,8 +1501,8 @@ private:
/**
* Call a macro on this engine.
*
* @param method Method to call
* @param num_parameters Number of arguments
* @param parameters Arguments to the method call
*/
void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
@ -1552,6 +1551,38 @@ private:
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult();
Core::System& system;
MemoryManager& memory_manager;
VideoCore::RasterizerInterface* rasterizer = nullptr;
/// Start offsets of each macro in macro_memory
std::array<u32, 0x80> macro_positions{};
std::array<bool, Regs::NUM_REGS> mme_inline{};
/// Macro method that is currently being executed / being fed parameters.
u32 executing_macro = 0;
/// Parameters that have been submitted to the macro call so far.
std::vector<u32> macro_params;
/// Interpreter for the macro codes uploaded to the GPU.
std::unique_ptr<MacroEngine> macro_engine;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct CBDataState {
std::array<std::array<u32, 0x4000>, 16> buffer;
u32 current{null_cb_data};
u32 id{null_cb_data};
u32 start_pos{};
u32 counter{};
};
CBDataState cb_data_state;
Upload::State upload_state;
bool execute_on{true};
};
#define ASSERT_REG_POSITION(field_name, position) \
@ -1564,6 +1595,7 @@ ASSERT_REG_POSITION(shadow_ram_control, 0x49);
ASSERT_REG_POSITION(upload, 0x60);
ASSERT_REG_POSITION(exec_upload, 0x6C);
ASSERT_REG_POSITION(data_upload, 0x6D);
ASSERT_REG_POSITION(force_early_fragment_tests, 0x84);
ASSERT_REG_POSITION(sync_info, 0xB2);
ASSERT_REG_POSITION(tess_mode, 0xC8);
ASSERT_REG_POSITION(tess_level_outer, 0xC9);
@ -1586,10 +1618,13 @@ ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
ASSERT_REG_POSITION(patch_vertices, 0x373);
ASSERT_REG_POSITION(fragment_barrier, 0x378);
ASSERT_REG_POSITION(scissor_test, 0x380);
ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
ASSERT_REG_POSITION(invalidate_texture_data_cache, 0x3DD);
ASSERT_REG_POSITION(tiled_cache_barrier, 0x3DF);
ASSERT_REG_POSITION(color_mask_common, 0x3E4);
ASSERT_REG_POSITION(depth_bounds, 0x3E7);
ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
@ -1597,6 +1632,7 @@ ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED);
ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE);
ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF);
ASSERT_REG_POSITION(zeta, 0x3F8);
ASSERT_REG_POSITION(render_area, 0x3FD);
ASSERT_REG_POSITION(clear_flags, 0x43E);
ASSERT_REG_POSITION(fill_rectangle, 0x44F);
ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
@ -1605,7 +1641,8 @@ ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E);
ASSERT_REG_POSITION(rt_control, 0x487);
ASSERT_REG_POSITION(zeta_width, 0x48a);
ASSERT_REG_POSITION(zeta_height, 0x48b);
ASSERT_REG_POSITION(zeta_layers, 0x48c);
ASSERT_REG_POSITION(zeta_depth, 0x48c);
ASSERT_REG_POSITION(sampler_index, 0x48D);
ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@ -1629,6 +1666,8 @@ ASSERT_REG_POSITION(frag_color_clamp, 0x4EA);
ASSERT_REG_POSITION(screen_y_control, 0x4EB);
ASSERT_REG_POSITION(line_width_smooth, 0x4EC);
ASSERT_REG_POSITION(line_width_aliased, 0x4ED);
ASSERT_REG_POSITION(invalidate_sampler_cache_no_wfi, 0x509);
ASSERT_REG_POSITION(invalidate_texture_header_cache_no_wfi, 0x50A);
ASSERT_REG_POSITION(vb_element_base, 0x50D);
ASSERT_REG_POSITION(vb_base_instance, 0x50E);
ASSERT_REG_POSITION(clip_distance_enabled, 0x544);

View File

@ -16,8 +16,10 @@ namespace Tegra::Engines {
using namespace Texture;
MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
: system{system}, memory_manager{memory_manager} {}
MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
: system{system_}, memory_manager{memory_manager_} {}
MaxwellDMA::~MaxwellDMA() = default;
void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
@ -94,6 +96,7 @@ void MaxwellDMA::CopyPitchToPitch() {
}
void MaxwellDMA::CopyBlockLinearToPitch() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
@ -114,8 +117,6 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
const u32 block_depth = src_params.block_size.depth;
const size_t src_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
const size_t src_layer_size =
CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
@ -135,6 +136,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
}
void MaxwellDMA::CopyPitchToBlockLinear() {
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
const auto& dst_params = regs.dst_params;
const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
const u32 width = dst_params.width;

View File

@ -72,11 +72,13 @@ public:
struct RenderEnable {
enum class Mode : u32 {
FALSE = 0,
TRUE = 1,
CONDITIONAL = 2,
RENDER_IF_EQUAL = 3,
RENDER_IF_NOT_EQUAL = 4,
// Note: This uses Pascal case in order to avoid the identifiers
// FALSE and TRUE, which are reserved on Darwin.
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
PackedGPUVAddr address;
@ -185,8 +187,8 @@ public:
};
static_assert(sizeof(RemapConst) == 12);
explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
~MaxwellDMA() = default;
explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_);
~MaxwellDMA();
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;

View File

@ -32,31 +32,31 @@ struct Register {
constexpr Register() = default;
constexpr Register(u64 value) : value(value) {}
constexpr Register(u64 value_) : value(value_) {}
constexpr operator u64() const {
[[nodiscard]] constexpr operator u64() const {
return value;
}
template <typename T>
constexpr u64 operator-(const T& oth) const {
[[nodiscard]] constexpr u64 operator-(const T& oth) const {
return value - oth;
}
template <typename T>
constexpr u64 operator&(const T& oth) const {
[[nodiscard]] constexpr u64 operator&(const T& oth) const {
return value & oth;
}
constexpr u64 operator&(const Register& oth) const {
[[nodiscard]] constexpr u64 operator&(const Register& oth) const {
return value & oth.value;
}
constexpr u64 operator~() const {
[[nodiscard]] constexpr u64 operator~() const {
return ~value;
}
u64 GetSwizzledIndex(u64 elem) const {
[[nodiscard]] u64 GetSwizzledIndex(u64 elem) const {
elem = (value + elem) & 3;
return (value & ~3) + elem;
}
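// Worked example (values chosen for illustration): for a register value of 5
// and elem == 2, (5 + 2) & 3 == 3 and (5 & ~3) + 3 == 7, so the access lands
// on R7 inside the aligned group R4-R7.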
@ -75,7 +75,7 @@ enum class AttributeSize : u64 {
union Attribute {
Attribute() = default;
constexpr explicit Attribute(u64 value) : value(value) {}
constexpr explicit Attribute(u64 value_) : value(value_) {}
enum class Index : u64 {
LayerViewportPointSize = 6,
@ -107,7 +107,7 @@ union Attribute {
BitField<31, 1, u64> patch;
BitField<47, 3, AttributeSize> size;
bool IsPhysical() const {
[[nodiscard]] bool IsPhysical() const {
return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0;
}
} fmt20;
@ -124,7 +124,7 @@ union Attribute {
union Sampler {
Sampler() = default;
constexpr explicit Sampler(u64 value) : value(value) {}
constexpr explicit Sampler(u64 value_) : value(value_) {}
enum class Index : u64 {
Sampler_0 = 8,
@ -137,7 +137,7 @@ union Sampler {
union Image {
Image() = default;
constexpr explicit Image(u64 value) : value{value} {}
constexpr explicit Image(u64 value_) : value{value_} {}
BitField<36, 13, u64> index;
u64 value;
@ -505,14 +505,14 @@ struct IpaMode {
IpaInterpMode interpolation_mode;
IpaSampleMode sampling_mode;
bool operator==(const IpaMode& a) const {
[[nodiscard]] bool operator==(const IpaMode& a) const {
return std::tie(interpolation_mode, sampling_mode) ==
std::tie(a.interpolation_mode, a.sampling_mode);
}
bool operator!=(const IpaMode& a) const {
[[nodiscard]] bool operator!=(const IpaMode& a) const {
return !operator==(a);
}
bool operator<(const IpaMode& a) const {
[[nodiscard]] bool operator<(const IpaMode& a) const {
return std::tie(interpolation_mode, sampling_mode) <
std::tie(a.interpolation_mode, a.sampling_mode);
}
@ -658,10 +658,10 @@ union Instruction {
return *this;
}
constexpr Instruction(u64 value) : value{value} {}
constexpr Instruction(u64 value_) : value{value_} {}
constexpr Instruction(const Instruction& instr) : value(instr.value) {}
constexpr bool Bit(u64 offset) const {
[[nodiscard]] constexpr bool Bit(u64 offset) const {
return ((value >> offset) & 1) != 0;
}
@ -746,34 +746,34 @@ union Instruction {
BitField<28, 8, u64> imm_lut28;
BitField<48, 8, u64> imm_lut48;
u32 GetImmLut28() const {
[[nodiscard]] u32 GetImmLut28() const {
return static_cast<u32>(imm_lut28);
}
u32 GetImmLut48() const {
[[nodiscard]] u32 GetImmLut48() const {
return static_cast<u32>(imm_lut48);
}
} lop3;
u16 GetImm20_16() const {
[[nodiscard]] u16 GetImm20_16() const {
return static_cast<u16>(imm20_16);
}
u32 GetImm20_19() const {
[[nodiscard]] u32 GetImm20_19() const {
u32 imm{static_cast<u32>(imm20_19)};
imm <<= 12;
imm |= negate_imm ? 0x80000000 : 0;
return imm;
}
u32 GetImm20_32() const {
[[nodiscard]] u32 GetImm20_32() const {
return static_cast<u32>(imm20_32);
}
s32 GetSignedImm20_20() const {
u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19));
[[nodiscard]] s32 GetSignedImm20_20() const {
const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19));
// Sign extend the 20-bit value.
u32 mask = 1U << (20 - 1);
const auto mask = 1U << (20 - 1);
return static_cast<s32>((immediate ^ mask) - mask);
}
} alu;
@ -857,7 +857,7 @@ union Instruction {
BitField<56, 1, u64> second_negate;
BitField<30, 9, u64> second;
u32 PackImmediates() const {
[[nodiscard]] u32 PackImmediates() const {
// Immediates are half floats shifted.
constexpr u32 imm_shift = 6;
return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift)));
@ -1033,7 +1033,7 @@ union Instruction {
BitField<28, 2, AtomicType> type;
BitField<30, 22, s64> offset;
s32 GetImmediateOffset() const {
[[nodiscard]] s32 GetImmediateOffset() const {
return static_cast<s32>(offset << 2);
}
} atoms;
@ -1215,7 +1215,7 @@ union Instruction {
BitField<39, 4, u64> rounding;
// H0, H1 extract for F16 missing
BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
F2fRoundingOp GetRoundingMode() const {
[[nodiscard]] F2fRoundingOp GetRoundingMode() const {
constexpr u64 rounding_mask = 0x0B;
return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask);
}
@ -1239,15 +1239,15 @@ union Instruction {
BitField<54, 1, u64> aoffi_flag;
BitField<55, 3, TextureProcessMode> process_mode;
bool IsComponentEnabled(std::size_t component) const {
return ((1ull << component) & component_mask) != 0;
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1271,15 +1271,15 @@ union Instruction {
BitField<36, 1, u64> aoffi_flag;
BitField<37, 3, TextureProcessMode> process_mode;
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1299,7 +1299,7 @@ union Instruction {
BitField<31, 4, u64> component_mask;
BitField<49, 1, u64> nodep_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NODEP:
return nodep_flag != 0;
@ -1309,7 +1309,7 @@ union Instruction {
return false;
}
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
} txq;
@ -1321,11 +1321,11 @@ union Instruction {
BitField<35, 1, u64> ndv_flag;
BitField<49, 1, u64> nodep_flag;
bool IsComponentEnabled(std::size_t component) const {
return ((1ull << component) & component_mask) != 0;
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return (ndv_flag != 0);
@ -1347,7 +1347,7 @@ union Instruction {
BitField<54, 2, u64> offset_mode;
BitField<56, 2, u64> component;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return ndv_flag != 0;
@ -1373,7 +1373,7 @@ union Instruction {
BitField<33, 2, u64> offset_mode;
BitField<37, 2, u64> component;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return ndv_flag != 0;
@ -1399,7 +1399,7 @@ union Instruction {
BitField<52, 2, u64> component;
BitField<55, 1, u64> fp16_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1422,24 +1422,27 @@ union Instruction {
BitField<53, 4, u64> texture_info;
BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
[[nodiscard]] TextureType GetTextureType() const {
// The TEXS instruction has a weird encoding for the texture type.
if (texture_info == 0)
if (texture_info == 0) {
return TextureType::Texture1D;
if (texture_info >= 1 && texture_info <= 9)
}
if (texture_info >= 1 && texture_info <= 9) {
return TextureType::Texture2D;
if (texture_info >= 10 && texture_info <= 11)
}
if (texture_info >= 10 && texture_info <= 11) {
return TextureType::Texture3D;
if (texture_info >= 12 && texture_info <= 13)
}
if (texture_info >= 12 && texture_info <= 13) {
return TextureType::TextureCube;
}
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}",
static_cast<u32>(texture_info.Value()));
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value());
UNREACHABLE();
return TextureType::Texture1D;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
switch (texture_info) {
case 0:
case 2:
@ -1458,7 +1461,7 @@ union Instruction {
return TextureProcessMode::None;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return (texture_info >= 4 && texture_info <= 6) || texture_info == 9;
@ -1470,16 +1473,16 @@ union Instruction {
return false;
}
bool IsArrayTexture() const {
[[nodiscard]] bool IsArrayTexture() const {
// TEXS only supports Texture2D arrays.
return texture_info >= 7 && texture_info <= 9;
}
bool HasTwoDestinations() const {
[[nodiscard]] bool HasTwoDestinations() const {
return gpr28.Value() != Register::ZeroIndex;
}
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
{},
{0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
@ -1506,7 +1509,7 @@ union Instruction {
BitField<54, 1, u64> cl;
BitField<55, 1, u64> process_mode;
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL;
}
} tld;
@ -1516,7 +1519,7 @@ union Instruction {
BitField<53, 4, u64> texture_info;
BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
[[nodiscard]] TextureType GetTextureType() const {
// The TLDS instruction has a weird encoding for the texture type.
if (texture_info <= 1) {
return TextureType::Texture1D;
@ -1529,19 +1532,19 @@ union Instruction {
return TextureType::Texture3D;
}
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}",
static_cast<u32>(texture_info.Value()));
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value());
UNREACHABLE();
return TextureType::Texture1D;
}
TextureProcessMode GetTextureProcessMode() const {
if (texture_info == 1 || texture_info == 5 || texture_info == 12)
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
if (texture_info == 1 || texture_info == 5 || texture_info == 12) {
return TextureProcessMode::LL;
}
return TextureProcessMode::LZ;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::AOFFI:
return texture_info == 12 || texture_info == 4;
@ -1555,7 +1558,7 @@ union Instruction {
return false;
}
bool IsArrayTexture() const {
[[nodiscard]] bool IsArrayTexture() const {
// TLDS only supports Texture2D arrays.
return texture_info == 8;
}
@ -1567,7 +1570,7 @@ union Instruction {
BitField<35, 1, u64> aoffi_flag;
BitField<49, 1, u64> nodep_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::AOFFI:
return aoffi_flag != 0;
@ -1591,7 +1594,7 @@ union Instruction {
BitField<20, 3, StoreType> store_data_layout;
BitField<20, 4, u64> component_mask_selector;
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
ASSERT(mode == SurfaceDataMode::P);
constexpr u8 R = 0b0001;
constexpr u8 G = 0b0010;
@ -1604,7 +1607,7 @@ union Instruction {
return std::bitset<4>{mask.at(component_mask_selector)}.test(component);
}
StoreType GetStoreDataLayout() const {
[[nodiscard]] StoreType GetStoreDataLayout() const {
ASSERT(mode == SurfaceDataMode::D_BA);
return store_data_layout;
}
@ -1622,14 +1625,15 @@ union Instruction {
BitField<20, 24, u64> target;
BitField<5, 1, u64> constant_buffer;
s32 GetBranchTarget() const {
[[nodiscard]] s32 GetBranchTarget() const {
// Sign extend the branch target offset
u32 mask = 1U << (24 - 1);
u32 value = static_cast<u32>(target);
const auto mask = 1U << (24 - 1);
const auto target_value = static_cast<u32>(target);
constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction));
// The branch offset is relative to the next instruction and is stored in bytes, so
// divide it by the size of an instruction and add 1 to it.
return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) +
1;
return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1;
}
} bra;
@ -1637,14 +1641,15 @@ union Instruction {
BitField<20, 24, u64> target;
BitField<5, 1, u64> constant_buffer;
s32 GetBranchExtend() const {
[[nodiscard]] s32 GetBranchExtend() const {
// Sign extend the branch target offset
u32 mask = 1U << (24 - 1);
u32 value = static_cast<u32>(target);
const auto mask = 1U << (24 - 1);
const auto target_value = static_cast<u32>(target);
constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction));
// The branch offset is relative to the next instruction and is stored in bytes, so
// divide it by the size of an instruction and add 1 to it.
return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) +
1;
return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1;
}
} brx;
@ -1697,7 +1702,7 @@ union Instruction {
BitField<50, 1, u64> is_op_b_register;
BitField<51, 3, VmnmxOperation> operation;
VmnmxType SourceFormatA() const {
[[nodiscard]] VmnmxType SourceFormatA() const {
switch (src_format_a) {
case 0b11:
return VmnmxType::Bits32;
@ -1708,7 +1713,7 @@ union Instruction {
}
}
VmnmxType SourceFormatB() const {
[[nodiscard]] VmnmxType SourceFormatB() const {
switch (src_format_b) {
case 0b11:
return VmnmxType::Bits32;
@ -1739,7 +1744,7 @@ union Instruction {
BitField<20, 14, u64> shifted_offset;
BitField<34, 5, u64> index;
u64 GetOffset() const {
[[nodiscard]] u64 GetOffset() const {
return shifted_offset * 4;
}
} cbuf34;
@ -1748,7 +1753,7 @@ union Instruction {
BitField<20, 16, s64> offset;
BitField<36, 5, u64> index;
s64 GetOffset() const {
[[nodiscard]] s64 GetOffset() const {
return offset;
}
} cbuf36;
@ -1893,6 +1898,7 @@ public:
ICMP_IMM,
FCMP_RR,
FCMP_RC,
FCMP_IMMR,
MUFU, // Multi-Function Operator
RRO_C, // Range Reduction Operator
RRO_R,
@ -1996,29 +2002,29 @@ public:
/// Returns whether an opcode has an execution predicate field or not (i.e., whether it can be
/// conditionally executed).
static bool IsPredicatedInstruction(Id opcode) {
[[nodiscard]] static bool IsPredicatedInstruction(Id opcode) {
// TODO(Subv): Add the rest of unpredicated instructions.
return opcode != Id::SSY && opcode != Id::PBK;
}
class Matcher {
public:
constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type)
: name{name}, mask{mask}, expected{expected}, id{id}, type{type} {}
constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_)
: name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {}
constexpr const char* GetName() const {
[[nodiscard]] constexpr const char* GetName() const {
return name;
}
constexpr u16 GetMask() const {
[[nodiscard]] constexpr u16 GetMask() const {
return mask;
}
constexpr Id GetId() const {
[[nodiscard]] constexpr Id GetId() const {
return id;
}
constexpr Type GetType() const {
[[nodiscard]] constexpr Type GetType() const {
return type;
}
@ -2027,7 +2033,7 @@ public:
* @param instruction The instruction to test
* @returns true if the given instruction matches.
*/
constexpr bool Matches(u16 instruction) const {
[[nodiscard]] constexpr bool Matches(u16 instruction) const {
return (instruction & mask) == expected;
}
@ -2039,7 +2045,8 @@ public:
Type type;
};
static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) {
using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>;
[[nodiscard]] static DecodeResult Decode(Instruction instr) {
static const auto table{GetDecodeTable()};
const auto matches_instruction = [instr](const auto& matcher) {
@ -2061,7 +2068,7 @@ private:
* A '0' in a bitstring indicates that a zero must be present at that bit position.
* A '1' in a bitstring indicates that a one must be present at that bit position.
*/
static constexpr auto GetMaskAndExpect(const char* const bitstring) {
[[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) {
u16 mask = 0, expect = 0;
for (std::size_t i = 0; i < opcode_bitsize; i++) {
const std::size_t bit_position = opcode_bitsize - i - 1;
@ -2083,14 +2090,14 @@ private:
public:
/// Creates a matcher that can match and parse instructions based on bitstring.
static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type,
const char* const name) {
[[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op,
Type type, const char* const name) {
const auto [mask, expected] = GetMaskAndExpect(bitstring);
return Matcher(name, mask, expected, op, type);
}
};
static std::vector<Matcher> GetDecodeTable() {
[[nodiscard]] static std::vector<Matcher> GetDecodeTable() {
std::vector<Matcher> table = {
#define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name)
INST("111000110011----", Id::KIL, Type::Flow, "KIL"),
@ -2205,6 +2212,7 @@ private:
INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),

View File

@ -41,30 +41,30 @@ struct Header {
BitField<26, 1, u32> does_load_or_store;
BitField<27, 1, u32> does_fp64;
BitField<28, 4, u32> stream_out_mask;
} common0{};
} common0;
union {
BitField<0, 24, u32> shader_local_memory_low_size;
BitField<24, 8, u32> per_patch_attribute_count;
} common1{};
} common1;
union {
BitField<0, 24, u32> shader_local_memory_high_size;
BitField<24, 8, u32> threads_per_input_primitive;
} common2{};
} common2;
union {
BitField<0, 24, u32> shader_local_memory_crs_size;
BitField<24, 4, OutputTopology> output_topology;
BitField<28, 4, u32> reserved;
} common3{};
} common3;
union {
BitField<0, 12, u32> max_output_vertices;
BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
BitField<20, 4, u32> reserved;
BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
} common4{};
} common4;
union {
struct {
@ -145,7 +145,7 @@ struct Header {
}
} ps;
std::array<u32, 0xF> raw{};
std::array<u32, 0xF> raw;
};
u64 GetLocalMemorySize() const {
@ -153,7 +153,6 @@ struct Header {
(common2.shader_local_memory_high_size << 24));
}
};
static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
} // namespace Tegra::Shader
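
GetLocalMemorySize above splices the 24-bit low field (common1) with the 8-bit high field (common2) into a single byte count; a self-checking sketch of the same arithmetic:

    constexpr u64 SpliceLocalMemorySize(u32 low24, u32 high8) {
        return low24 | (static_cast<u64>(high8) << 24);
    }
    static_assert(SpliceLocalMemorySize(0x345678, 0x12) == 0x12345678);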

View File

@ -9,6 +9,7 @@
#include "common/common_types.h"
#include "core/core.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
@ -17,11 +18,11 @@ namespace VideoCommon {
class FenceBase {
public:
FenceBase(u32 payload, bool is_stubbed)
: address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {}
explicit FenceBase(u32 payload_, bool is_stubbed_)
: address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {}
FenceBase(GPUVAddr address, u32 payload, bool is_stubbed)
: address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {}
explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {}
GPUVAddr GetAddress() const {
return address;
@ -47,6 +48,11 @@ protected:
template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
class FenceManager {
public:
/// Notify the fence manager about a new frame
void TickFrame() {
delayed_destruction_ring.Tick();
}
void SignalSemaphore(GPUVAddr addr, u32 value) {
TryReleasePendingFences();
const bool should_flush = ShouldFlush();
@ -86,7 +92,7 @@ public:
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
fences.pop();
PopFence();
}
}
@ -132,7 +138,7 @@ private:
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
fences.pop();
PopFence();
}
}
@ -158,7 +164,14 @@ private:
query_cache.CommitAsyncFlushes();
}
void PopFence() {
delayed_destruction_ring.Push(std::move(fences.front()));
fences.pop();
}
std::queue<TFence> fences;
DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
};
} // namespace VideoCommon
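
PopFence above parks retired fences in a DelayedDestructionRing so they outlive in-flight GPU work by a fixed number of frames. A minimal sketch of what such a ring could look like (the real header is not shown in this diff; the shape is inferred from the Tick() and Push() call sites):

    #include <array>
    #include <cstddef>
    #include <utility>
    #include <vector>

    template <typename T, std::size_t TICKS_TO_DESTROY>
    class DelayedDestructionRing {
    public:
        // Advance one frame; destroy objects that survived the whole ring.
        void Tick() {
            index = (index + 1) % TICKS_TO_DESTROY;
            elements[index].clear();
        }
        // Take ownership; the object dies TICKS_TO_DESTROY ticks from now.
        void Push(T&& object) {
            elements[index].push_back(std::move(object));
        }

    private:
        std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
        std::size_t index = 0;
    };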

View File

@ -0,0 +1,31 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
namespace Tegra {
/**
* Struct describing framebuffer configuration
*/
struct FramebufferConfig {
enum class PixelFormat : u32 {
A8B8G8R8_UNORM = 1,
RGB565_UNORM = 4,
B8G8R8A8_UNORM = 5,
};
VAddr address{};
u32 offset{};
u32 width{};
u32 height{};
u32 stride{};
PixelFormat pixel_format{};
using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags{};
Common::Rectangle<int> crop_rect;
};
} // namespace Tegra
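
A hypothetical example of filling the struct for a 1280x720 ABGR8 frame; note that stride is the row pitch and may exceed width when rows are padded:

    Tegra::FramebufferConfig config{};
    config.address = 0xDEADBEEF; // placeholder guest VAddr, for illustration
    config.width = 1280;
    config.height = 720;
    config.stride = 1280;        // row pitch; >= width
    config.pixel_format = Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM;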

View File

@ -10,6 +10,7 @@
#include "core/core_timing.h"
#include "core/core_timing_util.h"
#include "core/frontend/emu_window.h"
#include "core/hardware_interrupt_manager.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/fermi_2d.h"
@ -27,15 +28,17 @@ namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
GPU::GPU(Core::System& system_, bool is_async_)
GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
: system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
fermi_2d{std::make_unique<Engines::Fermi2D>()},
kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {}
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
gpu_thread{system_, is_async_} {}
GPU::~GPU() = default;
@ -77,31 +80,46 @@ DmaPusher& GPU::DmaPusher() {
return *dma_pusher;
}
Tegra::CDmaPusher& GPU::CDmaPusher() {
return *cdma_pusher;
}
const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher;
}
const Tegra::CDmaPusher& GPU::CDmaPusher() const {
return *cdma_pusher;
}
void GPU::WaitFence(u32 syncpoint_id, u32 value) {
// A synchronous GPU is always in sync
if (!is_async) {
return;
}
if (syncpoint_id == UINT32_MAX) {
// TODO: Research what this does.
LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
return;
}
MICROPROFILE_SCOPE(GPU_wait);
std::unique_lock lock{sync_mutex};
sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; });
sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; });
}
void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
syncpoints[syncpoint_id]++;
auto& syncpoint = syncpoints.at(syncpoint_id);
syncpoint++;
std::lock_guard lock{sync_mutex};
sync_cv.notify_all();
if (!syncpt_interrupts[syncpoint_id].empty()) {
u32 value = syncpoints[syncpoint_id].load();
auto it = syncpt_interrupts[syncpoint_id].begin();
while (it != syncpt_interrupts[syncpoint_id].end()) {
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
if (!interrupt.empty()) {
u32 value = syncpoint.load();
auto it = interrupt.begin();
while (it != interrupt.end()) {
if (value >= *it) {
TriggerCpuInterrupt(syncpoint_id, *it);
it = syncpt_interrupts[syncpoint_id].erase(it);
it = interrupt.erase(it);
continue;
}
it++;
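
WaitFence and IncrementSyncPoint above form a standard condition-variable handshake around an atomic counter; reduced to its core for a single syncpoint (illustrative, u32 being the codebase's alias):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    std::atomic<u32> syncpoint{0};
    std::mutex sync_mutex;
    std::condition_variable sync_cv;

    void Wait(u32 value) { // mirrors GPU::WaitFence
        std::unique_lock lock{sync_mutex};
        sync_cv.wait(lock, [&] { return syncpoint.load() >= value; });
    }

    void Increment() { // mirrors GPU::IncrementSyncPoint
        ++syncpoint;
        std::lock_guard lock{sync_mutex};
        sync_cv.notify_all();
    }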
@ -110,22 +128,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
}
u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
return syncpoints[syncpoint_id].load();
return syncpoints.at(syncpoint_id).load();
}
void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
auto& interrupt = syncpt_interrupts[syncpoint_id];
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
bool contains = std::any_of(interrupt.begin(), interrupt.end(),
[value](u32 in_value) { return in_value == value; });
if (contains) {
return;
}
syncpt_interrupts[syncpoint_id].emplace_back(value);
interrupt.emplace_back(value);
}
bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
std::lock_guard lock{sync_mutex};
auto& interrupt = syncpt_interrupts[syncpoint_id];
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
const auto iter =
std::find_if(interrupt.begin(), interrupt.end(),
[value](u32 interrupt_value) { return value == interrupt_value; });
@ -182,34 +200,6 @@ void GPU::SyncGuestHost() {
renderer->Rasterizer().SyncGuestHost();
}
void GPU::OnCommandListEnd() {
renderer->Rasterizer().ReleaseFences();
}
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written multiplied by 4 in the docs. We do not multiply by 4 here, so
// the values you see in the docs may be 4 times larger than the ones used below.
enum class BufferMethods {
BindObject = 0x0,
Nop = 0x2,
SemaphoreAddressHigh = 0x4,
SemaphoreAddressLow = 0x5,
SemaphoreSequence = 0x6,
SemaphoreTrigger = 0x7,
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
FenceValue = 0x1C,
FenceAction = 0x1D,
Unk78 = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
NonPullerMethods = 0x40,
};
enum class GpuSemaphoreOperation {
AcquireEqual = 0x1,
WriteLong = 0x2,
@ -240,8 +230,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32
CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
} else {
for (std::size_t i = 0; i < amount; i++) {
CallPullerMethod(
{method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)});
CallPullerMethod(MethodCall{
method,
base_start[i],
subchannel,
methods_pending - static_cast<u32>(i),
});
}
}
}
@ -268,7 +262,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::UnkCacheFlush:
case BufferMethods::WrcacheFlush:
case BufferMethods::FenceValue:
break;
case BufferMethods::FenceAction:
ProcessFenceActionMethod();
break;
case BufferMethods::WaitForInterrupt:
ProcessWaitForInterruptMethod();
break;
case BufferMethods::SemaphoreTrigger: {
ProcessSemaphoreTriggerMethod();
@ -298,8 +297,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
break;
}
default:
LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented",
static_cast<u32>(method));
LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
break;
}
}
@ -378,10 +376,28 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) {
dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id));
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
}
}
void GPU::ProcessFenceActionMethod() {
switch (regs.fence_action.op) {
case FenceOperation::Acquire:
WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
break;
case FenceOperation::Increment:
IncrementSyncPoint(regs.fence_action.syncpoint_id);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
}
}
void GPU::ProcessWaitForInterruptMethod() {
// TODO(bunnei) ImplementMe
LOG_WARNING(HW_GPU, "(STUBBED) called");
}
void GPU::ProcessSemaphoreTriggerMethod() {
const auto semaphoreOperationMask = 0xF;
const auto op =
@ -443,4 +459,75 @@ void GPU::ProcessSemaphoreAcquire() {
}
}
void GPU::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent();
}
void GPU::ObtainContext() {
cpu_context->MakeCurrent();
}
void GPU::ReleaseContext() {
cpu_context->DoneCurrent();
}
void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
// This condition fires when a video stream ends; clear all intermediary data
if (entries[0].raw == 0xDEADB33F) {
cdma_pusher.reset();
return;
}
if (!cdma_pusher) {
cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
}
// SubmitCommandBuffer would make the nvdec operations asynchronous; this is not currently working
// TODO(ameerj): RE proper async nvdec operation
// gpu_thread.SubmitCommandBuffer(std::move(entries));
cdma_pusher->Push(std::move(entries));
cdma_pusher->DispatchCalls();
}
void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer);
}
void GPU::FlushRegion(VAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
void GPU::InvalidateRegion(VAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
void GPU::WaitIdle() const {
gpu_thread.WaitIdle();
}
void GPU::OnCommandListEnd() {
if (is_async) {
// This command only applies to asynchronous GPU mode
gpu_thread.OnCommandListEnd();
}
}
} // namespace Tegra

View File

@ -13,14 +13,17 @@
#include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
#include "video_core/dma_pusher.h"
#include "video_core/framebuffer_config.h"
#include "video_core/gpu_thread.h"
using CacheAddr = std::uintptr_t;
inline CacheAddr ToCacheAddr(const void* host_ptr) {
[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
return reinterpret_cast<CacheAddr>(host_ptr);
}
inline u8* FromCacheAddr(CacheAddr cache_addr) {
[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
return reinterpret_cast<u8*>(cache_addr);
}
@ -100,28 +103,6 @@ enum class DepthFormat : u32 {
struct CommandListHeader;
class DebugContext;
/**
* Struct describing framebuffer configuration
*/
struct FramebufferConfig {
enum class PixelFormat : u32 {
A8B8G8R8_UNORM = 1,
RGB565_UNORM = 4,
B8G8R8A8_UNORM = 5,
};
VAddr address;
u32 offset;
u32 width;
u32 height;
u32 stride;
PixelFormat pixel_format;
using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags;
Common::Rectangle<int> crop_rect;
};
namespace Engines {
class Fermi2D;
class Maxwell3D;
@ -140,7 +121,7 @@ enum class EngineID {
class MemoryManager;
class GPU {
class GPU final {
public:
struct MethodCall {
u32 method{};
@ -148,17 +129,17 @@ public:
u32 subchannel{};
u32 method_count{};
bool IsLastCall() const {
explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0)
: method(method_), argument(argument_), subchannel(subchannel_),
method_count(method_count_) {}
[[nodiscard]] bool IsLastCall() const {
return method_count <= 1;
}
MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0)
: method(method), argument(argument), subchannel(subchannel),
method_count(method_count) {}
};
explicit GPU(Core::System& system, bool is_async);
virtual ~GPU();
explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
~GPU();
/// Binds a renderer to the GPU.
void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
@ -175,13 +156,13 @@ public:
/// Synchronizes CPU writes with Host GPU memory.
void SyncGuestHost();
/// Signal the ending of command list.
virtual void OnCommandListEnd();
void OnCommandListEnd();
/// Request a host GPU memory flush from the CPU.
u64 RequestFlush(VAddr addr, std::size_t size);
[[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
/// Obtains current flush request fence id.
u64 CurrentFlushRequestFence() const {
[[nodiscard]] u64 CurrentFlushRequestFence() const {
return current_flush_fence.load(std::memory_order_relaxed);
}
@ -189,68 +170,100 @@ public:
void TickWork();
/// Returns a reference to the Maxwell3D GPU engine.
Engines::Maxwell3D& Maxwell3D();
[[nodiscard]] Engines::Maxwell3D& Maxwell3D();
/// Returns a const reference to the Maxwell3D GPU engine.
const Engines::Maxwell3D& Maxwell3D() const;
[[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const;
/// Returns a reference to the KeplerCompute GPU engine.
Engines::KeplerCompute& KeplerCompute();
[[nodiscard]] Engines::KeplerCompute& KeplerCompute();
/// Returns a reference to the KeplerCompute GPU engine.
const Engines::KeplerCompute& KeplerCompute() const;
[[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const;
/// Returns a reference to the GPU memory manager.
Tegra::MemoryManager& MemoryManager();
[[nodiscard]] Tegra::MemoryManager& MemoryManager();
/// Returns a const reference to the GPU memory manager.
const Tegra::MemoryManager& MemoryManager() const;
[[nodiscard]] const Tegra::MemoryManager& MemoryManager() const;
/// Returns a reference to the GPU DMA pusher.
Tegra::DmaPusher& DmaPusher();
[[nodiscard]] Tegra::DmaPusher& DmaPusher();
VideoCore::RendererBase& Renderer() {
/// Returns a const reference to the GPU DMA pusher.
[[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
/// Returns a reference to the GPU CDMA pusher.
[[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
/// Returns a const reference to the GPU CDMA pusher.
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
[[nodiscard]] VideoCore::RendererBase& Renderer() {
return *renderer;
}
const VideoCore::RendererBase& Renderer() const {
/// Returns a const reference to the underlying renderer.
[[nodiscard]] const VideoCore::RendererBase& Renderer() const {
return *renderer;
}
VideoCore::ShaderNotify& ShaderNotify() {
/// Returns a reference to the shader notifier.
[[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
return *shader_notify;
}
const VideoCore::ShaderNotify& ShaderNotify() const {
/// Returns a const reference to the shader notifier.
[[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
return *shader_notify;
}
/// Waits for the GPU to finish working
virtual void WaitIdle() const = 0;
void WaitIdle() const;
/// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
void WaitFence(u32 syncpoint_id, u32 value);
void IncrementSyncPoint(u32 syncpoint_id);
u32 GetSyncpointValue(u32 syncpoint_id) const;
[[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
[[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
u64 GetTicks() const;
[[nodiscard]] u64 GetTicks() const;
std::unique_lock<std::mutex> LockSync() {
[[nodiscard]] std::unique_lock<std::mutex> LockSync() {
return std::unique_lock{sync_mutex};
}
bool IsAsync() const {
[[nodiscard]] bool IsAsync() const {
return is_async;
}
/// Returns a const reference to the GPU DMA pusher.
const Tegra::DmaPusher& DmaPusher() const;
[[nodiscard]] bool UseNvdec() const {
return use_nvdec;
}
enum class FenceOperation : u32 {
Acquire = 0,
Increment = 1,
};
union FenceAction {
u32 raw;
BitField<0, 1, FenceOperation> op;
BitField<8, 24, u32> syncpoint_id;
[[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
FenceAction result{};
result.op.Assign(op);
result.syncpoint_id.Assign(syncpoint_id);
return {result.raw};
}
};
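
Build above packs an operation and a syncpoint id into the raw register layout so callers can emit it as a command argument; hypothetical use (syncpoint id chosen arbitrarily):

    // Encode "increment syncpoint 5" for submission through the puller.
    const CommandHeader action = FenceAction::Build(FenceOperation::Increment, 5);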
struct Regs {
static constexpr size_t NUM_REGS = 0x40;
@ -262,7 +275,7 @@ public:
u32 address_high;
u32 address_low;
GPUVAddr SemaphoreAddress() const {
[[nodiscard]] GPUVAddr SemaphoreAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
@ -280,10 +293,7 @@ public:
u32 semaphore_acquire;
u32 semaphore_release;
u32 fence_value;
union {
BitField<4, 4, u32> operation;
BitField<8, 8, u32> id;
} fence_action;
FenceAction fence_action;
INSERT_UNION_PADDING_WORDS(0xE2);
// Puller state
@ -300,34 +310,39 @@ public:
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
virtual void Start() = 0;
void Start();
/// Obtain the CPU Context
virtual void ObtainContext() = 0;
void ObtainContext();
/// Release the CPU Context
virtual void ReleaseContext() = 0;
void ReleaseContext();
/// Push GPU command entries to be processed
virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
void PushGPUEntries(Tegra::CommandList&& entries);
/// Push GPU command buffer entries to be processed
void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
/// Swap buffers (render frame)
virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
virtual void FlushRegion(VAddr addr, u64 size) = 0;
void FlushRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated
virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
void InvalidateRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
void FlushAndInvalidateRegion(VAddr addr, u64 size);
protected:
virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
private:
void ProcessBindMethod(const MethodCall& method_call);
void ProcessFenceActionMethod();
void ProcessWaitForInterruptMethod();
void ProcessSemaphoreTriggerMethod();
void ProcessSemaphoreRelease();
void ProcessSemaphoreAcquire();
@ -343,13 +358,15 @@ private:
u32 methods_pending);
/// Determines where the method should be executed.
bool ExecuteMethodOnEngine(u32 method);
[[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
protected:
Core::System& system;
std::unique_ptr<Tegra::MemoryManager> memory_manager;
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
std::unique_ptr<VideoCore::RendererBase> renderer;
const bool use_nvdec;
private:
/// Mapping of command subchannels to their bound engine ids
@ -372,12 +389,13 @@ private:
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex;
std::mutex device_mutex;
std::condition_variable sync_cv;
struct FlushRequest {
FlushRequest(u64 fence, VAddr addr, std::size_t size)
: fence{fence}, addr{addr}, size{size} {}
explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
: fence{fence_}, addr{addr_}, size{size_} {}
u64 fence;
VAddr addr;
std::size_t size;
@ -389,6 +407,9 @@ private:
std::mutex flush_request_mutex;
const bool is_async;
VideoCommon::GPUThread::ThreadManager gpu_thread;
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
};
#define ASSERT_REG_POSITION(field_name, position) \

View File

@ -1,64 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "core/core.h"
#include "core/hardware_interrupt_manager.h"
#include "video_core/gpu_asynch.h"
#include "video_core/gpu_thread.h"
#include "video_core/renderer_base.h"
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {}
GPUAsynch::~GPUAsynch() = default;
void GPUAsynch::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent();
}
void GPUAsynch::ObtainContext() {
cpu_context->MakeCurrent();
}
void GPUAsynch::ReleaseContext() {
cpu_context->DoneCurrent();
}
void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer);
}
void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
void GPUAsynch::WaitIdle() const {
gpu_thread.WaitIdle();
}
void GPUAsynch::OnCommandListEnd() {
gpu_thread.OnCommandListEnd();
}
} // namespace VideoCommon

View File

@ -1,46 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
namespace Core::Frontend {
class GraphicsContext;
}
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU asynchronously
class GPUAsynch final : public Tegra::GPU {
public:
explicit GPUAsynch(Core::System& system);
~GPUAsynch() override;
void Start() override;
void ObtainContext() override;
void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitIdle() const override;
void OnCommandListEnd() override;
protected:
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
private:
GPUThread::ThreadManager gpu_thread;
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
};
} // namespace VideoCommon

View File

@ -1,45 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "video_core/gpu_synch.h"
#include "video_core/renderer_base.h"
namespace VideoCommon {
GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {}
GPUSynch::~GPUSynch() = default;
void GPUSynch::Start() {}
void GPUSynch::ObtainContext() {
renderer->Context().MakeCurrent();
}
void GPUSynch::ReleaseContext() {
renderer->Context().DoneCurrent();
}
void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
dma_pusher->Push(std::move(entries));
dma_pusher->DispatchCalls();
}
void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
renderer->SwapBuffers(framebuffer);
}
void GPUSynch::FlushRegion(VAddr addr, u64 size) {
renderer->Rasterizer().FlushRegion(addr, size);
}
void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
renderer->Rasterizer().InvalidateRegion(addr, size);
}
void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
renderer->Rasterizer().FlushAndInvalidateRegion(addr, size);
}
} // namespace VideoCommon

View File

@ -1,40 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "video_core/gpu.h"
namespace Core::Frontend {
class GraphicsContext;
}
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU synchronously
class GPUSynch final : public Tegra::GPU {
public:
explicit GPUSynch(Core::System& system);
~GPUSynch() override;
void Start() override;
void ObtainContext() override;
void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitIdle() const override {}
protected:
void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
[[maybe_unused]] u32 value) const override {}
};
} // namespace VideoCommon

View File

@ -4,6 +4,7 @@
#include "common/assert.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "common/thread.h"
#include "core/core.h"
#include "core/frontend/emu_window.h"
@ -18,9 +19,11 @@ namespace VideoCommon::GPUThread {
/// Runs the GPU thread
static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
SynchState& state) {
SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
std::string name = "yuzu:GPU";
MicroProfileOnThreadCreate(name.c_str());
SCOPE_EXIT({ MicroProfileOnThreadExit(); });
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
system.RegisterHostThread();
@ -39,19 +42,23 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
CommandDataContainer next;
while (state.is_running) {
next = state.queue.PopWait();
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
} else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
// NVDEC
cdma_pusher.Push(std::move(command_list->entries));
cdma_pusher.DispatchCalls();
} else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
renderer.Rasterizer().ReleaseFences();
} else if (std::holds_alternative<GPUTickCommand>(next.data)) {
system.GPU().TickWork();
} else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(data->addr, data->size);
} else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(data->addr, data->size);
} else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
} else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size);
} else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
return;
} else {
@ -61,7 +68,8 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
}
}
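
RunThread above drains a queue of std::variant commands and dispatches on the active alternative; the pattern in isolation, with the command set reduced for illustration:

    #include <variant>

    struct SubmitList { /* command list entries */ };
    struct SwapBuffers { /* framebuffer description */ };
    using CommandData = std::variant<SubmitList, SwapBuffers>;

    void Dispatch(CommandData& next) {
        if (auto* submit = std::get_if<SubmitList>(&next)) {
            // Forward submit's entries to the DMA pusher.
        } else if (std::holds_alternative<SwapBuffers>(next)) {
            // Present the pending frame.
        }
    }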
ThreadManager::ThreadManager(Core::System& system) : system{system} {}
ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
: system{system_}, is_async{is_async_} {}
ThreadManager::~ThreadManager() {
if (!thread.joinable()) {
@ -75,33 +83,48 @@ ThreadManager::~ThreadManager() {
void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher) {
thread = std::thread{RunThread, std::ref(system), std::ref(renderer),
std::ref(context), std::ref(dma_pusher), std::ref(state)};
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
PushCommand(SubmitListCommand(std::move(entries)));
}
void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
PushCommand(SubmitChCommandEntries(std::move(entries)));
}
void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
}
void ThreadManager::FlushRegion(VAddr addr, u64 size) {
if (!Settings::IsGPULevelHigh()) {
if (!is_async) {
// Always flush in synchronous GPU mode
PushCommand(FlushRegionCommand(addr, size));
return;
}
if (!Settings::IsGPULevelExtreme()) {
return;
}
if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) {
// Asynchronous GPU mode
switch (Settings::values.gpu_accuracy.GetValue()) {
case Settings::GPUAccuracy::Normal:
PushCommand(FlushRegionCommand(addr, size));
break;
case Settings::GPUAccuracy::High:
// TODO(bunnei): Is this right? Preserving existing behavior for now
break;
case Settings::GPUAccuracy::Extreme: {
auto& gpu = system.GPU();
u64 fence = gpu.RequestFlush(addr, size);
PushCommand(GPUTickCommand());
while (fence > gpu.CurrentFlushRequestFence()) {
}
break;
}
default:
UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue());
}
}
@ -115,7 +138,8 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
}
void ThreadManager::WaitIdle() const {
while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) {
while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
system.IsPoweredOn()) {
}
}
@ -126,6 +150,12 @@ void ThreadManager::OnCommandListEnd() {
u64 ThreadManager::PushCommand(CommandData&& command_data) {
const u64 fence{++state.last_fence};
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
if (!is_async) {
// In synchronous GPU mode, block the caller until the command has executed
WaitIdle();
}
return fence;
}

View File

@ -10,8 +10,9 @@
#include <optional>
#include <thread>
#include <variant>
#include "common/threadsafe_queue.h"
#include "video_core/gpu.h"
#include "video_core/framebuffer_config.h"
namespace Tegra {
struct FramebufferConfig;
@ -25,6 +26,10 @@ class GraphicsContext;
class System;
} // namespace Core
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon::GPUThread {
/// Command to signal to the GPU thread that processing has ended
@ -32,22 +37,30 @@ struct EndProcessingCommand final {};
/// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
Tegra::CommandList entries;
};
/// Command to signal to the GPU thread that a cdma command list is ready for processing
struct SubmitChCommandEntries final {
explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_)
: entries{std::move(entries_)} {}
Tegra::ChCommandHeaderList entries;
};
/// Command to signal to the GPU thread that a swap buffers is pending
struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
: framebuffer{std::move(framebuffer)} {}
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_)
: framebuffer{std::move(framebuffer_)} {}
std::optional<Tegra::FramebufferConfig> framebuffer;
};
/// Command to signal to the GPU thread to flush a region
struct FlushRegionCommand final {
explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -55,7 +68,7 @@ struct FlushRegionCommand final {
/// Command to signal to the GPU thread to invalidate a region
struct InvalidateRegionCommand final {
explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -63,8 +76,8 @@ struct InvalidateRegionCommand final {
/// Command to signal to the GPU thread to flush and invalidate a region
struct FlushAndInvalidateRegionCommand final {
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
: addr{addr}, size{size} {}
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_)
: addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -77,15 +90,15 @@ struct OnCommandListEndCommand final {};
struct GPUTickCommand final {};
using CommandData =
std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
GPUTickCommand>;
std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
struct CommandDataContainer {
CommandDataContainer() = default;
CommandDataContainer(CommandData&& data, u64 next_fence)
: data{std::move(data)}, fence{next_fence} {}
explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
: data{std::move(data_)}, fence{next_fence_} {}
CommandData data;
u64 fence{};
@ -104,16 +117,19 @@ struct SynchState final {
/// Class used to manage the GPU thread
class ThreadManager final {
public:
explicit ThreadManager(Core::System& system);
explicit ThreadManager(Core::System& system_, bool is_async_);
~ThreadManager();
/// Creates and starts the GPU thread.
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher);
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
/// Push GPU command entries to be processed
void SubmitList(Tegra::CommandList&& entries);
/// Push GPU CDMA command buffer entries to be processed
void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
/// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
@ -135,11 +151,11 @@ private:
/// Pushes a command to be executed by the GPU thread
u64 PushCommand(CommandData&& command_data);
private:
SynchState state;
Core::System& system;
std::thread thread;
std::thread::id thread_id;
const bool is_async;
};
} // namespace VideoCommon::GPUThread

View File

@ -19,8 +19,8 @@ namespace VideoCore {
class GuestDriverProfile {
public:
explicit GuestDriverProfile() = default;
explicit GuestDriverProfile(std::optional<u32> texture_handler_size)
: texture_handler_size{texture_handler_size} {}
explicit GuestDriverProfile(std::optional<u32> texture_handler_size_)
: texture_handler_size{texture_handler_size_} {}
void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);

View File

@ -1,18 +1,29 @@
set(SHADER_FILES
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
convert_depth_to_float.frag
convert_float_to_depth.frag
full_screen_triangle.vert
opengl_copy_bc4.comp
opengl_present.frag
opengl_present.vert
pitch_unswizzle.comp
vulkan_blit_color_float.frag
vulkan_blit_depth_stencil.frag
vulkan_present.frag
vulkan_present.vert
vulkan_quad_array.comp
vulkan_quad_indexed.comp
vulkan_uint8.comp
)
find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED)
set(GLSL_FLAGS "")
set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders)
add_custom_command(
OUTPUT
${SHADER_DIR}
COMMAND
${CMAKE_COMMAND} -E make_directory ${SHADER_DIR}
)
set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in)
set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
@ -20,19 +31,36 @@ set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
foreach(FILENAME IN ITEMS ${SHADER_FILES})
string(REPLACE "." "_" SHADER_NAME ${FILENAME})
set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME})
set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
add_custom_command(
OUTPUT
${HEADER_FILE}
COMMAND
${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
DEPENDS
${HEADER_GENERATOR}
${INPUT_FILE}
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE})
# Skip generating source headers for Vulkan-exclusive files
if (NOT ${FILENAME} MATCHES "vulkan.*")
set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
add_custom_command(
OUTPUT
${SOURCE_HEADER_FILE}
COMMAND
${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
DEPENDS
${INPUT_FILE}
# HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE})
endif()
# Skip compiling OpenGL-exclusive files to SPIR-V
if (NOT ${FILENAME} MATCHES "opengl.*")
string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME)
set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h)
add_custom_command(
OUTPUT
${SPIRV_HEADER_FILE}
COMMAND
${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE})
endif()
endforeach()
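
Hypothetical consumption of the headers this loop generates (names derived from the rules above; the exact constant type comes from source_shader.h.in, which is not shown): opengl_present.frag becomes opengl_present_frag.h exposing the GLSL text as OPENGL_PRESENT_FRAG, while vulkan_present.frag becomes vulkan_present_frag_spv.h exposing a SPIR-V word array VULKAN_PRESENT_FRAG_SPV.

    #include "video_core/host_shaders/opengl_present_frag.h"
    #include "video_core/host_shaders/vulkan_present_frag_spv.h"

    // e.g. hand OPENGL_PRESENT_FRAG to glShaderSource(), and
    // VULKAN_PRESENT_FRAG_SPV to VkShaderModuleCreateInfo::pCode.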
add_custom_target(host_shaders

View File

@ -8,4 +8,6 @@ string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME)
file(READ ${SOURCE_FILE} CONTENTS)
get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY)
make_directory(${OUTPUT_DIR})
configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY)

View File

@ -0,0 +1,122 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint layer_stride;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read the swizzle table as soon as possible to hide its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += pos.z * layer_stride;
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}
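The offset arithmetic in main decodes the block-linear layout: texels live in 64-byte-wide, 8-row GOBs, GOBs stack 2^block_height high into blocks, and the swizzle table resolves the byte position inside a GOB. Note also the HAS_EXTENDED_TYPES fallback in ReadTexel: without GL_NV_gpu_shader5 it loads the containing 32-bit word and carves out the 8- or 16-bit lane with bitfieldExtract, using (offset * 8) & 24 (or & 16) as the bit position. A host-side C++ mirror of the offset math, handy for sanity checking (a sketch; the names are illustrative):

    #include <cstdint>

    // Byte offset of a texel in a 2D block-linear image, mirroring the shader.
    // swizzle is the SwizzleTable lookup for (x_bytes, y), and x_bytes is the
    // x coordinate already scaled to bytes (pos.x <<= bytes_per_block_log2).
    uint32_t BlockLinearOffset2D(uint32_t x_bytes, uint32_t y, uint32_t z,
                                 uint32_t layer_stride, uint32_t block_size,
                                 uint32_t x_shift, uint32_t block_height,
                                 uint32_t block_height_mask, uint32_t swizzle) {
        constexpr uint32_t GOB_SIZE_X_SHIFT = 6; // 64-byte GOB rows
        constexpr uint32_t GOB_SIZE_Y_SHIFT = 3; // 8 rows per GOB
        constexpr uint32_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
        const uint32_t block_y = y >> GOB_SIZE_Y_SHIFT;
        uint32_t offset = z * layer_stride;                        // whole layers
        offset += (block_y >> block_height) * block_size;          // blocks above
        offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; // GOB within block
        offset += (x_bytes >> GOB_SIZE_X_SHIFT) << x_shift;        // GOB column
        return offset + swizzle;                                   // byte within GOB
    }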

View File

@ -0,0 +1,125 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image;
layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read the swizzle table as soon as possible to hide its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += (pos.z >> block_depth) * slice_size;
offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height);
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}
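Relative to the 2D variant above, only the z decomposition is new: slices group into blocks of 2^block_depth, so the offset gains (pos.z >> block_depth) * slice_size for whole slice blocks plus (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height) for the slice inside the current block; the x and y terms are unchanged.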

View File

@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D depth_texture;
layout(location = 0) out float output_color;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
output_color = texelFetch(depth_texture, coord, 0).r;
}

View File

@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D color_texture;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
float color = texelFetch(color_texture, coord, 0).r;
gl_FragDepth = color;
}

View File

@ -0,0 +1,29 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#ifdef VULKAN
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) vec2 tex_scale;
UNIFORM(1) vec2 tex_offset;
END_PUSH_CONSTANTS
layout(location = 0) out vec2 texcoord;
void main() {
float x = float((gl_VertexIndex & 1) << 2);
float y = float((gl_VertexIndex & 2) << 1);
gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
}
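This is the classic oversized-triangle trick: for gl_VertexIndex 0, 1 and 2 the bit tests yield (x, y) = (0, 0), (4, 0) and (0, 4), hence clip-space positions (-1, -1), (3, -1) and (-1, 3). That single triangle's interior covers the entire [-1, 1] viewport without the diagonal seam of a two-triangle quad, while texcoord is remapped through tex_scale and tex_offset.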

View File

@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
#extension GL_ARB_gpu_shader_int64 : require
layout (local_size_x = 4, local_size_y = 4) in;
layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input;
layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output;
layout(location = 0) uniform uvec3 src_offset;
layout(location = 1) uniform uvec3 dst_offset;
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
uint DecompressBlock(uint64_t bits, uvec2 coord) {
const uint code_offset = 16 + 3 * (4 * coord.y + coord.x);
const uint code = uint(bits >> code_offset) & 7;
const uint red0 = uint(bits >> 0) & 0xff;
const uint red1 = uint(bits >> 8) & 0xff;
if (red0 > red1) {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (6 * red0 + 1 * red1) / 7;
case 3:
return (5 * red0 + 2 * red1) / 7;
case 4:
return (4 * red0 + 3 * red1) / 7;
case 5:
return (3 * red0 + 4 * red1) / 7;
case 6:
return (2 * red0 + 5 * red1) / 7;
case 7:
return (1 * red0 + 6 * red1) / 7;
}
} else {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (4 * red0 + 1 * red1) / 5;
case 3:
return (3 * red0 + 2 * red1) / 5;
case 4:
return (2 * red0 + 3 * red1) / 5;
case 5:
return (1 * red0 + 4 * red1) / 5;
case 6:
return 0;
case 7:
return 0xff;
}
}
return 0;
}
void main() {
uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg;
uint64_t bits = packUint2x32(packed_bits);
uint red = DecompressBlock(bits, gl_LocalInvocationID.xy);
uvec4 color = uvec4(red & 0xff, 0, 0, 0xff);
imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color);
}
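A worked example of the decoder above: each 8-byte block stores two endpoint bytes plus sixteen 3-bit selector codes (hence the 16 + 3 * (4 * coord.y + coord.x) bit offset). With red0 = 200, red1 = 64 (so red0 > red1) and code 3, the texel decodes to (5 * 200 + 2 * 64) / 7 = 161; in the red0 <= red1 branch, codes 6 and 7 snap to the implicit endpoints 0 and 255 instead of interpolating.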

View File

@ -1,3 +1,7 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
layout (location = 0) in vec2 frag_tex_coord;

View File

@ -1,3 +1,7 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
out gl_PerVertex {

View File

@ -0,0 +1,86 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec2 origin;
UNIFORM(1) ivec2 destination;
UNIFORM(2) uint bytes_per_block;
UNIFORM(3) uint pitch;
END_PUSH_CONSTANTS
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block) {
#if HAS_EXTENDED_TYPES
case 1:
return uvec4(u8data[offset], 0, 0, 0);
case 2:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 2:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 4:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 8:
return uvec4(u64data[offset / 8], 0, 0);
case 16:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec2 pos = gl_GlobalInvocationID.xy + origin;
uint offset = 0;
offset += pos.x * bytes_per_block;
offset += pos.y * pitch;
const uvec4 texel = ReadTexel(offset);
const ivec2 coord = ivec2(gl_GlobalInvocationID.xy) + destination;
imageStore(output_image, coord, texel);
}
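Pitch-linear addressing needs no swizzle table: the byte offset is simply pos.x * bytes_per_block + pos.y * pitch. For instance, with 4 bytes per block and a pitch of 1024 bytes, texel (3, 2) lands at 3 * 4 + 2 * 1024 = 2060.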

View File

@ -0,0 +1,14 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D tex;
layout(location = 0) in vec2 texcoord;
layout(location = 0) out vec4 color;
void main() {
color = textureLod(tex, texcoord, 0);
}

View File

@ -0,0 +1,16 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#extension GL_ARB_shader_stencil_export : require
layout(binding = 0) uniform sampler2D depth_tex;
layout(binding = 1) uniform isampler2D stencil_tex;
layout(location = 0) in vec2 texcoord;
void main() {
gl_FragDepth = textureLod(depth_tex, texcoord, 0).r;
gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r;
}

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 frag_tex_coord;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 vert_position;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (local_size_x = 1024) in;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V quad_indexed.comp -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (local_size_x = 1024) in;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

View File

@ -85,7 +85,7 @@ constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
{0x0217920100488FF7, &HLE_0217920100488FF7},
}};
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
HLEMacro::~HLEMacro() = default;
std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
@ -99,8 +99,8 @@ std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) co
HLEMacroImpl::~HLEMacroImpl() = default;
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
: maxwell3d(maxwell3d), func(func) {}
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
: maxwell3d{maxwell3d_}, func{func_} {}
void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
func(maxwell3d, parameters);

View File

@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u3
class HLEMacro {
public:
explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
~HLEMacro();
std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;

View File

@ -11,29 +11,29 @@
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
: MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
}
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
const std::vector<u32>& code)
: maxwell3d(maxwell3d), code(code) {}
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_,
const std::vector<u32>& code_)
: maxwell3d{maxwell3d_}, code{code_} {}
void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
MICROPROFILE_SCOPE(MacroInterp);
Reset();
registers[1] = parameters[0];
num_parameters = parameters.size();
registers[1] = params[0];
num_parameters = params.size();
if (num_parameters > parameters_capacity) {
parameters_capacity = num_parameters;
this->parameters = std::make_unique<u32[]>(num_parameters);
parameters = std::make_unique<u32[]>(num_parameters);
}
std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
// Execute the code until we hit an exit condition.
bool keep_executing = true;
@ -133,8 +133,7 @@ bool MacroInterpreterImpl::Step(bool is_delay_slot) {
break;
}
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}",
static_cast<u32>(opcode.operation.Value()));
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
}
// An instruction with the Exit flag will not actually
@ -182,7 +181,7 @@ u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a,
return ~(src_a & src_b);
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation));
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
return 0;
}
}
@ -230,7 +229,7 @@ void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 r
Send((result >> 12) & 0b111111);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<u32>(operation));
UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
}
}
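Dropping the static_cast from these UNIMPLEMENTED_MSG calls only compiles if fmt knows how to print the enum types directly, presumably via formatter specializations added elsewhere in this merge. One plausible shape for such a specialization, assumed rather than taken from this diff:

    #include <fmt/format.h>

    template <>
    struct fmt::formatter<Tegra::Macro::ALUOperation> {
        constexpr auto parse(fmt::format_parse_context& ctx) {
            return ctx.begin();
        }
        template <typename FormatContext>
        auto format(Tegra::Macro::ALUOperation op, FormatContext& ctx) {
            // Print the underlying value, matching the old static_cast output.
            return fmt::format_to(ctx.out(), "{}", static_cast<u32>(op));
        }
    };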

View File

@ -17,7 +17,7 @@ class Maxwell3D;
class MacroInterpreter final : public MacroEngine {
public:
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
@ -28,8 +28,8 @@ private:
class MacroInterpreterImpl : public CachedMacro {
public:
MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
void Execute(const std::vector<u32>& parameters, u32 method) override;
explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
void Execute(const std::vector<u32>& params, u32 method) override;
private:
/// Resets the execution engine state, zeroing registers, etc.
@ -38,9 +38,9 @@ private:
/**
* Executes a single macro instruction located at the current program counter. Returns whether
* the interpreter should keep running.
* @param offset Offset to start execution at.
*
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
* previous instruction.
*/
bool Step(bool is_delay_slot);

View File

@ -28,15 +28,15 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
BRANCH_HOLDER,
});
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
: MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
}
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
: Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
Compile();
}
@ -165,8 +165,7 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
}
break;
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
static_cast<std::size_t>(opcode.alu_operation.Value()));
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
break;
}
Compile_ProcessResult(opcode.result_operation, opcode.dst);
@ -553,15 +552,15 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
}
void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
// Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
// register.
if (reg == 0) {
if (reg_index == 0) {
return;
}
mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
};
const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
switch (operation) {
case Macro::ResultOperation::IgnoreAndFetch:
@ -604,7 +603,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3
Compile_Send(RESULT);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
}
}

View File

@ -23,7 +23,7 @@ constexpr size_t MAX_CODE_SIZE = 0x10000;
class MacroJITx64 final : public MacroEngine {
public:
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
@ -34,7 +34,7 @@ private:
class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
public:
MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
~MacroJITx64Impl();
void Execute(const std::vector<u32>& parameters, u32 method) override;

View File

@ -11,6 +11,7 @@
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
namespace Tegra {
@ -44,13 +45,22 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_
return Map(cpu_addr, *FindFreeRange(size, align), size);
}
GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true);
ASSERT(gpu_addr);
return Map(cpu_addr, *gpu_addr, size);
}
void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
if (!size) {
return;
}
// Flush and invalidate through the GPU interface, to be asynchronous if possible.
system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size);
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
rasterizer->UnmapMemory(*cpu_addr, size);
UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
}
@ -108,7 +118,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s
page_table[PageEntryIndex(gpu_addr)] = page_entry;
}
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const {
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address) const {
if (!align) {
align = page_size;
} else {
@ -116,7 +127,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size
}
u64 available_size{};
GPUVAddr gpu_addr{address_space_start};
GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
while (gpu_addr + available_size < address_space_size) {
if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
available_size += page_size;

View File

@ -28,7 +28,7 @@ public:
};
constexpr PageEntry() = default;
constexpr PageEntry(State state) : state{state} {}
constexpr PageEntry(State state_) : state{state_} {}
constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {}
[[nodiscard]] constexpr bool IsUnmapped() const {
@ -68,7 +68,7 @@ static_assert(sizeof(PageEntry) == 4, "PageEntry is too large");
class MemoryManager final {
public:
explicit MemoryManager(Core::System& system);
explicit MemoryManager(Core::System& system_);
~MemoryManager();
/// Binds a renderer to the memory manager.
@ -116,6 +116,7 @@ public:
[[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
[[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
void Unmap(GPUVAddr gpu_addr, std::size_t size);
@ -124,7 +125,8 @@ private:
[[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const;
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address = false) const;
void TryLockPage(PageEntry page_entry, std::size_t size);
void TryUnlockPage(PageEntry page_entry, std::size_t size);
@ -135,6 +137,7 @@ private:
static constexpr u64 address_space_size = 1ULL << 40;
static constexpr u64 address_space_start = 1ULL << 32;
static constexpr u64 address_space_start_low = 1ULL << 16;
static constexpr u64 page_bits{16};
static constexpr u64 page_size{1 << page_bits};
static constexpr u64 page_mask{page_size - 1};
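In concrete numbers: page_bits = 16 gives 64 KiB pages, address_space_size spans 1 TiB, regular allocations begin at the 4 GiB mark (1 << 32), and the new address_space_start_low puts the alternative scan origin at 64 KiB, so FindFreeRange(size, align, true) lets MapAllocate32 hand out GPU addresses that still fit in 32 bits.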

View File

@ -1,250 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <cstring>
#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/morton.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
namespace VideoCore {
using Surface::GetBytesPerPixel;
using Surface::PixelFormat;
using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, u8*);
using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>;
template <bool morton_to_linear, PixelFormat format>
static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth,
u32 tile_width_spacing, u8* buffer, u8* addr) {
constexpr u32 bytes_per_pixel = GetBytesPerPixel(format);
// With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
// pixel values.
constexpr u32 tile_size_x{GetDefaultBlockWidth(format)};
constexpr u32 tile_size_y{GetDefaultBlockHeight(format)};
if constexpr (morton_to_linear) {
Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel,
stride, height, depth, block_height, block_depth,
tile_width_spacing);
} else {
Tegra::Texture::CopySwizzledData((stride + tile_size_x - 1) / tile_size_x,
(height + tile_size_y - 1) / tile_size_y, depth,
bytes_per_pixel, bytes_per_pixel, addr, buffer, false,
block_height, block_depth, tile_width_spacing);
}
}
static constexpr ConversionArray morton_to_linear_fns = {
MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>,
MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>,
MortonCopy<true, PixelFormat::A8B8G8R8_SINT>,
MortonCopy<true, PixelFormat::A8B8G8R8_UINT>,
MortonCopy<true, PixelFormat::R5G6B5_UNORM>,
MortonCopy<true, PixelFormat::B5G6R5_UNORM>,
MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>,
MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>,
MortonCopy<true, PixelFormat::A2B10G10R10_UINT>,
MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>,
MortonCopy<true, PixelFormat::R8_UNORM>,
MortonCopy<true, PixelFormat::R8_SNORM>,
MortonCopy<true, PixelFormat::R8_SINT>,
MortonCopy<true, PixelFormat::R8_UINT>,
MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>,
MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>,
MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>,
MortonCopy<true, PixelFormat::R16G16B16A16_SINT>,
MortonCopy<true, PixelFormat::R16G16B16A16_UINT>,
MortonCopy<true, PixelFormat::B10G11R11_FLOAT>,
MortonCopy<true, PixelFormat::R32G32B32A32_UINT>,
MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>,
MortonCopy<true, PixelFormat::BC2_UNORM>,
MortonCopy<true, PixelFormat::BC3_UNORM>,
MortonCopy<true, PixelFormat::BC4_UNORM>,
MortonCopy<true, PixelFormat::BC4_SNORM>,
MortonCopy<true, PixelFormat::BC5_UNORM>,
MortonCopy<true, PixelFormat::BC5_SNORM>,
MortonCopy<true, PixelFormat::BC7_UNORM>,
MortonCopy<true, PixelFormat::BC6H_UFLOAT>,
MortonCopy<true, PixelFormat::BC6H_SFLOAT>,
MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>,
MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>,
MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>,
MortonCopy<true, PixelFormat::R32G32B32A32_SINT>,
MortonCopy<true, PixelFormat::R32G32_FLOAT>,
MortonCopy<true, PixelFormat::R32G32_SINT>,
MortonCopy<true, PixelFormat::R32_FLOAT>,
MortonCopy<true, PixelFormat::R16_FLOAT>,
MortonCopy<true, PixelFormat::R16_UNORM>,
MortonCopy<true, PixelFormat::R16_SNORM>,
MortonCopy<true, PixelFormat::R16_UINT>,
MortonCopy<true, PixelFormat::R16_SINT>,
MortonCopy<true, PixelFormat::R16G16_UNORM>,
MortonCopy<true, PixelFormat::R16G16_FLOAT>,
MortonCopy<true, PixelFormat::R16G16_UINT>,
MortonCopy<true, PixelFormat::R16G16_SINT>,
MortonCopy<true, PixelFormat::R16G16_SNORM>,
MortonCopy<true, PixelFormat::R32G32B32_FLOAT>,
MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>,
MortonCopy<true, PixelFormat::R8G8_UNORM>,
MortonCopy<true, PixelFormat::R8G8_SNORM>,
MortonCopy<true, PixelFormat::R8G8_SINT>,
MortonCopy<true, PixelFormat::R8G8_UINT>,
MortonCopy<true, PixelFormat::R32G32_UINT>,
MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>,
MortonCopy<true, PixelFormat::R32_UINT>,
MortonCopy<true, PixelFormat::R32_SINT>,
MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>,
MortonCopy<true, PixelFormat::B8G8R8A8_SRGB>,
MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>,
MortonCopy<true, PixelFormat::BC2_SRGB>,
MortonCopy<true, PixelFormat::BC3_SRGB>,
MortonCopy<true, PixelFormat::BC7_SRGB>,
MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>,
MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>,
MortonCopy<true, PixelFormat::D32_FLOAT>,
MortonCopy<true, PixelFormat::D16_UNORM>,
MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>,
MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>,
MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>,
};
static constexpr ConversionArray linear_to_morton_fns = {
MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>,
MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>,
MortonCopy<false, PixelFormat::A8B8G8R8_SINT>,
MortonCopy<false, PixelFormat::A8B8G8R8_UINT>,
MortonCopy<false, PixelFormat::R5G6B5_UNORM>,
MortonCopy<false, PixelFormat::B5G6R5_UNORM>,
MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>,
MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>,
MortonCopy<false, PixelFormat::A2B10G10R10_UINT>,
MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>,
MortonCopy<false, PixelFormat::R8_UNORM>,
MortonCopy<false, PixelFormat::R8_SNORM>,
MortonCopy<false, PixelFormat::R8_SINT>,
MortonCopy<false, PixelFormat::R8_UINT>,
MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>,
MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>,
MortonCopy<false, PixelFormat::R16G16B16A16_SINT>,
MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>,
MortonCopy<false, PixelFormat::R16G16B16A16_UINT>,
MortonCopy<false, PixelFormat::B10G11R11_FLOAT>,
MortonCopy<false, PixelFormat::R32G32B32A32_UINT>,
MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>,
MortonCopy<false, PixelFormat::BC2_UNORM>,
MortonCopy<false, PixelFormat::BC3_UNORM>,
MortonCopy<false, PixelFormat::BC4_UNORM>,
MortonCopy<false, PixelFormat::BC4_SNORM>,
MortonCopy<false, PixelFormat::BC5_UNORM>,
MortonCopy<false, PixelFormat::BC5_SNORM>,
MortonCopy<false, PixelFormat::BC7_UNORM>,
MortonCopy<false, PixelFormat::BC6H_UFLOAT>,
MortonCopy<false, PixelFormat::BC6H_SFLOAT>,
// TODO(Subv): Swizzling ASTC formats are not supported
nullptr,
MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>,
MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>,
MortonCopy<false, PixelFormat::R32G32B32A32_SINT>,
MortonCopy<false, PixelFormat::R32G32_FLOAT>,
MortonCopy<false, PixelFormat::R32G32_SINT>,
MortonCopy<false, PixelFormat::R32_FLOAT>,
MortonCopy<false, PixelFormat::R16_FLOAT>,
MortonCopy<false, PixelFormat::R16_UNORM>,
MortonCopy<false, PixelFormat::R16_SNORM>,
MortonCopy<false, PixelFormat::R16_UINT>,
MortonCopy<false, PixelFormat::R16_SINT>,
MortonCopy<false, PixelFormat::R16G16_UNORM>,
MortonCopy<false, PixelFormat::R16G16_FLOAT>,
MortonCopy<false, PixelFormat::R16G16_UINT>,
MortonCopy<false, PixelFormat::R16G16_SINT>,
MortonCopy<false, PixelFormat::R16G16_SNORM>,
MortonCopy<false, PixelFormat::R32G32B32_FLOAT>,
MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>,
MortonCopy<false, PixelFormat::R8G8_UNORM>,
MortonCopy<false, PixelFormat::R8G8_SNORM>,
MortonCopy<false, PixelFormat::R8G8_SINT>,
MortonCopy<false, PixelFormat::R8G8_UINT>,
MortonCopy<false, PixelFormat::R32G32_UINT>,
MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>,
MortonCopy<false, PixelFormat::R32_UINT>,
MortonCopy<false, PixelFormat::R32_SINT>,
nullptr,
nullptr,
nullptr,
MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>,
MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>,
MortonCopy<false, PixelFormat::BC2_SRGB>,
MortonCopy<false, PixelFormat::BC3_SRGB>,
MortonCopy<false, PixelFormat::BC7_SRGB>,
MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>,
MortonCopy<false, PixelFormat::D32_FLOAT>,
MortonCopy<false, PixelFormat::D16_UNORM>,
MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>,
MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>,
MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>,
};
static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) {
switch (mode) {
case MortonSwizzleMode::MortonToLinear:
return morton_to_linear_fns[static_cast<std::size_t>(format)];
case MortonSwizzleMode::LinearToMorton:
return linear_to_morton_fns[static_cast<std::size_t>(format)];
}
UNREACHABLE();
return morton_to_linear_fns[static_cast<std::size_t>(format)];
}
void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
u8* buffer, u8* addr) {
GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth,
tile_width_spacing, buffer, addr);
}
} // namespace VideoCore

View File

@ -1,18 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
#include "video_core/surface.h"
namespace VideoCore {
enum class MortonSwizzleMode { MortonToLinear, LinearToMorton };
void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
u8* buffer, u8* addr);
} // namespace VideoCore

View File

@ -28,8 +28,8 @@ namespace VideoCommon {
template <class QueryCache, class HostCounter>
class CounterStreamBase {
public:
explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type)
: cache{cache}, type{type} {}
explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_)
: cache{cache_}, type{type_} {}
/// Updates the state of the stream, enabling or disabling as needed.
void Update(bool enabled) {
@ -334,8 +334,8 @@ private:
template <class HostCounter>
class CachedQueryBase {
public:
explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
: cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_)
: cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {}
virtual ~CachedQueryBase() = default;
CachedQueryBase(CachedQueryBase&&) noexcept = default;

View File

@ -32,7 +32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size
class RasterizerInterface {
public:
virtual ~RasterizerInterface() {}
virtual ~RasterizerInterface() = default;
/// Dispatches a draw invocation
virtual void Draw(bool is_indexed, bool is_instanced) = 0;
@ -76,6 +76,9 @@ public:
/// Sync memory between guest and host.
virtual void SyncGuestHost() = 0;
/// Unmap memory range
virtual void UnmapMemory(VAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
@ -83,6 +86,12 @@ public:
/// Notify the host renderer to wait for previous primitive and compute operations.
virtual void WaitForIdle() = 0;
/// Notify the host renderer to wait for reads and writes to render targets and flush caches.
virtual void FragmentBarrier() = 0;
/// Notify the host renderer to make previous render target writes available.
virtual void TiledCacheBarrier() = 0;
/// Notify the rasterizer to send all written commands to the host GPU.
virtual void FlushCommands() = 0;
@ -90,15 +99,15 @@ public:
virtual void TickFrame() = 0;
/// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
[[nodiscard]] virtual bool AccelerateSurfaceCopy(
const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
return false;
}
/// Attempt to use a faster method to display the framebuffer to screen
virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) {
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
return false;
}
@ -110,12 +119,12 @@ public:
const DiskResourceLoadCallback& callback) {}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
GuestDriverProfile& AccessGuestDriverProfile() {
[[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() {
return guest_driver_profile;
}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
const GuestDriverProfile& AccessGuestDriverProfile() const {
[[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const {
return guest_driver_profile;
}

View File

@ -38,7 +38,7 @@ public:
virtual ~RendererBase();
/// Initialize the renderer
virtual bool Init() = 0;
[[nodiscard]] virtual bool Init() = 0;
/// Shutdown the renderer
virtual void ShutDown() = 0;
@ -49,43 +49,43 @@ public:
// Getter/setter functions:
// ------------------------
f32 GetCurrentFPS() const {
[[nodiscard]] f32 GetCurrentFPS() const {
return m_current_fps;
}
int GetCurrentFrame() const {
[[nodiscard]] int GetCurrentFrame() const {
return m_current_frame;
}
RasterizerInterface& Rasterizer() {
[[nodiscard]] RasterizerInterface& Rasterizer() {
return *rasterizer;
}
const RasterizerInterface& Rasterizer() const {
[[nodiscard]] const RasterizerInterface& Rasterizer() const {
return *rasterizer;
}
Core::Frontend::GraphicsContext& Context() {
[[nodiscard]] Core::Frontend::GraphicsContext& Context() {
return *context;
}
const Core::Frontend::GraphicsContext& Context() const {
[[nodiscard]] const Core::Frontend::GraphicsContext& Context() const {
return *context;
}
Core::Frontend::EmuWindow& GetRenderWindow() {
[[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() {
return render_window;
}
const Core::Frontend::EmuWindow& GetRenderWindow() const {
[[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const {
return render_window;
}
RendererSettings& Settings() {
[[nodiscard]] RendererSettings& Settings() {
return renderer_settings;
}
const RendererSettings& Settings() const {
[[nodiscard]] const RendererSettings& Settings() const {
return renderer_settings;
}

View File

@ -39,8 +39,8 @@ using Operation = const OperationNode&;
constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
char Swizzle(std::size_t component) {
ASSERT(component < 4);
return component["xyzw"];
static constexpr std::string_view SWIZZLE{"xyzw"};
return SWIZZLE.at(component);
}
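The replaced return statement relied on the C identity a[i] == *(a + i) == i[a], so component["xyzw"] was merely an obscure spelling of "xyzw"[component]; the new std::string_view version fetches the same character with bounds checking through at().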
constexpr bool IsGenericAttribute(Attribute::Index index) {
@ -71,7 +71,7 @@ std::string_view GetInputFlags(PixelImap attribute) {
case PixelImap::Unused:
break;
}
UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute);
return {};
}
@ -123,7 +123,7 @@ std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::Primitive
case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
return "TRIANGLES_ADJACENCY";
default:
UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
UNIMPLEMENTED_MSG("topology={}", topology);
return "POINTS";
}
}
@ -137,7 +137,7 @@ std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
case Tegra::Shader::OutputTopology::TriangleStrip:
return "TRIANGLE_STRIP";
default:
UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
UNIMPLEMENTED_MSG("Unknown output topology: {}", topology);
return "points";
}
}
@ -187,8 +187,8 @@ std::string TextureType(const MetaTexture& meta) {
class ARBDecompiler final {
public:
explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier);
explicit ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier);
std::string Code() const {
return shader_source;
@ -224,7 +224,7 @@ private:
std::string Visit(const Node& node);
std::pair<std::string, std::size_t> BuildCoords(Operation);
std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation);
std::string BuildAoffi(Operation);
std::string GlobalMemoryPointer(const GmemNode& gmem);
void Exit();
@ -376,9 +376,11 @@ private:
std::string temporary = AllocTemporary();
std::string address;
std::string_view opname;
bool robust = false;
if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
address = GlobalMemoryPointer(*gmem);
opname = "ATOM";
robust = true;
} else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
opname = "ATOMS";
@ -386,7 +388,15 @@ private:
UNREACHABLE();
return "{0, 0, 0, 0}";
}
if (robust) {
AddLine("IF NE.x;");
}
AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
if (robust) {
AddLine("ELSE;");
AddLine("MOV.S {}, 0;", temporary);
AddLine("ENDIF;");
}
return temporary;
}
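The robust path brackets global-memory atomics with the condition code produced by GlobalMemoryPointer, approximating robustBufferAccess semantics: an out-of-bounds atomic is skipped and yields 0. A C++ analogue of the emitted ARB sequence (illustrative only; the function name is invented):

    #include <atomic>
    #include <cstdint>

    uint32_t RobustAtomicAdd(uint32_t* ptr, uint32_t offset, uint32_t length,
                             uint32_t value) {
        if (offset < length) { // IF NE.x; with CC from the bounds check
            // ATOM.ADD returns the old value, like fetch_add.
            return std::atomic_ref<uint32_t>{*ptr}.fetch_add(value);
        }
        return 0; // ELSE; MOV.S temporary, 0; ENDIF;
    }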
@ -792,9 +802,9 @@ private:
};
};
ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier)
: device{device}, ir{ir}, registry{registry}, stage{stage} {
ARBDecompiler::ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier)
: device{device_}, ir{ir_}, registry{registry_}, stage{stage_} {
DefineGlobalMemory();
AddLine("TEMP RC;");
@ -980,10 +990,9 @@ void ARBDecompiler::DeclareLocalMemory() {
}
void ARBDecompiler::DeclareGlobalMemory() {
const std::size_t num_entries = ir.GetGlobalMemory().size();
const size_t num_entries = ir.GetGlobalMemory().size();
if (num_entries > 0) {
const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2;
AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1);
AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1);
}
}
@ -1125,44 +1134,44 @@ void ARBDecompiler::VisitAST(const ASTNode& node) {
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
} else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(ast->condition);
} else if (const auto if_then = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(if_then->condition);
ResetTemporaries();
AddLine("MOVC.U RC.x, {};", condition);
AddLine("IF NE.x;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = if_then->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
AddLine("ENDIF;");
} else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
} else if (const auto if_else = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
AddLine("ELSE;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = if_else->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
} else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
VisitBlock(ast->nodes);
} else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
} else if (const auto decoded = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
VisitBlock(decoded->nodes);
} else if (const auto var_set = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition));
ResetTemporaries();
} else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(ast->condition);
} else if (const auto do_while = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(do_while->condition);
ResetTemporaries();
AddLine("REP;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = do_while->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
AddLine("MOVC.U RC.x, {};", condition);
AddLine("BRK (NE.x);");
AddLine("ENDREP;");
} else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
const bool is_true = ExprIsTrue(ast->condition);
} else if (const auto ast_return = std::get_if<ASTReturn>(&*node->GetInnerData())) {
const bool is_true = ExprIsTrue(ast_return->condition);
if (!is_true) {
AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
AddLine("MOVC.U RC.x, {};", VisitExpression(ast_return->condition));
AddLine("IF NE.x;");
ResetTemporaries();
}
if (ast->kills) {
if (ast_return->kills) {
AddLine("KIL TR;");
} else {
Exit();
@ -1170,11 +1179,11 @@ void ARBDecompiler::VisitAST(const ASTNode& node) {
if (!is_true) {
AddLine("ENDIF;");
}
} else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
if (ExprIsTrue(ast->condition)) {
} else if (const auto ast_break = std::get_if<ASTBreak>(&*node->GetInnerData())) {
if (ExprIsTrue(ast_break->condition)) {
AddLine("BRK;");
} else {
AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
AddLine("MOVC.U RC.x, {};", VisitExpression(ast_break->condition));
AddLine("BRK (NE.x);");
ResetTemporaries();
}
@ -1342,7 +1351,7 @@ std::string ARBDecompiler::Visit(const Node& node) {
GetGenericAttributeIndex(index), swizzle);
}
}
UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
UNIMPLEMENTED_MSG("Unimplemented input attribute={}", index);
break;
}
return "{0, 0, 0, 0}.x";
@ -1363,7 +1372,8 @@ std::string ARBDecompiler::Visit(const Node& node) {
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
std::string temporary = AllocTemporary();
AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem));
AddLine("MOV {}, 0;", temporary);
AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem));
return temporary;
}
@ -1406,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) {
return {};
}
std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
UNIMPLEMENTED_IF(meta.sampler.is_indexed);
UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array &&
meta.sampler.type == Tegra::Shader::TextureType::TextureCube;
const std::size_t count = operation.GetOperandsCount();
std::string temporary = AllocVectorTemporary();
std::size_t i = 0;
@ -1419,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operati
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
}
if (meta.sampler.is_array) {
AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array));
++i;
}
if (meta.sampler.is_shadow) {
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
std::string compare = Visit(meta.depth_compare);
if (is_extended) {
ASSERT(i == 4);
std::string extra_coord = AllocVectorTemporary();
AddLine("MOV.F {}.x, {};", extra_coord, compare);
return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0};
}
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare);
++i;
}
return {std::move(temporary), i};
return {temporary, temporary, i};
}
std::string ARBDecompiler::BuildAoffi(Operation operation) {
@ -1441,18 +1460,21 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) {
}
std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) {
// Read a bindless SSBO, return its address and set CC accordingly
// address = c[binding].xy
// length = c[binding].z
const u32 binding = global_memory_names.at(gmem.GetDescriptor());
const char result_swizzle = binding % 2 == 0 ? 'x' : 'y';
const std::string pointer = AllocLongVectorTemporary();
std::string temporary = AllocTemporary();
const u32 local_index = binding / 2;
AddLine("PK64.U {}, c[{}];", pointer, local_index);
AddLine("PK64.U {}, c[{}];", pointer, binding);
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()),
Visit(gmem.GetBaseAddress()));
AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary);
AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer);
AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer);
// Compare offset to length and set CC
AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding);
return fmt::format("{}.x", pointer);
}
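Spelled out, the contract of this routine is: pack the 64-bit SSBO base from c[binding].xy, convert the 32-bit offset (real address minus base address) to 64 bits, add it into pointer.x, and leave CC holding the result of offset < c[binding].z. The predicated LOAD.U32 (NE.x) earlier in Visit, the IF NE.x around STORE.U32 in Assign, and the atomic guard above all branch on that same flag.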
@ -1463,9 +1485,7 @@ void ARBDecompiler::Exit() {
}
const auto safe_get_register = [this](u32 reg) -> std::string {
// TODO(Rodrigo): Replace with contains once C++20 releases
const auto& used_registers = ir.GetRegisters();
if (used_registers.find(reg) != used_registers.end()) {
if (ir.GetRegisters().contains(reg)) {
return fmt::format("R{}.x", reg);
}
return "{0, 0, 0, 0}.x";
@ -1552,7 +1572,9 @@ std::string ARBDecompiler::Assign(Operation operation) {
ResetTemporaries();
return {};
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
AddLine("IF NE.x;");
AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem));
AddLine("ENDIF;");
ResetTemporaries();
return {};
} else {
@ -1844,7 +1866,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
std::string ARBDecompiler::Texture(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
std::string_view opcode = "TEX";
std::string extra;
@ -1873,7 +1895,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
}
}
AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id,
TextureType(meta), BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
@ -1882,7 +1904,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
std::string ARBDecompiler::TextureGather(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
std::string comp;
if (!meta.sampler.is_shadow) {
@ -1892,7 +1914,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) {
AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
TextureType(meta), BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
}
@ -1930,13 +1952,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) {
std::string ARBDecompiler::TexelFetch(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
if (!meta.sampler.is_buffer) {
ASSERT(swizzle < 4);
AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
}
AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta),
BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
@ -1947,7 +1969,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) {
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const std::string ddx = AllocVectorTemporary();
const std::string ddy = AllocVectorTemporary();
const std::string coord = BuildCoords(operation).first;
const std::string coord = std::get<1>(BuildCoords(operation));
const std::size_t num_components = meta.derivates.size() / 2;
for (std::size_t index = 0; index < num_components; ++index) {

View File

@ -22,11 +22,11 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
: VideoCommon::BufferBlock{cpu_addr, size} {
Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
: BufferBlock{cpu_addr_, size_} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}
@ -34,14 +34,14 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
Buffer::~Buffer() = default;
void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
data);
void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(data_size), data);
}
void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
const GLintptr gl_offset = static_cast<GLintptr>(offset);
if (read_buffer.handle == 0) {
read_buffer.Create();
@ -54,17 +54,16 @@ void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
}
void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t size) {
std::size_t copy_size) {
glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size));
}
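For readers unfamiliar with the DSA (direct state access) calls this file leans
on: glNamedBufferData, glNamedBufferSubData, and glCopyNamedBufferSubData act
on buffer names directly, with no bind-to-target step. A self-contained sketch
under that assumption; the helper name is illustrative:

    #include <vector>
    #include <glad/glad.h>

    // Create two buffers, upload into one, and copy across, all without binding.
    GLuint MakeAndCopy(const std::vector<unsigned char>& src_data) {
        GLuint src = 0;
        GLuint dst = 0;
        glCreateBuffers(1, &src);
        glCreateBuffers(1, &dst);
        const auto size = static_cast<GLsizeiptr>(src_data.size());
        glNamedBufferData(src, size, src_data.data(), GL_DYNAMIC_DRAW);
        glNamedBufferData(dst, size, nullptr, GL_DYNAMIC_DRAW);
        glCopyNamedBufferSubData(src, dst, 0, 0, size);  // same call Buffer::CopyFrom uses
        glDeleteBuffers(1, &src);
        return dst;
    }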
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device_, std::size_t stream_size)
: GenericBufferCache{rasterizer, gpu_memory, cpu_memory,
std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
device{device_} {
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
const Device& device_, OGLStreamBuffer& stream_buffer_,
StateTracker& state_tracker)
: GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
if (!device.HasFastBufferSubData()) {
return;
}

View File

@ -22,18 +22,19 @@ namespace OpenGL {
class Device;
class OGLStreamBuffer;
class RasterizerOpenGL;
class StateTracker;
class Buffer : public VideoCommon::BufferBlock {
public:
explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
~Buffer();
void Upload(std::size_t offset, std::size_t size, const u8* data);
void Upload(std::size_t offset, std::size_t data_size, const u8* data);
void Download(std::size_t offset, std::size_t size, u8* data);
void Download(std::size_t offset, std::size_t data_size, u8* data);
void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t size);
std::size_t copy_size);
GLuint Handle() const noexcept {
return gl_buffer.handle;
@ -54,7 +55,8 @@ class OGLBufferCache final : public GenericBufferCache {
public:
explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device, std::size_t stream_size);
const Device& device, OGLStreamBuffer& stream_buffer,
StateTracker& state_tracker);
~OGLBufferCache();
BufferInfo GetEmptyBuffer(std::size_t) override;

View File

@ -5,9 +5,11 @@
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <optional>
#include <span>
#include <vector>
#include <glad/glad.h>
@ -27,27 +29,29 @@ constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
constexpr std::array LimitUBOs = {
constexpr std::array LIMIT_UBOS = {
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
constexpr std::array LimitSSBOs = {
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS,
};
constexpr std::array LIMIT_SSBOS = {
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
constexpr std::array LimitImages = {
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS,
};
constexpr std::array LIMIT_SAMPLERS = {
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS,
};
constexpr std::array LIMIT_IMAGES = {
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS,
};
template <typename T>
T GetInteger(GLenum pname) {
@ -76,8 +80,8 @@ std::vector<std::string_view> GetExtensions() {
return extensions;
}
bool HasExtension(const std::vector<std::string_view>& images, std::string_view extension) {
return std::find(images.begin(), images.end(), extension) != images.end();
bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) {
return std::ranges::find(extensions, extension) != extensions.end();
}
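Two C++20 touches land in this hunk: std::span lets the helper accept any
contiguous sequence of names without templates or copies, and std::ranges::find
drops the begin()/end() pair. A minimal sketch outside the diff:

    #include <algorithm>
    #include <span>
    #include <string_view>
    #include <vector>

    bool Contains(std::span<const std::string_view> haystack, std::string_view needle) {
        return std::ranges::find(haystack, needle) != haystack.end();
    }

    int main() {
        const std::vector<std::string_view> exts{"GL_KHR_debug", "GL_EXT_debug_tool"};
        return Contains(exts, "GL_EXT_debug_tool") ? 0 : 1;
    }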
u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
@ -91,8 +95,8 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
std::ranges::transform(LIMIT_UBOS, max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
return max;
}
@ -115,9 +119,10 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
bindings[stage] = {
Extract(base_ubo, num_ubos, total_ubos / NumStages, LimitUBOs[stage]),
Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LimitSSBOs[stage]),
Extract(base_samplers, num_samplers, total_samplers / NumStages, LimitSamplers[stage])};
Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]),
Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]),
Extract(base_samplers, num_samplers, total_samplers / NumStages,
LIMIT_SAMPLERS[stage])};
}
u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
@ -130,7 +135,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
// Reserve at least 4 image bindings on the fragment stage.
bindings[4].image =
Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]);
// This is guaranteed to be at least 1.
const u32 total_extracted_images = num_images / (NumStages - 1);
@ -142,7 +147,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
continue;
}
bindings[stage].image =
Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]);
}
// Compute doesn't care about any of this.
@ -188,17 +193,22 @@ bool IsASTCSupported() {
return true;
}
[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) {
const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
}
} // Anonymous namespace
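The new IsDebugToolAttached helper keys off two environment variables plus an
extension query. One detail worth a sketch: std::getenv returns nullptr when a
variable is unset, so presence alone is the signal (variable names taken from
the diff):

    #include <cstdlib>

    bool LaunchedUnderNsight() {
        // Either variable being present implies the Nsight injector is active.
        return std::getenv("NVTX_INJECTION64_PATH") != nullptr ||
               std::getenv("NSIGHT_LAUNCHED") != nullptr;
    }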
Device::Device()
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
const bool is_intel = vendor == "Intel";
bool disable_fast_buffer_sub_data = false;
if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
@ -207,9 +217,8 @@ Device::Device()
"Beta driver 443.24 is known to have issues. There might be performance issues.");
disable_fast_buffer_sub_data = true;
}
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
@ -223,8 +232,10 @@ Device::Device()
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
has_broken_texture_view_formats = is_amd || is_intel;
has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
has_debugging_tool_attached = IsDebugToolAttached(extensions);
// At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
// uniform buffers as "push constants"
@ -239,6 +250,8 @@ Device::Device()
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
LOG_INFO(Render_OpenGL, "Renderer_BrokenTextureViewFormats: {}",
has_broken_texture_view_formats);
if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");

View File

@ -36,11 +36,11 @@ public:
return GetBaseBindings(static_cast<std::size_t>(shader_type));
}
std::size_t GetUniformBufferAlignment() const {
size_t GetUniformBufferAlignment() const {
return uniform_buffer_alignment;
}
std::size_t GetShaderStorageBufferAlignment() const {
size_t GetShaderStorageBufferAlignment() const {
return shader_storage_alignment;
}
@ -96,6 +96,10 @@ public:
return has_precise_bug;
}
bool HasBrokenTextureViewFormats() const {
return has_broken_texture_view_formats;
}
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
@ -104,6 +108,10 @@ public:
return has_nv_viewport_array2;
}
bool HasDebuggingToolAttached() const {
return has_debugging_tool_attached;
}
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
@ -118,8 +126,8 @@ private:
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
size_t uniform_buffer_alignment{};
size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
u32 max_varyings{};
u32 max_compute_shared_memory_size{};
@ -133,8 +141,10 @@ private:
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
bool has_broken_texture_view_formats{};
bool has_fast_buffer_sub_data{};
bool has_nv_viewport_array2{};
bool has_debugging_tool_attached{};
bool use_assembly_shaders{};
bool use_asynchronous_shaders{};
};

View File

@ -11,10 +11,10 @@
namespace OpenGL {
GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {}
GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {}
GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed)
: FenceBase(address, payload, is_stubbed) {}
GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: FenceBase{address_, payload_, is_stubbed_} {}
GLInnerFence::~GLInnerFence() = default;
@ -45,10 +45,10 @@ void GLInnerFence::Wait() {
glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED);
}
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
TextureCacheOpenGL& texture_cache,
OGLBufferCache& buffer_cache, QueryCache& query_cache)
: GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {}
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
Tegra::GPU& gpu_, TextureCache& texture_cache_,
OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
return std::make_shared<GLInnerFence>(value, is_stubbed);

View File

@ -17,8 +17,8 @@ namespace OpenGL {
class GLInnerFence : public VideoCommon::FenceBase {
public:
GLInnerFence(u32 payload, bool is_stubbed);
GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed);
explicit GLInnerFence(u32 payload_, bool is_stubbed_);
explicit GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_);
~GLInnerFence();
void Queue();
@ -33,13 +33,13 @@ private:
using Fence = std::shared_ptr<GLInnerFence>;
using GenericFenceManager =
VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>;
VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
class FenceManagerOpenGL final : public GenericFenceManager {
public:
explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache,
QueryCache& query_cache);
explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
TextureCache& texture_cache_, OGLBufferCache& buffer_cache_,
QueryCache& query_cache_);
protected:
Fence CreateFence(u32 value, bool is_stubbed) override;

View File

@ -1,85 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <tuple>
#include <unordered_map>
#include <utility>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
namespace OpenGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using VideoCore::Surface::SurfaceType;
FramebufferCacheOpenGL::FramebufferCacheOpenGL() = default;
FramebufferCacheOpenGL::~FramebufferCacheOpenGL() = default;
GLuint FramebufferCacheOpenGL::GetFramebuffer(const FramebufferCacheKey& key) {
const auto [entry, is_cache_miss] = cache.try_emplace(key);
auto& framebuffer{entry->second};
if (is_cache_miss) {
framebuffer = CreateFramebuffer(key);
}
return framebuffer.handle;
}
OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheKey& key) {
OGLFramebuffer framebuffer;
framebuffer.Create();
// TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs.
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle);
if (key.zeta) {
const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil;
const GLenum attach_target = stencil ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT;
key.zeta->Attach(attach_target, GL_DRAW_FRAMEBUFFER);
}
std::size_t num_buffers = 0;
std::array<GLenum, Maxwell::NumRenderTargets> targets;
for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
if (!key.colors[index]) {
targets[index] = GL_NONE;
continue;
}
const GLenum attach_target = GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index);
key.colors[index]->Attach(attach_target, GL_DRAW_FRAMEBUFFER);
const u32 attachment = (key.color_attachments >> (BitsPerAttachment * index)) & 0b1111;
targets[index] = GL_COLOR_ATTACHMENT0 + attachment;
num_buffers = index + 1;
}
if (num_buffers > 0) {
glDrawBuffers(static_cast<GLsizei>(num_buffers), std::data(targets));
} else {
glDrawBuffer(GL_NONE);
}
return framebuffer;
}
std::size_t FramebufferCacheKey::Hash() const noexcept {
std::size_t hash = std::hash<View>{}(zeta);
for (const auto& color : colors) {
hash ^= std::hash<View>{}(color);
}
hash ^= static_cast<std::size_t>(color_attachments) << 16;
return hash;
}
bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const noexcept {
return std::tie(colors, zeta, color_attachments) ==
std::tie(rhs.colors, rhs.zeta, rhs.color_attachments);
}
} // namespace OpenGL

View File

@ -1,68 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <unordered_map>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
namespace OpenGL {
constexpr std::size_t BitsPerAttachment = 4;
struct FramebufferCacheKey {
View zeta;
std::array<View, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors;
u32 color_attachments = 0;
std::size_t Hash() const noexcept;
bool operator==(const FramebufferCacheKey& rhs) const noexcept;
bool operator!=(const FramebufferCacheKey& rhs) const noexcept {
return !operator==(rhs);
}
void SetAttachment(std::size_t index, u32 attachment) {
color_attachments |= attachment << (BitsPerAttachment * index);
}
};
} // namespace OpenGL
namespace std {
template <>
struct hash<OpenGL::FramebufferCacheKey> {
std::size_t operator()(const OpenGL::FramebufferCacheKey& k) const noexcept {
return k.Hash();
}
};
} // namespace std
namespace OpenGL {
class FramebufferCacheOpenGL {
public:
FramebufferCacheOpenGL();
~FramebufferCacheOpenGL();
GLuint GetFramebuffer(const FramebufferCacheKey& key);
private:
OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key);
std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache;
};
} // namespace OpenGL

View File

@ -30,11 +30,9 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) {
} // Anonymous namespace
QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
Tegra::MemoryManager& gpu_memory)
: VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>(
rasterizer, maxwell3d, gpu_memory),
gl_rasterizer{rasterizer} {}
QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_,
Tegra::MemoryManager& gpu_memory_)
: QueryCacheBase(rasterizer_, maxwell3d_, gpu_memory_), gl_rasterizer{rasterizer_} {}
QueryCache::~QueryCache() = default;
@ -59,10 +57,11 @@ bool QueryCache::AnyCommandQueued() const noexcept {
return gl_rasterizer.AnyCommandQueued();
}
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type)
: VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
type{type}, query{cache.AllocateQuery(type)} {
HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_)
: HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, query{
cache.AllocateQuery(
type)} {
glBeginQuery(GetTarget(type), query.handle);
}
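For context, the GL query round trip that HostCounter splits across its
constructor (glBeginQuery above) and its EndQuery/BlockingQuery methods, shown
here as one standalone sketch:

    #include <cstdint>
    #include <glad/glad.h>

    std::uint64_t CountSamplesPassed(void (*draw)()) {
        GLuint query = 0;
        glCreateQueries(GL_SAMPLES_PASSED, 1, &query);
        glBeginQuery(GL_SAMPLES_PASSED, query);
        draw();                         // issue the work to be measured
        glEndQuery(GL_SAMPLES_PASSED);
        GLuint64 result = 0;
        glGetQueryObjectui64v(query, GL_QUERY_RESULT, &result);  // blocks until ready
        glDeleteQueries(1, &query);
        return static_cast<std::uint64_t>(result);
    }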
@ -86,13 +85,14 @@ u64 HostCounter::BlockingQuery() const {
return static_cast<u64>(value);
}
CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
: VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
CachedQuery::CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_,
u8* host_ptr_)
: CachedQueryBase{cpu_addr_, host_ptr_}, cache{&cache_}, type{type_} {}
CachedQuery::~CachedQuery() = default;
CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
: VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
: CachedQueryBase(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
cache = rhs.cache;

View File

@ -29,8 +29,8 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
class QueryCache final
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> {
public:
explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
Tegra::MemoryManager& gpu_memory);
explicit QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_,
Tegra::MemoryManager& gpu_memory_);
~QueryCache();
OGLQuery AllocateQuery(VideoCore::QueryType type);
@ -46,8 +46,8 @@ private:
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
public:
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type);
explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_);
~HostCounter();
void EndQuery();
@ -62,8 +62,8 @@ private:
class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
public:
explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
u8* host_ptr);
explicit CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_,
u8* host_ptr_);
~CachedQuery() override;
CachedQuery(CachedQuery&& rhs) noexcept;

View File

@ -25,12 +25,15 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/shader_cache.h"
#include "video_core/texture_cache/texture_cache.h"
namespace OpenGL {
@ -55,18 +58,32 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
namespace {
constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
constexpr size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
constexpr size_t TOTAL_CONST_BUFFER_BYTES =
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
constexpr size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
constexpr size_t MAX_TEXTURES = 192;
constexpr size_t MAX_IMAGES = 48;
struct TextureHandle {
constexpr TextureHandle(u32 data, bool via_header_index) {
const Tegra::Texture::TextureHandle handle{data};
image = handle.tic_id;
sampler = via_header_index ? image : handle.tsc_id.Value();
}
u32 image;
u32 sampler;
};
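A hedged note on what TextureHandle unpacks: a Maxwell texture handle packs the
TIC (texture descriptor) index and TSC (sampler descriptor) index into one
32-bit word, and via-header-index sampling reuses the TIC index as the sampler
index. Assuming the usual 20/12 bit split (the authoritative layout lives in
Tegra::Texture::TextureHandle):

    #include <cstdint>

    struct UnpackedHandle {
        std::uint32_t image;    // TIC id: which texture descriptor
        std::uint32_t sampler;  // TSC id: which sampler descriptor
    };

    UnpackedHandle Unpack(std::uint32_t raw, bool via_header_index) {
        const std::uint32_t tic = raw & 0xfffffu;        // assumed bits [0, 20)
        const std::uint32_t tsc = (raw >> 20) & 0xfffu;  // assumed bits [20, 32)
        return {tic, via_header_index ? tic : tsc};
    }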
template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
ShaderType shader_type, std::size_t index = 0) {
TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry,
ShaderType shader_type, size_t index = 0) {
if constexpr (std::is_same_v<Entry, SamplerEntry>) {
if (entry.is_separated) {
const u32 buffer_1 = entry.buffer;
@ -75,21 +92,16 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
const u32 offset_2 = entry.secondary_offset;
const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
return engine.GetTextureInfo(handle_1 | handle_2);
return TextureHandle(handle_1 | handle_2, via_header_index);
}
}
if (entry.is_bindless) {
const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
return engine.GetTextureInfo(handle);
}
const auto& gpu_profile = engine.AccessGuestDriverProfile();
const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
return engine.GetStageTexture(shader_type, offset);
} else {
return engine.GetTexture(offset);
const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
return TextureHandle(raw, via_header_index);
}
const u32 buffer = engine.GetBoundBuffer();
const u64 offset = (entry.offset + index) * sizeof(u32);
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
}
std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
@ -97,7 +109,6 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
if (!entry.IsIndirect()) {
return entry.GetSize();
}
if (buffer.size > Maxwell::MaxConstBufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
Maxwell::MaxConstBufferSize);
@ -131,7 +142,7 @@ std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
case 43:
return {GL_BACK_SECONDARY_COLOR_NV, 0};
}
UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
UNIMPLEMENTED_MSG("index={}", index);
return {GL_POSITION, 0};
}
@ -139,35 +150,68 @@ void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) {
if (num_entries == 0) {
void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
if (num_ssbos == 0) {
return;
}
if (num_entries % 2 == 1) {
pointers[num_entries] = 0;
glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
reinterpret_cast<const GLuint*>(ssbos));
}
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) {
return ImageViewType::Buffer;
}
const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2);
glProgramLocalParametersI4uivNV(target, 0, num_vectors,
reinterpret_cast<const GLuint*>(pointers));
switch (entry.type) {
case Tegra::Shader::TextureType::Texture1D:
return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D;
case Tegra::Shader::TextureType::Texture2D:
return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D;
case Tegra::Shader::TextureType::Texture3D:
return ImageViewType::e3D;
case Tegra::Shader::TextureType::TextureCube:
return entry.is_array ? ImageViewType::CubeArray : ImageViewType::Cube;
}
UNREACHABLE();
return ImageViewType::e2D;
}
ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {
switch (entry.type) {
case Tegra::Shader::ImageType::Texture1D:
return ImageViewType::e1D;
case Tegra::Shader::ImageType::Texture1DArray:
return ImageViewType::e1DArray;
case Tegra::Shader::ImageType::Texture2D:
return ImageViewType::e2D;
case Tegra::Shader::ImageType::Texture2DArray:
return ImageViewType::e2DArray;
case Tegra::Shader::ImageType::Texture3D:
return ImageViewType::e3D;
case Tegra::Shader::ImageType::TextureBuffer:
return ImageViewType::Buffer;
}
UNREACHABLE();
return ImageViewType::e2D;
}
} // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory, const Device& device_,
RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_)
: RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
: RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker),
shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device),
stream_buffer(device, state_tracker),
texture_cache_runtime(device, program_manager, state_tracker),
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
query_cache(*this, maxwell3d, gpu_memory),
buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE),
buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
async_shaders(emu_window) {
CheckExtensions();
async_shaders(emu_window_) {
unified_uniform_buffer.Create();
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
@ -178,7 +222,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra:
nullptr, 0);
}
}
if (device.UseAsynchronousShaders()) {
async_shaders.AllocateWorkers();
}
@ -190,14 +233,6 @@ RasterizerOpenGL::~RasterizerOpenGL() {
}
}
void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
LOG_WARNING(
Render_OpenGL,
"Anisotropic filter is not supported! This can cause graphical issues in some games.");
}
}
void RasterizerOpenGL::SetupVertexFormat() {
auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexFormats]) {
@ -320,10 +355,16 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
return info.offset;
}
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
void RasterizerOpenGL::SetupShaders() {
MICROPROFILE_SCOPE(OpenGL_Shader);
u32 clip_distances = 0;
std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
image_view_indices.clear();
sampler_handles.clear();
texture_cache.SynchronizeGraphicsDescriptors();
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = maxwell3d.regs.shader_config[index];
const auto program{static_cast<Maxwell::ShaderProgram>(index)};
@ -342,7 +383,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
continue;
}
// Currently these stages are not supported in the OpenGL backend.
// TODO(Blinkhawk): Port tessellation shaders from Vulkan to OpenGL
if (program == Maxwell::ShaderProgram::TesselationControl ||
@ -351,7 +391,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
Shader* const shader = shader_cache.GetStageProgram(program, async_shaders);
const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0;
switch (program) {
case Maxwell::ShaderProgram::VertexA:
@ -367,14 +406,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
default:
UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index,
shader_config.enable.Value(), shader_config.offset);
break;
}
// Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1;
const size_t stage = index == 0 ? 0 : index - 1;
shaders[stage] = shader;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(stage, shader);
SetupDrawImages(stage, shader);
SetupDrawTextures(shader, stage);
SetupDrawImages(shader, stage);
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
@ -388,9 +430,23 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
++index;
}
}
SyncClipEnabled(clip_distances);
maxwell3d.dirty.flags[Dirty::Shaders] = false;
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
size_t image_view_index = 0;
size_t texture_index = 0;
size_t image_index = 0;
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
const Shader* const shader = shaders[stage];
if (shader) {
const auto base = device.GetBaseBindings(stage);
BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
texture_index, image_index);
}
}
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@ -421,98 +477,6 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
shader_cache.LoadDiskCache(title_id, stop_loading, callback);
}
void RasterizerOpenGL::ConfigureFramebuffers() {
MICROPROFILE_SCOPE(OpenGL_Framebuffer);
if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) {
return;
}
maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false;
texture_cache.GuardRenderTargets(true);
View depth_surface = texture_cache.GetDepthBufferSurface(true);
const auto& regs = maxwell3d.regs;
UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
// Bind the framebuffer surfaces
FramebufferCacheKey key;
const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
for (std::size_t index = 0; index < colors_count; ++index) {
View color_surface{texture_cache.GetColorBufferSurface(index, true)};
if (!color_surface) {
continue;
}
// Assume that a surface will be written to if it is used as a framebuffer, even
// if the shader doesn't actually write to it.
texture_cache.MarkColorBufferInUse(index);
key.SetAttachment(index, regs.rt_control.GetMap(index));
key.colors[index] = std::move(color_surface);
}
if (depth_surface) {
// Assume that a surface will be written to if it is used as a framebuffer, even if
// the shader doesn't actually write to it.
texture_cache.MarkDepthBufferInUse();
key.zeta = std::move(depth_surface);
}
texture_cache.GuardRenderTargets(false);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) {
const auto& regs = maxwell3d.regs;
texture_cache.GuardRenderTargets(true);
View color_surface;
if (using_color) {
// Determine if we have to preserve the contents.
// First we have to make sure all clear masks are enabled.
bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G ||
!regs.clear_buffers.B || !regs.clear_buffers.A;
const std::size_t index = regs.clear_buffers.RT;
if (regs.clear_flags.scissor) {
// Then we have to confirm scissor testing clears the whole image.
const auto& scissor = regs.scissor_test[0];
preserve_contents |= scissor.min_x > 0;
preserve_contents |= scissor.min_y > 0;
preserve_contents |= scissor.max_x < regs.rt[index].width;
preserve_contents |= scissor.max_y < regs.rt[index].height;
}
color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents);
texture_cache.MarkColorBufferInUse(index);
}
View depth_surface;
if (using_depth_stencil) {
bool preserve_contents = false;
if (regs.clear_flags.scissor) {
// For depth stencil clears we only have to confirm scissor test covers the whole image.
const auto& scissor = regs.scissor_test[0];
preserve_contents |= scissor.min_x > 0;
preserve_contents |= scissor.min_y > 0;
preserve_contents |= scissor.max_x < regs.zeta_width;
preserve_contents |= scissor.max_y < regs.zeta_height;
}
depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents);
texture_cache.MarkDepthBufferInUse();
}
texture_cache.GuardRenderTargets(false);
FramebufferCacheKey key;
key.colors[0] = std::move(color_surface);
key.zeta = std::move(depth_surface);
state_tracker.NotifyFramebuffer();
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
void RasterizerOpenGL::Clear() {
if (!maxwell3d.ShouldExecute()) {
return;
@ -527,8 +491,9 @@ void RasterizerOpenGL::Clear() {
regs.clear_buffers.A) {
use_color = true;
state_tracker.NotifyColorMask0();
glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0,
const GLuint index = regs.clear_buffers.RT;
state_tracker.NotifyColorMask(index);
glColorMaski(index, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0,
regs.clear_buffers.B != 0, regs.clear_buffers.A != 0);
// TODO(Rodrigo): Determine if clamping is used on clears
@ -561,15 +526,17 @@ void RasterizerOpenGL::Clear() {
state_tracker.NotifyScissor0();
glDisablei(GL_SCISSOR_TEST, 0);
}
UNIMPLEMENTED_IF(regs.clear_flags.viewport);
ConfigureClearFramebuffer(use_color, use_depth || use_stencil);
if (use_color) {
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
{
auto lock = texture_cache.AcquireLock();
texture_cache.UpdateRenderTargets(true);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
}
if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
}
if (use_depth && use_stencil) {
glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
} else if (use_depth) {
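The clear now honors regs.clear_buffers.RT instead of hardcoding draw buffer 0.
As a standalone reference for the glClearBuffer* family used here:

    #include <glad/glad.h>

    // glClearBufferfv clears one color attachment by draw-buffer index;
    // glClearBufferfi clears depth and stencil together (its drawbuffer must be 0).
    void ClearAttachment(GLint rt_index, const GLfloat color[4], GLfloat depth, GLint stencil) {
        glClearBufferfv(GL_COLOR, rt_index, color);
        glClearBufferfi(GL_DEPTH_STENCIL, 0, depth, stencil);
    }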
@ -626,16 +593,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
const bool invalidated = buffer_cache.Map(buffer_size);
if (invalidated) {
// When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
auto& dirty = maxwell3d.dirty.flags;
dirty[Dirty::VertexBuffers] = true;
for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
dirty[index] = true;
}
}
buffer_cache.Map(buffer_size);
// Prepare vertex array format.
SetupVertexFormat();
@ -659,22 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
SetupShaders(primitive_mode);
texture_cache.GuardSamplers(false);
ConfigureFramebuffers();
auto lock = texture_cache.AcquireLock();
SetupShaders();
// Signal the buffer cache that we are not going to upload more things.
buffer_cache.Unmap();
texture_cache.UpdateRenderTargets(false);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
program_manager.BindGraphicsPipeline();
if (texture_cache.TextureBarrier()) {
glTextureBarrier();
}
const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
BeginTransformFeedback(primitive_mode);
const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance);
@ -726,15 +678,13 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Acquire();
current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr);
program_manager.BindCompute(kernel->GetHandle());
Shader* const kernel = shader_cache.GetComputeKernel(code_addr);
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
auto lock = texture_cache.AcquireLock();
BindComputeTextures(kernel);
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);
SetupComputeConstBuffers(kernel);
@ -743,7 +693,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Unmap();
const auto& launch_desc = kepler_compute.launch_description;
program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
@ -764,7 +713,10 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.FlushRegion(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.DownloadMemory(addr, size);
}
buffer_cache.FlushRegion(addr, size);
query_cache.FlushRegion(addr, size);
}
@ -773,7 +725,8 @@ bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
if (!Settings::IsGPULevelHigh()) {
return buffer_cache.MustFlushRegion(addr, size);
}
return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size);
return texture_cache.IsRegionGpuModified(addr, size) ||
buffer_cache.MustFlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@ -781,7 +734,10 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.InvalidateRegion(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.WriteMemory(addr, size);
}
shader_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
@ -792,18 +748,29 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.OnCPUWrite(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.WriteMemory(addr, size);
}
shader_cache.OnCPUWrite(addr, size);
buffer_cache.OnCPUWrite(addr, size);
}
void RasterizerOpenGL::SyncGuestHost() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
texture_cache.SyncGuestHost();
buffer_cache.SyncGuestHost();
shader_cache.SyncGuestHost();
}
void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
{
auto lock = texture_cache.AcquireLock();
texture_cache.UnmapMemory(addr, size);
}
buffer_cache.OnCPUWrite(addr, size);
shader_cache.OnCPUWrite(addr, size);
}
void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
if (!gpu.IsAsync()) {
gpu_memory.Write<u32>(addr, value);
@ -845,6 +812,14 @@ void RasterizerOpenGL::WaitForIdle() {
GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
}
void RasterizerOpenGL::FragmentBarrier() {
glMemoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT);
}
void RasterizerOpenGL::TiledCacheBarrier() {
glTextureBarrier();
}
void RasterizerOpenGL::FlushCommands() {
// Only flush when we have commands queued to OpenGL.
if (num_queued_commands == 0) {
@ -858,53 +833,103 @@ void RasterizerOpenGL::TickFrame() {
// Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
num_queued_commands = 0;
fence_manager.TickFrame();
buffer_cache.TickFrame();
{
auto lock = texture_cache.AcquireLock();
texture_cache.TickFrame();
}
}
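The brace-scoped AcquireLock pattern recurs throughout this file: the
texture-cache lock is held only for the cache call and released before
unrelated work. A minimal sketch of the idiom, assuming AcquireLock returns an
RAII lock over an internal mutex:

    #include <mutex>

    void TickSketch(std::mutex& texture_cache_mutex) {
        {
            // The lock lives only inside these braces, mirroring AcquireLock's usage.
            std::scoped_lock lock{texture_cache_mutex};
            // ... touch the texture cache here ...
        }
        // ... frame work that must not hold the lock continues here ...
    }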
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
texture_cache.DoFermiCopy(src, dst, copy_config);
auto lock = texture_cache.AcquireLock();
texture_cache.BlitImage(dst, src, copy_config);
return true;
}
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
if (!framebuffer_addr) {
return {};
if (framebuffer_addr == 0) {
return false;
}
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)};
if (!surface) {
return {};
auto lock = texture_cache.AcquireLock();
ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
if (!image_view) {
return false;
}
// Verify that the cached surface is the same size and format as the requested framebuffer
const auto& params{surface->GetSurfaceParams()};
const auto& pixel_format{
VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)};
ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
if (params.pixel_format != pixel_format) {
LOG_DEBUG(Render_OpenGL, "Framebuffer pixel_format is different");
}
screen_info.display_texture = surface->GetTexture();
screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion;
// ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different");
// ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different");
screen_info.display_texture = image_view->Handle(ImageViewType::e2D);
screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format);
return true;
}
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT = {
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
void RasterizerOpenGL::BindComputeTextures(Shader* kernel) {
image_view_indices.clear();
sampler_handles.clear();
texture_cache.SynchronizeComputeDescriptors();
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillComputeImageViews(indices_span, image_view_ids);
program_manager.BindCompute(kernel->GetHandle());
size_t image_view_index = 0;
size_t texture_index = 0;
size_t image_index = 0;
BindTextures(kernel->GetEntries(), 0, 0, image_view_index, texture_index, image_index);
}
void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_texture,
GLuint base_image, size_t& image_view_index,
size_t& texture_index, size_t& image_index) {
const GLuint* const samplers = sampler_handles.data() + texture_index;
const GLuint* const textures = texture_handles.data() + texture_index;
const GLuint* const images = image_handles.data() + image_index;
const size_t num_samplers = entries.samplers.size();
for (const auto& sampler : entries.samplers) {
for (size_t i = 0; i < sampler.size; ++i) {
const ImageViewId image_view_id = image_view_ids[image_view_index++];
const ImageView& image_view = texture_cache.GetImageView(image_view_id);
const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(sampler));
texture_handles[texture_index++] = handle;
}
}
const size_t num_images = entries.images.size();
for (size_t unit = 0; unit < num_images; ++unit) {
// TODO: Mark as modified
const ImageViewId image_view_id = image_view_ids[image_view_index++];
const ImageView& image_view = texture_cache.GetImageView(image_view_id);
const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(entries.images[unit]));
image_handles[image_index] = handle;
++image_index;
}
if (num_samplers > 0) {
glBindSamplers(base_texture, static_cast<GLsizei>(num_samplers), samplers);
glBindTextures(base_texture, static_cast<GLsizei>(num_samplers), textures);
}
if (num_images > 0) {
glBindImageTextures(base_image, static_cast<GLsizei>(num_images), images);
}
}
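The rewritten binding path batches everything through ARB_multi_bind: one call
binds a contiguous range of units instead of a per-unit glBindTextureUnit loop.
A self-contained sketch with illustrative handles:

    #include <array>
    #include <glad/glad.h>

    void BindRange(GLuint first_unit) {
        const std::array<GLuint, 3> textures{1, 2, 3};  // GL texture names (illustrative)
        const std::array<GLuint, 3> samplers{4, 5, 6};  // GL sampler names (illustrative)
        glBindTextures(first_unit, static_cast<GLsizei>(textures.size()), textures.data());
        glBindSamplers(first_unit, static_cast<GLsizei>(samplers.size()), samplers.data());
    }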
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT{
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
};
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = maxwell3d.state.shader_stages;
const auto& shader_stage = stages[stage_index];
@ -1003,12 +1028,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh
GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
};
const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
const auto& entries{shader->GetEntries().global_memory_entries};
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
const bool assembly_shaders = device.UseAssemblyShaders();
u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
@ -1016,11 +1040,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (assembly_shaders) {
UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size());
UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
}
}
@ -1028,106 +1052,85 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
const auto& entries{kernel->GetEntries().global_memory_entries};
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
u32 binding = 0;
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (device.UseAssemblyShaders()) {
UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size());
UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size,
GLuint64EXT* pointer) {
const std::size_t alignment{device.GetShaderStorageBufferAlignment()};
GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
const size_t alignment{device.GetShaderStorageBufferAlignment()};
const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
if (device.UseAssemblyShaders()) {
*pointer = info.address + info.offset;
*ssbo = BindlessSSBO{
.address = static_cast<GLuint64EXT>(info.address + info.offset),
.length = static_cast<GLsizei>(size),
.padding = 0,
};
} else {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
}
}
void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
MICROPROFILE_SCOPE(OpenGL_Texture);
u32 binding = device.GetBaseBindings(stage_index).sampler;
void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : shader->GetEntries().samplers) {
const auto shader_type = static_cast<ShaderType>(stage_index);
for (std::size_t i = 0; i < entry.size; ++i) {
const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
SetupTexture(binding++, texture, entry);
for (size_t index = 0; index < entry.size; ++index) {
const auto handle =
GetTextureInfo(maxwell3d, via_header_index, entry, shader_type, index);
const Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler);
sampler_handles.push_back(sampler->Handle());
image_view_indices.push_back(handle.image);
}
}
}
void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_Texture);
u32 binding = 0;
void RasterizerOpenGL::SetupComputeTextures(const Shader* kernel) {
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : kernel->GetEntries().samplers) {
for (std::size_t i = 0; i < entry.size; ++i) {
const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i);
SetupTexture(binding++, texture, entry);
for (size_t i = 0; i < entry.size; ++i) {
const auto handle =
GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute, i);
const Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler);
sampler_handles.push_back(sampler->Handle());
image_view_indices.push_back(handle.image);
}
}
}
void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry) {
const auto view = texture_cache.GetTextureSurface(texture.tic, entry);
if (!view) {
// Can occur when texture addr is null or its memory is unmapped/invalid
glBindSampler(binding, 0);
glBindTextureUnit(binding, 0);
return;
}
const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
texture.tic.z_source, texture.tic.w_source);
glBindTextureUnit(binding, handle);
if (!view->GetSurfaceParams().IsBuffer()) {
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
}
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
u32 binding = device.GetBaseBindings(stage_index).image;
void RasterizerOpenGL::SetupDrawImages(const Shader* shader, size_t stage_index) {
const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : shader->GetEntries().images) {
const auto shader_type = static_cast<ShaderType>(stage_index);
const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
SetupImage(binding++, tic, entry);
const auto handle = GetTextureInfo(maxwell3d, via_header_index, entry, shader_type);
image_view_indices.push_back(handle.image);
}
}
void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
u32 binding = 0;
void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : shader->GetEntries().images) {
const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic;
SetupImage(binding++, tic, entry);
const auto handle =
GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute);
image_view_indices.push_back(handle.image);
}
}
void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
const ImageEntry& entry) {
const auto view = texture_cache.GetImageSurface(tic, entry);
if (!view) {
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
return;
}
if (entry.is_written) {
view->MarkAsModified(texture_cache.Tick());
}
const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
}
void RasterizerOpenGL::SyncViewport() {
auto& flags = maxwell3d.dirty.flags;
const auto& regs = maxwell3d.regs;
@ -1157,7 +1160,7 @@ void RasterizerOpenGL::SyncViewport() {
flags[Dirty::ClipControl] = false;
bool flip_y = false;
if (regs.viewport_transform[0].scale_y < 0.0) {
if (regs.viewport_transform[0].scale_y < 0.0f) {
flip_y = !flip_y;
}
if (regs.screen_y_control.y_negate != 0) {
@ -1527,17 +1530,9 @@ void RasterizerOpenGL::SyncPointState() {
flags[Dirty::PointSize] = false;
oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable);
oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable);
if (maxwell3d.regs.vp_point_size.enable) {
// By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled.
glEnable(GL_PROGRAM_POINT_SIZE);
return;
}
// Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid
// in OpenGL).
glPointSize(std::max(1.0f, maxwell3d.regs.point_size));
glDisable(GL_PROGRAM_POINT_SIZE);
}
void RasterizerOpenGL::SyncLineState() {
@ -1580,10 +1575,6 @@ void RasterizerOpenGL::SyncAlphaTest() {
flags[Dirty::AlphaTest] = false;
const auto& regs = maxwell3d.regs;
if (regs.alpha_test_enabled && regs.rt_control.count > 1) {
LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested");
}
if (regs.alpha_test_enabled) {
glEnable(GL_ALPHA_TEST);
glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref);


@ -7,12 +7,13 @@
#include <array>
#include <atomic>
#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <tuple>
#include <utility>
#include <boost/container/static_vector.hpp>
#include <glad/glad.h>
#include "common/common_types.h"
@ -23,16 +24,14 @@
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_fence_manager.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/utils.h"
#include "video_core/shader/async_shaders.h"
#include "video_core/textures/texture.h"
@ -51,14 +50,21 @@ class MemoryManager;
namespace OpenGL {
struct ScreenInfo;
struct DrawParameters;
struct ShaderEntries;
struct BindlessSSBO {
GLuint64EXT address;
GLsizei length;
GLsizei padding;
};
static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128);
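// A minimal sketch of how a caller might fill this 128-bit descriptor before
// copying it into a constant buffer (gpu_addr and size are hypothetical names,
// not taken from this diff):
//   const BindlessSSBO ssbo{static_cast<GLuint64EXT>(gpu_addr),
//                           static_cast<GLsizei>(size), 0};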
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu,
Core::Memory::Memory& cpu_memory, const Device& device,
ScreenInfo& screen_info, ProgramManager& program_manager,
StateTracker& state_tracker);
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_);
~RasterizerOpenGL() override;
void Draw(bool is_indexed, bool is_instanced) override;
@ -72,15 +78,18 @@ public:
void InvalidateRegion(VAddr addr, u64 size) override;
void OnCPUWrite(VAddr addr, u64 size) override;
void SyncGuestHost() override;
void UnmapMemory(VAddr addr, u64 size) override;
void SignalSemaphore(GPUVAddr addr, u32 value) override;
void SignalSyncPoint(u32 value) override;
void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
void FragmentBarrier() override;
void TiledCacheBarrier() override;
void FlushCommands() override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
@ -101,11 +110,14 @@ public:
}
private:
/// Configures the color and depth framebuffer states.
void ConfigureFramebuffers();
static constexpr size_t MAX_TEXTURES = 192;
static constexpr size_t MAX_IMAGES = 48;
static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES;
/// Configures the color and depth framebuffer for clearing.
void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
void BindComputeTextures(Shader* kernel);
void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
size_t& image_view_index, size_t& texture_index, size_t& image_index);
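// A sketch of the batched binding this declaration implies, assuming the
// ARB_multi_bind entry points are used (an assumption, not shown in this diff;
// texture_count and image_count are hypothetical names):
//   glBindTextures(base_texture, texture_count, texture_handles.data());
//   glBindSamplers(base_texture, texture_count, sampler_handles.data());
//   glBindImageTextures(base_image, image_count, image_handles.data());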
/// Configures the current constbuffers to use for the draw command.
void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
@ -126,26 +138,19 @@ private:
/// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size, GLuint64EXT* pointer);
size_t size, BindlessSSBO* ssbo);
/// Configures the current textures to use for the draw command.
void SetupDrawTextures(std::size_t stage_index, Shader* shader);
void SetupDrawTextures(const Shader* shader, size_t stage_index);
/// Configures the textures used in a compute shader.
void SetupComputeTextures(Shader* kernel);
/// Configures a texture.
void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry);
void SetupComputeTextures(const Shader* kernel);
/// Configures images in a graphics shader.
void SetupDrawImages(std::size_t stage_index, Shader* shader);
void SetupDrawImages(const Shader* shader, size_t stage_index);
/// Configures images in a compute shader.
void SetupComputeImages(Shader* shader);
/// Configures an image.
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
void SetupComputeImages(const Shader* shader);
/// Syncs the viewport and depth range to match the guest state
void SyncViewport();
@ -220,9 +225,6 @@ private:
/// End a transform feedback
void EndTransformFeedback();
/// Check for extensions that are not strictly required but are needed for correct emulation
void CheckExtensions();
std::size_t CalculateVertexArraysSize() const;
std::size_t CalculateIndexBufferSize() const;
@ -235,7 +237,7 @@ private:
GLintptr SetupIndexBuffer();
void SetupShaders(GLenum primitive_mode);
void SetupShaders();
Tegra::GPU& gpu;
Tegra::Engines::Maxwell3D& maxwell3d;
@ -247,19 +249,21 @@ private:
ProgramManager& program_manager;
StateTracker& state_tracker;
TextureCacheOpenGL texture_cache;
OGLStreamBuffer stream_buffer;
TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache;
ShaderCacheOpenGL shader_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
QueryCache query_cache;
OGLBufferCache buffer_cache;
FenceManagerOpenGL fence_manager;
VideoCommon::Shader::AsyncShaders async_shaders;
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
GLint vertex_binding = 0;
boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
std::array<GLuint, MAX_TEXTURES> texture_handles;
std::array<GLuint, MAX_IMAGES> image_handles;
std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
transform_feedback_buffers;
@ -273,7 +277,7 @@ private:
std::size_t current_cbuf = 0;
OGLBuffer unified_uniform_buffer;
/// Number of commands queued to the OpenGL driver. Reseted on flush.
/// Number of commands queued to the OpenGL driver. Reset on flush.
std::size_t num_queued_commands = 0;
u32 last_clip_distance_mask = 0;


@ -71,7 +71,7 @@ void OGLSampler::Create() {
return;
MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
glGenSamplers(1, &handle);
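// glCreateSamplers initializes the object immediately (direct state access),
// whereas glGenSamplers only reserves a name until its first bind; the DSA
// form lets glSamplerParameter* be used on the handle right away.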
glCreateSamplers(1, &handle);
}
void OGLSampler::Release() {


@ -1,52 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/logging/log.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
namespace OpenGL {
SamplerCacheOpenGL::SamplerCacheOpenGL() = default;
SamplerCacheOpenGL::~SamplerCacheOpenGL() = default;
OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const {
OGLSampler sampler;
sampler.Create();
const GLuint sampler_id{sampler.handle};
glSamplerParameteri(
sampler_id, GL_TEXTURE_MAG_FILTER,
MaxwellToGL::TextureFilterMode(tsc.mag_filter, Tegra::Texture::TextureMipmapFilter::None));
glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER,
MaxwellToGL::TextureFilterMode(tsc.min_filter, tsc.mipmap_filter));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(tsc.wrap_u));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(tsc.wrap_v));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(tsc.wrap_p));
glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE,
tsc.depth_compare_enabled == 1 ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE);
glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC,
MaxwellToGL::DepthCompareFunc(tsc.depth_compare_func));
glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, tsc.GetBorderColor().data());
glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, tsc.GetMinLod());
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, tsc.GetMaxLod());
glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, tsc.GetLodBias());
if (GLAD_GL_ARB_texture_filter_anisotropic) {
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy());
} else if (GLAD_GL_EXT_texture_filter_anisotropic) {
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy());
} else {
LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver");
}
return sampler;
}
GLuint SamplerCacheOpenGL::ToSamplerType(const OGLSampler& sampler) const {
return sampler.handle;
}
} // namespace OpenGL

Some files were not shown because too many files have changed in this diff.