Merge remote-tracking branch 'upstream/master' into int-flags

Levi
2021-01-10 22:09:56 -07:00
912 changed files with 91129 additions and 25508 deletions

src/video_core/CMakeLists.txt View File

@@ -5,8 +5,27 @@ add_library(video_core STATIC
buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
cdma_pusher.cpp
cdma_pusher.h
command_classes/codecs/codec.cpp
command_classes/codecs/codec.h
command_classes/codecs/h264.cpp
command_classes/codecs/h264.h
command_classes/codecs/vp9.cpp
command_classes/codecs/vp9.h
command_classes/codecs/vp9_types.h
command_classes/host1x.cpp
command_classes/host1x.h
command_classes/nvdec.cpp
command_classes/nvdec.h
command_classes/nvdec_common.h
command_classes/sync_manager.cpp
command_classes/sync_manager.h
command_classes/vic.cpp
command_classes/vic.h
compatible_formats.cpp
compatible_formats.h
delayed_destruction_ring.h
dirty_flags.cpp
dirty_flags.h
dma_pusher.cpp
@@ -29,6 +48,7 @@ add_library(video_core STATIC
engines/shader_bytecode.h
engines/shader_header.h
engines/shader_type.h
framebuffer_config.h
macro/macro.cpp
macro/macro.h
macro/macro_hle.cpp
@@ -40,10 +60,6 @@ add_library(video_core STATIC
fence_manager.h
gpu.cpp
gpu.h
gpu_asynch.cpp
gpu_asynch.h
gpu_synch.cpp
gpu_synch.h
gpu_thread.cpp
gpu_thread.h
guest_driver.cpp
@@ -66,14 +82,10 @@ add_library(video_core STATIC
renderer_opengl/gl_device.h
renderer_opengl/gl_fence_manager.cpp
renderer_opengl/gl_fence_manager.h
renderer_opengl/gl_framebuffer_cache.cpp
renderer_opengl/gl_framebuffer_cache.h
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer.h
renderer_opengl/gl_resource_manager.cpp
renderer_opengl/gl_resource_manager.h
renderer_opengl/gl_sampler_cache.cpp
renderer_opengl/gl_sampler_cache.h
renderer_opengl/gl_shader_cache.cpp
renderer_opengl/gl_shader_cache.h
renderer_opengl/gl_shader_decompiler.cpp
@@ -95,10 +107,62 @@ add_library(video_core STATIC
renderer_opengl/maxwell_to_gl.h
renderer_opengl/renderer_opengl.cpp
renderer_opengl/renderer_opengl.h
renderer_opengl/utils.cpp
renderer_opengl/utils.h
sampler_cache.cpp
sampler_cache.h
renderer_opengl/util_shaders.cpp
renderer_opengl/util_shaders.h
renderer_vulkan/blit_image.cpp
renderer_vulkan/blit_image.h
renderer_vulkan/fixed_pipeline_state.cpp
renderer_vulkan/fixed_pipeline_state.h
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp
renderer_vulkan/vk_blit_screen.h
renderer_vulkan/vk_buffer_cache.cpp
renderer_vulkan/vk_buffer_cache.h
renderer_vulkan/vk_command_pool.cpp
renderer_vulkan/vk_command_pool.h
renderer_vulkan/vk_compute_pass.cpp
renderer_vulkan/vk_compute_pass.h
renderer_vulkan/vk_compute_pipeline.cpp
renderer_vulkan/vk_compute_pipeline.h
renderer_vulkan/vk_descriptor_pool.cpp
renderer_vulkan/vk_descriptor_pool.h
renderer_vulkan/vk_fence_manager.cpp
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_memory_manager.cpp
renderer_vulkan/vk_memory_manager.h
renderer_vulkan/vk_pipeline_cache.cpp
renderer_vulkan/vk_pipeline_cache.h
renderer_vulkan/vk_query_cache.cpp
renderer_vulkan/vk_query_cache.h
renderer_vulkan/vk_rasterizer.cpp
renderer_vulkan/vk_rasterizer.h
renderer_vulkan/vk_resource_pool.cpp
renderer_vulkan/vk_resource_pool.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_shader_decompiler.cpp
renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_shader_util.cpp
renderer_vulkan/vk_shader_util.h
renderer_vulkan/vk_staging_buffer_pool.cpp
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
renderer_vulkan/vk_texture_cache.h
renderer_vulkan/vk_update_descriptor.cpp
renderer_vulkan/vk_update_descriptor.h
shader_cache.h
shader_notify.cpp
shader_notify.h
@@ -155,109 +219,71 @@ add_library(video_core STATIC
shader/transform_feedback.h
surface.cpp
surface.h
texture_cache/accelerated_swizzle.cpp
texture_cache/accelerated_swizzle.h
texture_cache/decode_bc4.cpp
texture_cache/decode_bc4.h
texture_cache/descriptor_table.h
texture_cache/formatter.cpp
texture_cache/formatter.h
texture_cache/format_lookup_table.cpp
texture_cache/format_lookup_table.h
texture_cache/surface_base.cpp
texture_cache/surface_base.h
texture_cache/surface_params.cpp
texture_cache/surface_params.h
texture_cache/surface_view.cpp
texture_cache/surface_view.h
texture_cache/image_base.cpp
texture_cache/image_base.h
texture_cache/image_info.cpp
texture_cache/image_info.h
texture_cache/image_view_base.cpp
texture_cache/image_view_base.h
texture_cache/image_view_info.cpp
texture_cache/image_view_info.h
texture_cache/render_targets.h
texture_cache/samples_helper.h
texture_cache/slot_vector.h
texture_cache/texture_cache.h
texture_cache/types.h
texture_cache/util.cpp
texture_cache/util.h
textures/astc.cpp
textures/astc.h
textures/convert.cpp
textures/convert.h
textures/decoders.cpp
textures/decoders.h
textures/texture.cpp
textures/texture.h
video_core.cpp
video_core.h
vulkan_common/vulkan_debug_callback.cpp
vulkan_common/vulkan_debug_callback.h
vulkan_common/vulkan_device.cpp
vulkan_common/vulkan_device.h
vulkan_common/vulkan_instance.cpp
vulkan_common/vulkan_instance.h
vulkan_common/vulkan_library.cpp
vulkan_common/vulkan_library.h
vulkan_common/vulkan_surface.cpp
vulkan_common/vulkan_surface.h
vulkan_common/vulkan_wrapper.cpp
vulkan_common/vulkan_wrapper.h
vulkan_common/nsight_aftermath_tracker.cpp
vulkan_common/nsight_aftermath_tracker.h
)
if (ENABLE_VULKAN)
target_sources(video_core PRIVATE
renderer_vulkan/fixed_pipeline_state.cpp
renderer_vulkan/fixed_pipeline_state.h
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/nsight_aftermath_tracker.cpp
renderer_vulkan/nsight_aftermath_tracker.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp
renderer_vulkan/vk_blit_screen.h
renderer_vulkan/vk_buffer_cache.cpp
renderer_vulkan/vk_buffer_cache.h
renderer_vulkan/vk_command_pool.cpp
renderer_vulkan/vk_command_pool.h
renderer_vulkan/vk_compute_pass.cpp
renderer_vulkan/vk_compute_pass.h
renderer_vulkan/vk_compute_pipeline.cpp
renderer_vulkan/vk_compute_pipeline.h
renderer_vulkan/vk_descriptor_pool.cpp
renderer_vulkan/vk_descriptor_pool.h
renderer_vulkan/vk_device.cpp
renderer_vulkan/vk_device.h
renderer_vulkan/vk_fence_manager.cpp
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_image.cpp
renderer_vulkan/vk_image.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_memory_manager.cpp
renderer_vulkan/vk_memory_manager.h
renderer_vulkan/vk_pipeline_cache.cpp
renderer_vulkan/vk_pipeline_cache.h
renderer_vulkan/vk_query_cache.cpp
renderer_vulkan/vk_query_cache.h
renderer_vulkan/vk_rasterizer.cpp
renderer_vulkan/vk_rasterizer.h
renderer_vulkan/vk_renderpass_cache.cpp
renderer_vulkan/vk_renderpass_cache.h
renderer_vulkan/vk_resource_pool.cpp
renderer_vulkan/vk_resource_pool.h
renderer_vulkan/vk_sampler_cache.cpp
renderer_vulkan/vk_sampler_cache.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_shader_decompiler.cpp
renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_shader_util.cpp
renderer_vulkan/vk_shader_util.h
renderer_vulkan/vk_staging_buffer_pool.cpp
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
renderer_vulkan/vk_texture_cache.h
renderer_vulkan/vk_update_descriptor.cpp
renderer_vulkan/vk_update_descriptor.h
renderer_vulkan/wrapper.cpp
renderer_vulkan/wrapper.h
)
endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
target_link_libraries(video_core PRIVATE glad xbyak)
if (MSVC)
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
else()
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
endif()
add_dependencies(video_core host_shaders)
target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
if (ENABLE_VULKAN)
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_compile_definitions(video_core PRIVATE HAS_VULKAN)
target_link_libraries(video_core PRIVATE sirit)
endif()
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_link_libraries(video_core PRIVATE sirit)
if (ENABLE_NSIGHT_AFTERMATH)
if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK})
@@ -271,7 +297,27 @@ if (ENABLE_NSIGHT_AFTERMATH)
endif()
if (MSVC)
target_compile_options(video_core PRIVATE /we4267)
target_compile_options(video_core PRIVATE
/we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data
/we4456 # Declaration of 'identifier' hides previous local declaration
/we4457 # Declaration of 'identifier' hides function parameter
/we4458 # Declaration of 'identifier' hides class member
/we4459 # Declaration of 'identifier' hides global declaration
/we4715 # 'function' : not all control paths return a value
)
else()
target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion -Werror=switch)
target_compile_options(video_core PRIVATE
-Werror=conversion
-Wno-error=sign-conversion
-Werror=pessimizing-move
-Werror=redundant-move
-Werror=shadow
-Werror=switch
-Werror=type-limits
-Werror=unused-variable
$<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable>
)
endif()
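The warning promotions above turn a family of silent bugs into build failures. As a minimal hypothetical sketch (not code from this commit), /we4456 on MSVC and -Werror=shadow on GCC/Clang both reject the hidden accumulator below, and -Werror=unused-variable catches it from a second angle:
#include <vector>
int Sum(const std::vector<int>& values) {
    int total = 0;
    for (const int value : values) {
        int total = value; // a new 'total' shadows the accumulator: shadow + unused-variable
    }
    return total; // would always be 0 if this were allowed to compile
}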

src/video_core/buffer_cache/buffer_block.h View File

@@ -4,34 +4,29 @@
#pragma once
#include <unordered_set>
#include <utility>
#include "common/alignment.h"
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace VideoCommon {
class BufferBlock {
public:
bool Overlaps(VAddr start, VAddr end) const {
[[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
return (cpu_addr < end) && (cpu_addr_end > start);
}
bool IsInside(VAddr other_start, VAddr other_end) const {
[[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
return cpu_addr <= other_start && other_end <= cpu_addr_end;
}
std::size_t Offset(VAddr in_addr) const {
[[nodiscard]] std::size_t Offset(VAddr in_addr) const {
return static_cast<std::size_t>(in_addr - cpu_addr);
}
VAddr CpuAddr() const {
[[nodiscard]] VAddr CpuAddr() const {
return cpu_addr;
}
VAddr CpuAddrEnd() const {
[[nodiscard]] VAddr CpuAddrEnd() const {
return cpu_addr_end;
}
@@ -40,11 +35,11 @@ public:
cpu_addr_end = new_addr + size;
}
std::size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return size;
}
u64 Epoch() const {
[[nodiscard]] u64 Epoch() const {
return epoch;
}
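For reference, Overlaps above is the standard half-open interval intersection test: [a, b) and [c, d) intersect exactly when a < d and c < b, so ranges that merely touch at an endpoint do not count. A self-contained sketch of the same predicate (illustrative only):
#include <cstdint>
constexpr bool Overlaps(std::uint64_t a_begin, std::uint64_t a_end,
                        std::uint64_t b_begin, std::uint64_t b_end) {
    return a_begin < b_end && b_begin < a_end;
}
static_assert(Overlaps(0, 10, 9, 20));   // one byte shared
static_assert(!Overlaps(0, 10, 10, 20)); // touching endpoints do not overlap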

src/video_core/buffer_cache/buffer_cache.h View File

@@ -118,20 +118,17 @@ public:
/// Prepares the buffer cache for data uploading
/// @param max_size Maximum number of bytes that will be uploaded
bool Map(std::size_t max_size) {
void Map(std::size_t max_size) {
std::lock_guard lock{mutex};
bool invalidated;
std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
buffer_offset = buffer_offset_base;
return invalidated;
}
/// Finishes the upload stream
void Unmap() {
std::lock_guard lock{mutex};
stream_buffer->Unmap(buffer_offset - buffer_offset_base);
stream_buffer.Unmap(buffer_offset - buffer_offset_base);
}
/// Function called at the end of each frame, intended for deferred operations
@@ -261,9 +258,9 @@ public:
protected:
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
std::unique_ptr<StreamBuffer> stream_buffer_)
StreamBuffer& stream_buffer_)
: rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {}
stream_buffer{stream_buffer_} {}
~BufferCache() = default;
@@ -441,7 +438,7 @@ private:
buffer_ptr += size;
buffer_offset += size;
return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
}
void AlignBuffer(std::size_t alignment) {
@@ -545,7 +542,7 @@ private:
bool IsRegionWritten(VAddr start, VAddr end) const {
const u64 page_end = end >> WRITE_PAGE_BIT;
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
if (written_pages.count(page_start) > 0) {
if (written_pages.contains(page_start)) {
return true;
}
}
@@ -567,9 +564,7 @@ private:
VideoCore::RasterizerInterface& rasterizer;
Tegra::MemoryManager& gpu_memory;
Core::Memory::Memory& cpu_memory;
std::unique_ptr<StreamBuffer> stream_buffer;
BufferType stream_buffer_handle;
StreamBuffer& stream_buffer;
u8* buffer_ptr = nullptr;
u64 buffer_offset = 0;
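The written-page tracking queried by IsRegionWritten above reduces to a page-granular set membership test, with contains() as the C++20 spelling of count(page) > 0. A standalone sketch, with the WRITE_PAGE_BIT value assumed purely for illustration:
#include <cstdint>
#include <unordered_set>
constexpr std::uint64_t WRITE_PAGE_BIT = 11; // 2 KiB pages, assumed for this sketch
bool IsRegionWritten(const std::unordered_set<std::uint64_t>& written_pages,
                     std::uint64_t start, std::uint64_t end) {
    const std::uint64_t page_end = end >> WRITE_PAGE_BIT;
    for (std::uint64_t page = start >> WRITE_PAGE_BIT; page <= page_end; ++page) {
        if (written_pages.contains(page)) { // C++20, equivalent to count(page) > 0
            return true;
        }
    }
    return false;
}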

src/video_core/buffer_cache/map_interval.h View File

@@ -84,9 +84,10 @@ private:
void FillFreeList(Chunk& chunk);
std::vector<MapInterval*> free_list;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
Chunk first_chunk;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
};
} // namespace VideoCommon
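The member reorder above matters because non-static data members are initialized in declaration order, so an initializer should only refer to members declared before it. In this diff only an address is taken, which is benign, but the reorder keeps the initializers dependency-ordered. A hypothetical illustration of the underlying rule:
struct Bad {
    int copy = value; // runs first and reads 'value' before its initializer below
    int value = 42;
};
struct Good {
    int value = 42;
    int copy = value; // declaration order now matches the data dependency
};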

src/video_core/cdma_pusher.cpp View File

@@ -0,0 +1,170 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "command_classes/host1x.h"
#include "command_classes/nvdec.h"
#include "command_classes/vic.h"
#include "common/bit_util.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
CDmaPusher::CDmaPusher(GPU& gpu_)
: gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)),
vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
host1x_processor(std::make_unique<Host1x>(gpu)),
sync_manager(std::make_unique<SyncptIncrManager>(gpu)) {}
CDmaPusher::~CDmaPusher() = default;
void CDmaPusher::Push(ChCommandHeaderList&& entries) {
cdma_queue.push(std::move(entries));
}
void CDmaPusher::DispatchCalls() {
while (!cdma_queue.empty()) {
Step();
}
}
void CDmaPusher::Step() {
const auto entries{cdma_queue.front()};
cdma_queue.pop();
std::vector<u32> values(entries.size());
std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
for (const u32 value : values) {
if (mask != 0) {
const u32 lbs = Common::CountTrailingZeroes32(mask);
mask &= ~(1U << lbs);
ExecuteCommand(static_cast<u32>(offset + lbs), value);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(static_cast<u32>(offset), value);
if (incrementing) {
++offset;
}
continue;
}
const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value & 0x3f;
offset = (value >> 16) & 0xfff;
current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value & 0xffff;
offset = (value >> 16) & 0xfff;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value & 0xffff;
offset = (value >> 16) & 0xfff;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value & 0xfff;
offset = (value >> 16) & 0xfff;
ExecuteCommand(static_cast<u32>(offset), data);
break;
}
default:
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
break;
}
}
}
void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
switch (current_class) {
case ChClassId::NvDec:
ThiStateWrite(nvdec_thi_state, state_offset, {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
static_cast<u32>(nvdec_thi_state.method_0));
nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0),
{data});
break;
default:
break;
}
break;
case ChClassId::GraphicsVic:
ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
static_cast<u32>(vic_thi_state.method_0), data);
vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data});
break;
default:
break;
}
break;
case ChClassId::Host1x:
// This device is mainly for syncpoint synchronization
LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data});
break;
default:
UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
break;
}
}
void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset,
const std::vector<u32>& arguments) {
u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size());
}
} // namespace Tegra
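Step above consumes raw 32-bit submission words: the mode sits in the top nibble, the register offset in bits 16 through 27, and the count, mask, or immediate payload in the low 16 bits. A standalone decode of one made-up example word:
#include <cstdint>
#include <cstdio>
int main() {
    const std::uint32_t word = 0x20100040; // assumed example value
    const unsigned mode = (word >> 28) & 0xf;     // 2 -> NonIncrementing
    const unsigned offset = (word >> 16) & 0xfff; // 0x010
    const unsigned count = word & 0xffff;         // 64 register writes follow
    std::printf("mode=%u offset=0x%03x count=%u\n", mode, offset, count);
}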

src/video_core/cdma_pusher.h View File

@@ -0,0 +1,136 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include <queue>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Nvdec;
class Vic;
class Host1x;
enum class ChSubmissionMode : u32 {
SetClass = 0,
Incrementing = 1,
NonIncrementing = 2,
Mask = 3,
Immediate = 4,
Restart = 5,
Gather = 6,
};
enum class ChClassId : u32 {
NoClass = 0x0,
Host1x = 0x1,
VideoEncodeMpeg = 0x20,
VideoEncodeNvEnc = 0x21,
VideoStreamingVi = 0x30,
VideoStreamingIsp = 0x32,
VideoStreamingIspB = 0x34,
VideoStreamingViI2c = 0x36,
GraphicsVic = 0x5d,
Graphics3D = 0x60,
GraphicsGpu = 0x61,
Tsec = 0xe0,
TsecB = 0xe1,
NvJpg = 0xc0,
NvDec = 0xf0
};
enum class ChMethod : u32 {
Empty = 0,
SetMethod = 0x10,
SetData = 0x11,
};
union ChCommandHeader {
u32 raw;
BitField<0, 16, u32> value;
BitField<16, 12, ChMethod> method_offset;
BitField<28, 4, ChSubmissionMode> submission_mode;
};
static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
struct ChCommand {
ChClassId class_id{};
int method_offset{};
std::vector<u32> arguments;
};
using ChCommandHeaderList = std::vector<ChCommandHeader>;
using ChCommandList = std::vector<ChCommand>;
struct ThiRegisters {
u32_le increment_syncpt{};
INSERT_PADDING_WORDS(1);
u32_le increment_syncpt_error{};
u32_le ctx_switch_incremement_syncpt{};
INSERT_PADDING_WORDS(4);
u32_le ctx_switch{};
INSERT_PADDING_WORDS(1);
u32_le ctx_syncpt_eof{};
INSERT_PADDING_WORDS(5);
u32_le method_0{};
u32_le method_1{};
INSERT_PADDING_WORDS(12);
u32_le int_status{};
u32_le int_mask{};
};
enum class ThiMethod : u32 {
IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
};
class CDmaPusher {
public:
explicit CDmaPusher(GPU& gpu_);
~CDmaPusher();
/// Push NVDEC command buffer entries into queue
void Push(ChCommandHeaderList&& entries);
/// Process queued command buffer entries
void DispatchCalls();
/// Process one queue element
void Step();
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 state_offset, u32 data);
private:
/// Writes the argument values to the ThiRegisters member at the specified offset
void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments);
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
std::unique_ptr<Tegra::Vic> vic_processor;
std::unique_ptr<Tegra::Host1x> host1x_processor;
std::unique_ptr<SyncptIncrManager> sync_manager;
ChClassId current_class{};
ThiRegisters vic_thi_state{};
ThiRegisters nvdec_thi_state{};
s32 count{};
s32 offset{};
s32 mask{};
bool incrementing{};
// Queue of command lists to be processed
std::queue<ChCommandHeaderList> cdma_queue;
};
} // namespace Tegra
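ThiMethod above derives each method ID from the register layout itself via offsetof, so the enum cannot drift from ThiRegisters. Counting words through the struct (named fields plus the padding runs) puts method_0 at word 16; a compile-time cross-check in the same vein could look like this sketch:
static_assert(static_cast<u32>(ThiMethod::IncSyncpt) == 0);
static_assert(static_cast<u32>(ThiMethod::SetMethod0) == 16);
static_assert(static_cast<u32>(ThiMethod::SetMethod1) == 17);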

src/video_core/command_classes/codecs/codec.cpp View File

@@ -0,0 +1,129 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <fstream>
#include <vector>
#include "common/assert.h"
#include "video_core/command_classes/codecs/codec.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
extern "C" {
#include <libavutil/opt.h>
}
namespace Tegra {
void AVFrameDeleter(AVFrame* ptr) {
av_frame_unref(ptr);
av_free(ptr);
}
Codec::Codec(GPU& gpu_)
: gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
Codec::~Codec() {
if (!initialized) {
return;
}
// Free libav memory
AVFrame* av_frame{nullptr};
avcodec_send_packet(av_codec_ctx, nullptr);
av_frame = av_frame_alloc();
avcodec_receive_frame(av_codec_ctx, av_frame);
avcodec_flush_buffers(av_codec_ctx);
av_frame_unref(av_frame);
av_free(av_frame);
avcodec_close(av_codec_ctx);
}
void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec);
current_codec = codec;
}
void Codec::StateWrite(u32 offset, u64 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
std::memcpy(state_offset, &arguments, sizeof(u64));
}
void Codec::Decode() {
bool is_first_frame = false;
if (!initialized) {
if (current_codec == NvdecCommon::VideoCodec::H264) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
} else {
LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec);
return;
}
av_codec_ctx = avcodec_alloc_context3(av_codec);
av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
// TODO(ameerj): libavcodec gpu hw acceleration
const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
if (av_error < 0) {
LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
avcodec_close(av_codec_ctx);
return;
}
initialized = true;
is_first_frame = true;
}
bool vp9_hidden_frame = false;
AVPacket packet{};
av_init_packet(&packet);
std::vector<u8> frame_data;
if (current_codec == NvdecCommon::VideoCodec::H264) {
frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
frame_data = vp9_decoder->ComposeFrameHeader(state);
vp9_hidden_frame = vp9_decoder->WasFrameHidden();
}
packet.data = frame_data.data();
packet.size = static_cast<int>(frame_data.size());
avcodec_send_packet(av_codec_ctx, &packet);
if (!vp9_hidden_frame) {
// Only receive/store visible frames
AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
avcodec_receive_frame(av_codec_ctx, frame.get());
av_frames.push(std::move(frame));
// Limit queue to 10 frames. Workaround for ZLA decode and queue spam
if (av_frames.size() > 10) {
av_frames.pop();
}
}
}
AVFramePtr Codec::GetCurrentFrame() {
// Sometimes VIC will request more frames than have been decoded.
// In this case, return a nullptr and don't overwrite previous frame data
if (av_frames.empty()) {
return AVFramePtr{nullptr, AVFrameDeleter};
}
AVFramePtr frame = std::move(av_frames.front());
av_frames.pop();
return frame;
}
NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
return current_codec;
}
} // namespace Tegra
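Codec::Decode above follows FFmpeg's send/receive decoding model: push one compressed packet in, then drain whatever frames the decoder produces. A minimal standalone shape of that exchange (error handling elided; av_init_packet matches the FFmpeg 4.x era usage in this file, and all names here are illustrative):
#include <cstdint>
extern "C" {
#include <libavcodec/avcodec.h>
}
bool DecodeOne(AVCodecContext* ctx, std::uint8_t* data, int size, AVFrame* out) {
    AVPacket packet{};
    av_init_packet(&packet);
    packet.data = data;
    packet.size = size;
    if (avcodec_send_packet(ctx, &packet) < 0) {
        return false; // decoder rejected the packet
    }
    return avcodec_receive_frame(ctx, out) == 0; // 0 means one frame was produced
}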

src/video_core/command_classes/codecs/codec.h View File

@@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <queue>
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
extern "C" {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <libavcodec/avcodec.h>
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
namespace Tegra {
class GPU;
struct VicRegisters;
void AVFrameDeleter(AVFrame* ptr);
using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
namespace Decoder {
class H264;
class VP9;
} // namespace Decoder
class Codec {
public:
explicit Codec(GPU& gpu);
~Codec();
/// Sets NVDEC video stream codec
void SetTargetCodec(NvdecCommon::VideoCodec codec);
/// Populate NvdecRegisters state with argument value at the provided offset
void StateWrite(u32 offset, u64 arguments);
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
/// Returns next decoded frame
[[nodiscard]] AVFramePtr GetCurrentFrame();
/// Returns the value of current_codec
[[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;
private:
bool initialized{};
NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
AVCodec* av_codec{nullptr};
AVCodecContext* av_codec_ctx{nullptr};
GPU& gpu;
std::unique_ptr<Decoder::H264> h264_decoder;
std::unique_ptr<Decoder::VP9> vp9_decoder;
NvdecCommon::NvdecRegisters state{};
std::queue<AVFramePtr> av_frames{};
};
} // namespace Tegra

src/video_core/command_classes/codecs/h264.cpp View File

@@ -0,0 +1,293 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <array>
#include "common/bit_util.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// ZigZag LUTs from libavcodec.
constexpr std::array<u8, 64> zig_zag_direct{
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48,
41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
};
constexpr std::array<u8, 16> zig_zag_scan{
0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
};
} // Anonymous namespace
H264::H264(GPU& gpu_) : gpu(gpu_) {}
H264::~H264() = default;
const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state,
bool is_first_frame) {
H264DecoderContext context{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff);
if (!is_first_frame && frame_number != 0) {
frame.resize(context.frame_data_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
} else {
/// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(7, 5);
writer.WriteU(100, 8);
writer.WriteU(0, 8);
writer.WriteU(31, 8);
writer.WriteUe(0);
const auto chroma_format_idc =
static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3);
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
}
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
writer.WriteBit(false); // Scaling matrix present flag
const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3);
writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf));
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt);
} else if (order_cnt_type == 1) {
writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units /
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
writer.WriteUe(16);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
if (!context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0);
}
writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
writer.End();
// H264 PPS
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(8, 5);
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
writer.WriteBit(false);
writer.WriteUe(0);
writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0);
writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2);
s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f);
pic_init_qp = (pic_init_qp << 26) >> 26;
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f);
chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset);
writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0);
writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0);
writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0);
writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true);
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
const auto matrix_x4 =
std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end());
writer.WriteScalingList(matrix_x4, index * 16, 16);
}
if (context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(),
context.scaling_matrix_8.end());
writer.WriteScalingList(matrix_x8, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f);
chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
frame.resize(encoded_header.size() + context.frame_data_size);
std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
frame.data() + encoded_header.size(),
context.frame_data_size);
}
return frame;
}
H264BitWriter::H264BitWriter() = default;
H264BitWriter::~H264BitWriter() = default;
void H264BitWriter::WriteU(s32 value, s32 value_sz) {
WriteBits(value, value_sz);
}
void H264BitWriter::WriteSe(s32 value) {
WriteExpGolombCodedInt(value);
}
void H264BitWriter::WriteUe(u32 value) {
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::End() {
WriteBit(true);
Flush();
}
void H264BitWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
std::vector<u8> scan(count);
if (count == 16) {
std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
} else {
std::memcpy(scan.data(), zig_zag_direct.data(), scan.size());
}
u8 last_scale = 8;
for (s32 index = 0; index < count; index++) {
const u8 value = list[start + scan[index]];
const s32 delta_scale = static_cast<s32>(value - last_scale);
WriteSe(delta_scale);
last_scale = value;
}
}
std::vector<u8>& H264BitWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& H264BitWriter::GetByteArray() const {
return byte_array;
}
void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
s32 value_pos = 0;
s32 remaining = bit_count;
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free_bits = GetFreeBufferBits();
if (copy_size > free_bits) {
copy_size = free_bits;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
const s32 sign = value <= 0 ? 0 : 1;
if (value < 0) {
value = -value;
}
value = (value << 1) - sign;
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1));
WriteBits(1, size);
value -= (1U << (size - 1)) - 1;
WriteBits(static_cast<s32>(value), size - 1);
}
s32 H264BitWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void H264BitWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
} // namespace Tegra::Decoder
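WriteExpGolombCodedUInt above emits the ue(v) code from clause 9.1 of the H.264 spec: for a value v, write bitlength(v+1) - 1 zeros, then v+1 itself, most significant bit first. An equivalent standalone derivation for cross-checking (C++20 <bit>; valid for v below 2^32 - 1):
#include <bit>
#include <cstdint>
#include <string>
std::string ExpGolomb(std::uint32_t v) {
    const int size = 32 - std::countl_zero(v + 1); // bit length of v+1
    std::string bits(size - 1, '0');               // size-1 leading zeros
    for (int i = size - 1; i >= 0; --i) {
        bits += ((v + 1) >> i) & 1 ? '1' : '0';    // v+1 in size bits, MSB first
    }
    return bits; // v=0 -> "1", v=4 -> "00101"
}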

src/video_core/command_classes/codecs/h264.h View File

@@ -0,0 +1,118 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
namespace Decoder {
class H264BitWriter {
public:
H264BitWriter();
~H264BitWriter();
/// The following Write methods are based on clause 9.1 in the H.264 specification.
/// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
void WriteU(s32 value, s32 value_sz);
void WriteSe(s32 value);
void WriteUe(u32 value);
/// Finalize the bitstream
void End();
/// Appends a bit to the stream, with a value equal to the state parameter
void WriteBit(bool state);
/// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
/// Writes the scaling matrices of the stream
void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
/// Return the bitstream as a vector.
[[nodiscard]] std::vector<u8>& GetByteArray();
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
void WriteBits(s32 value, s32 bit_count);
void WriteExpGolombCodedInt(s32 value);
void WriteExpGolombCodedUInt(u32 value);
[[nodiscard]] s32 GetFreeBufferBits();
void Flush();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class H264 {
public:
explicit H264(GPU& gpu);
~H264();
/// Compose the H264 header of the frame for FFmpeg decoding
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false);
private:
struct H264ParameterSet {
u32 log2_max_pic_order_cnt{};
u32 delta_pic_order_always_zero_flag{};
u32 frame_mbs_only_flag{};
u32 pic_width_in_mbs{};
u32 pic_height_in_map_units{};
INSERT_PADDING_WORDS(1);
u32 entropy_coding_mode_flag{};
u32 bottom_field_pic_order_flag{};
u32 num_refidx_l0_default_active{};
u32 num_refidx_l1_default_active{};
u32 deblocking_filter_control_flag{};
u32 redundant_pic_count_flag{};
u32 transform_8x8_mode_flag{};
INSERT_PADDING_WORDS(9);
u64 flags{};
u32 frame_number{};
u32 frame_number2{};
};
static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
struct H264DecoderContext {
INSERT_PADDING_BYTES(0x48);
u32 frame_data_size{};
INSERT_PADDING_BYTES(0xc);
H264ParameterSet h264_parameter_set{};
INSERT_PADDING_BYTES(0x100);
std::array<u8, 0x60> scaling_matrix_4;
std::array<u8, 0x80> scaling_matrix_8;
};
static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
std::vector<u8> frame;
GPU& gpu;
};
} // namespace Decoder
} // namespace Tegra

src/video_core/command_classes/codecs/vp9.cpp View File

@@ -0,0 +1,989 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring> // for std::memcpy
#include <numeric>
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// Default compressed header probabilities once frame context resets
constexpr Vp9EntropyProbs default_probs{
.y_mode_prob{
65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78,
173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29,
},
.partition_prob{
199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0,
174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0,
177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0,
222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0,
},
.coef_probs{
195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32,
40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47,
69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31,
102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29,
156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31,
191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33,
41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26,
59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34,
114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34,
149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42,
214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28,
89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33,
108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38,
124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37,
141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47,
229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57,
83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39,
99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47,
127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52,
157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61,
125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29,
31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45,
23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24,
29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23,
57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29,
202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57,
33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21,
30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20,
32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22,
57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32,
212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25,
71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42,
64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30,
65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30,
65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34,
225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84,
75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28,
73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29,
62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32,
61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41,
7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18,
43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27,
23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23,
18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23,
15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20,
19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52,
35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28,
28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20,
20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20,
13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24,
211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28,
71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29,
57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27,
47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26,
26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28,
233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97,
85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30,
65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29,
40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26,
16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31,
17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16,
50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20,
40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17,
30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18,
10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17,
36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61,
85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30,
72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27,
55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22,
15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23,
181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23,
62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19,
55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16,
49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17,
13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23,
197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137,
111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91,
104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102,
84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79,
43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6,
},
.switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144},
.inter_mode_prob{
2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94,
66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0,
},
.intra_inter_prob{9, 102, 187, 225},
.comp_inter_prob{9, 102, 187, 225, 0},
.single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247},
.comp_ref_prob{50, 126, 123, 221, 226},
.tx_32x32_prob{3, 136, 37, 5, 52, 13},
.tx_16x16_prob{20, 152, 15, 101},
.tx_8x8_prob{100, 66},
.skip_probs{192, 128, 64},
.joints{32, 64, 96},
.sign{128, 128},
.classes{
224, 144, 192, 168, 192, 176, 192, 198, 198, 245,
216, 128, 176, 160, 176, 176, 192, 198, 198, 208,
},
.class_0{216, 208},
.prob_bits{
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
},
.class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64},
.fr{64, 96, 64, 64, 96, 64},
.class_0_hp{160, 160},
.high_precision{128, 128},
};
constexpr std::array<s32, 256> norm_lut{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
constexpr std::array<s32, 254> map_lut{
20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124,
125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177,
178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194,
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 19,
};
// 6.2.14 Tile size calculation
[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 min_log2 = 0;
while ((64 << min_log2) < sb64_cols) {
min_log2++;
}
return min_log2;
}
[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 max_log2 = 1;
while ((sb64_cols >> max_log2) >= 4) {
max_log2++;
}
return max_log2 - 1;
}
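// Worked example for the two helpers above (illustrative, 1920-wide frame
// assumed): sb64_cols = (1920 + 63) / 64 = 30. For the minimum, 64 << 0 is
// already >= 30, so no tile-column bits are required. For the maximum,
// 30 >> 1 = 15 and 30 >> 2 = 7 are both >= 4 while 30 >> 3 = 3 is not, so the
// loop stops at 3 and the result is 3 - 1 = 2, i.e. at most four tile columns.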
// Recenters probability. Based on section 6.3.6 of VP9 Specification
[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) {
if (new_prob > old_prob * 2) {
return new_prob;
}
if (new_prob >= old_prob) {
return (new_prob - old_prob) * 2;
}
return (old_prob - new_prob) * 2 - 1;
}
// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) {
new_prob--;
old_prob--;
std::size_t index{};
if (old_prob * 2 <= 0xff) {
index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1));
} else {
index = static_cast<std::size_t>(
std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1));
}
return map_lut[index];
}
} // Anonymous namespace
VP9::VP9(GPU& gpu_) : gpu{gpu_} {}
VP9::~VP9() = default;
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
WriteProbabilityDelta(writer, new_prob, old_prob);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); ++offset) {
WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) {
WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]);
WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]);
WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]);
}
}
void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const int delta = RemapProbability(new_prob, old_prob);
EncodeTermSubExp(writer, delta);
}
void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) {
if (WriteLessThan(writer, value, 16)) {
writer.Write(value, 4);
} else if (WriteLessThan(writer, value, 32)) {
writer.Write(value - 16, 4);
} else if (WriteLessThan(writer, value, 64)) {
writer.Write(value - 32, 5);
} else {
value -= 64;
constexpr s32 size = 8;
const s32 mask = (1 << size) - 191;
const s32 delta = value - mask;
if (delta < 0) {
writer.Write(value, size - 1);
} else {
writer.Write(delta / 2 + mask, size - 1);
writer.Write(delta & 1, 1);
}
}
}
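// Illustrative traces through EncodeTermSubExp (each WriteLessThan emits one
// selector bit first): value 10 falls in the first bucket and is written in 4
// bits. Value 100 takes three selectors, becomes 36 after the -= 64, and with
// mask = (1 << 8) - 191 = 65 the delta 36 - 65 is negative, so 36 goes out in
// 7 bits. Value 200 becomes 136, delta = 71, so 71 / 2 + 65 = 100 is written
// in 7 bits followed by the low delta bit (1).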
bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
const bool is_lt = value < test;
writer.Write(!is_lt);
return is_lt;
}
void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob) {
constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3;
const auto needs_update = [&](u32 base_index) {
return !std::equal(new_prob.begin() + base_index,
new_prob.begin() + base_index + block_bytes,
old_prob.begin() + base_index);
};
for (u32 block_index = 0; block_index < 4; block_index++) {
const u32 base_index = block_index * block_bytes;
const bool update = needs_update(base_index);
writer.Write(update);
if (update) {
u32 index = base_index;
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 6; k++) {
for (s32 l = 0; l < 6; l++) {
if (k != 0 || l < 3) {
WriteProbabilityUpdate(writer, new_prob[index + 0],
old_prob[index + 0]);
WriteProbabilityUpdate(writer, new_prob[index + 1],
old_prob[index + 1]);
WriteProbabilityUpdate(writer, new_prob[index + 2],
old_prob[index + 2]);
}
index += 3;
}
}
}
}
}
if (block_index == static_cast<u32>(tx_mode)) {
break;
}
}
}
void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
writer.Write(new_prob >> 1, 7);
}
}
Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
PictureInfo picture_info{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
Vp9PictureInfo vp9_info = picture_info.Convert();
InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current. It may be worthwhile to track the updates done here
// to avoid buffering frame data needed for reference frame updating in the header composition.
std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64));
return vp9_info;
}
void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
EntropyProbs entropy{};
gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
entropy.Convert(dst);
}
Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) {
Vp9FrameContainer current_frame{};
{
gpu.SyncGuestHost();
current_frame.info = GetVp9PictureInfo(state);
current_frame.bit_stream.resize(current_frame.info.bitstream_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
current_frame.info.bitstream_size);
}
// Buffer two frames, saving the last show frame info
if (!next_next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_next_frame.info;
current_frame.bit_stream = std::move(next_next_frame.bit_stream);
next_next_frame = std::move(temp);
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp2{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_frame.info;
current_frame.bit_stream = std::move(next_frame.bit_stream);
next_frame = std::move(temp2);
} else {
next_frame.info = current_frame.info;
next_frame.bit_stream = std::move(current_frame.bit_stream);
}
} else {
next_next_frame.info = current_frame.info;
next_next_frame.bit_stream = std::move(current_frame.bit_stream);
}
return current_frame;
}
std::vector<u8> VP9::ComposeCompressedHeader() {
VpxRangeEncoder writer{};
const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
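// write_tx_mode() in the spec: modes 0-2 are coded directly in two bits, while
// Allow32X32 (3) and TxModeSelect (4) share the two-bit prefix 3 followed by a
// select bit.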
if (!current_frame_info.lossless) {
if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
writer.Write(3, 2);
writer.Write(current_frame_info.transform_mode == 4);
} else {
writer.Write(current_frame_info.transform_mode, 2);
}
}
if (current_frame_info.transform_mode == 4) {
// tx_mode_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob,
prev_frame_probs.tx_8x8_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob,
prev_frame_probs.tx_16x16_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob,
prev_frame_probs.tx_32x32_prob);
if (update_probs) {
prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob;
prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob;
prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob;
}
}
// read_coef_probs() in the spec
WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode,
current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs);
// read_skip_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs,
prev_frame_probs.skip_probs);
if (update_probs) {
prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs;
prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs;
}
if (!current_frame_info.intra_only) {
// read_inter_probs() in the spec
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob,
prev_frame_probs.inter_mode_prob);
if (current_frame_info.interp_filter == 4) {
// read_interp_filter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob,
prev_frame_probs.switchable_interp_prob);
if (update_probs) {
prev_frame_probs.switchable_interp_prob =
current_frame_info.entropy.switchable_interp_prob;
}
}
// read_is_inter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob,
prev_frame_probs.intra_inter_prob);
// frame_reference_mode() in the spec
if ((current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[2] & 1) ||
(current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[3] & 1)) {
if (current_frame_info.reference_mode >= 1) {
writer.Write(1, 1);
writer.Write(current_frame_info.reference_mode == 2);
} else {
writer.Write(0, 1);
}
}
// frame_reference_mode_probs() in the spec
if (current_frame_info.reference_mode == 2) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob,
prev_frame_probs.comp_inter_prob);
if (update_probs) {
prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob;
}
}
if (current_frame_info.reference_mode != 1) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob,
prev_frame_probs.single_ref_prob);
if (update_probs) {
prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob;
}
}
if (current_frame_info.reference_mode != 0) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob,
prev_frame_probs.comp_ref_prob);
if (update_probs) {
prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob;
}
}
// read_y_mode_probs
for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size();
++index) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index],
prev_frame_probs.y_mode_prob[index]);
}
// read_partition_probs
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob,
prev_frame_probs.partition_prob);
// mv_probs
for (s32 i = 0; i < 3; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i],
prev_frame_probs.joints[i]);
}
if (update_probs) {
prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob;
prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob;
prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob;
prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob;
prev_frame_probs.joints = current_frame_info.entropy.joints;
}
for (s32 i = 0; i < 2; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i],
prev_frame_probs.sign[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index],
prev_frame_probs.classes[index]);
}
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i],
prev_frame_probs.class_0[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
prev_frame_probs.prob_bits[index]);
}
}
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 3; k++) {
const int index = i * 2 * 3 + j * 3 + k;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
prev_frame_probs.class_0_fr[index]);
}
}
for (s32 j = 0; j < 3; j++) {
const int index = i * 3 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
prev_frame_probs.fr[index]);
}
}
if (current_frame_info.allow_high_precision_mv) {
for (s32 index = 0; index < 2; index++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
prev_frame_probs.class_0_hp[index]);
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
prev_frame_probs.high_precision[index]);
}
}
// save previous probs
if (update_probs) {
prev_frame_probs.sign = current_frame_info.entropy.sign;
prev_frame_probs.classes = current_frame_info.entropy.classes;
prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
prev_frame_probs.fr = current_frame_info.entropy.fr;
prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
}
}
writer.End();
return writer.GetBuffer();
}
VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
VpxBitStreamWriter uncomp_writer{};
uncomp_writer.WriteU(2, 2); // Frame marker.
uncomp_writer.WriteU(0, 2); // Profile.
uncomp_writer.WriteBit(false); // Show existing frame.
uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // Frame type (0 = key frame).
uncomp_writer.WriteBit(current_frame_info.show_frame); // Show frame.
uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // Error resilient mode.
if (current_frame_info.is_key_frame) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(0, 3); // Color space.
uncomp_writer.WriteU(0, 1); // Color range.
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
// Reset context
prev_frame_probs = default_probs;
swap_next_golden = false;
loop_filter_ref_deltas.fill(0);
loop_filter_mode_deltas.fill(0);
// Allow frame offsets to stabilize before checking for golden frames.
grace_period = 4;
// On key frames, all frame slots are set to the current frame,
// so the value of the selected slot doesn't really matter.
frame_ctxs.fill({current_frame_number, false, default_probs});
// intra only, meaning the frame can be recreated with no other references
current_frame_info.intra_only = true;
} else {
if (!current_frame_info.show_frame) {
uncomp_writer.WriteBit(current_frame_info.intra_only);
if (!current_frame_info.last_frame_was_key) {
swap_next_golden = !swap_next_golden;
}
} else {
current_frame_info.intra_only = false;
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteU(0, 2); // Reset frame context.
}
// Last, Golden, Altref frames
std::array<s32, 3> ref_frame_index{0, 1, 2};
// When the next frame is hidden, the altref and golden references are swapped.
if (swap_next_golden) {
ref_frame_index = std::array<s32, 3>{0, 2, 1};
}
// update Last Frame
u64 refresh_frame_flags = 1;
// The golden frame may refresh; detected when the next frame's reference offsets change.
bool golden_refresh = false;
if (grace_period <= 0) {
for (s32 index = 1; index < 3; ++index) {
if (current_frame_info.frame_offsets[index] !=
next_frame.info.frame_offsets[index]) {
current_frame_info.refresh_frame[index] = true;
golden_refresh = true;
grace_period = 3;
}
}
}
if (current_frame_info.show_frame &&
(!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
// Update golden frame
refresh_frame_flags = swap_next_golden ? 2 : 4;
}
if (!current_frame_info.show_frame) {
// Update altref
refresh_frame_flags = swap_next_golden ? 2 : 4;
} else if (golden_refresh) {
refresh_frame_flags = 3;
}
if (current_frame_info.intra_only) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
} else {
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
for (s32 index = 1; index < 4; index++) {
uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
}
uncomp_writer.WriteBit(true); // Frame size with refs.
uncomp_writer.WriteBit(false); // Render and frame size different.
uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
if (current_frame_info.interp_filter != 4) {
uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
}
}
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteBit(true); // Refresh frame context (TODO: where does this info come from?).
uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
}
int frame_ctx_idx = 0;
if (!current_frame_info.show_frame) {
frame_ctx_idx = 1;
}
uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
prev_frame_probs =
frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
uncomp_writer.WriteU(current_frame_info.first_level, 6);
uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled);
if (current_frame_info.mode_ref_delta_enabled) {
// check if ref deltas are different, update accordingly
std::array<bool, 4> update_loop_filter_ref_deltas;
std::array<bool, 2> update_loop_filter_mode_deltas;
bool loop_filter_delta_update = false;
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
const s8 old_deltas = loop_filter_ref_deltas[index];
const s8 new_deltas = current_frame_info.ref_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_ref_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
const s8 old_deltas = loop_filter_mode_deltas[index];
const s8 new_deltas = current_frame_info.mode_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_mode_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
uncomp_writer.WriteBit(loop_filter_delta_update);
if (loop_filter_delta_update) {
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]);
if (update_loop_filter_ref_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6);
}
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]);
if (update_loop_filter_mode_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6);
}
}
// save new deltas
loop_filter_ref_deltas = current_frame_info.ref_deltas;
loop_filter_mode_deltas = current_frame_info.mode_deltas;
}
}
uncomp_writer.WriteU(current_frame_info.base_q_index, 8);
uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2;
const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1;
// If it's less than the maximum, we need to add an extra 0 on the bitstream
// to indicate that it should stop reading.
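// Example: min = 0 and log2_tile_cols = 2 gives diff = 2 and mask = 0b11, so
// below the maximum this writes 0b110 (two increment bits and the stop bit).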
if (current_frame_info.log2_tile_cols < max_tile_cols_log2) {
uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1);
} else {
uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff);
}
const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0;
uncomp_writer.WriteBit(tile_rows_log2_is_nonzero);
if (tile_rows_log2_is_nonzero) {
uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1);
}
return uncomp_writer;
}
const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) {
std::vector<u8> bitstream;
{
Vp9FrameContainer curr_frame = GetCurrentFrame(state);
current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream);
}
// The uncompressed header routine sets PrevProb parameters needed for the compressed header
auto uncomp_writer = ComposeUncompressedHeader();
std::vector<u8> compressed_header = ComposeCompressedHeader();
uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16);
uncomp_writer.Flush();
std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
// Write headers and frame to buffer
frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(),
compressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(),
bitstream.data(), bitstream.size());
// keep track of frame number
current_frame_number++;
grace_period--;
// don't display hidden frames
hidden = !current_frame_info.show_frame;
return frame;
}
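// The encoder is primed with a single zero bit; this appears to correspond to the
// marker bit the VP9 bool decoder expects to read as 0 when it initializes.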
VpxRangeEncoder::VpxRangeEncoder() {
Write(false);
}
VpxRangeEncoder::~VpxRangeEncoder() = default;
void VpxRangeEncoder::Write(s32 value, s32 value_size) {
for (s32 bit = value_size - 1; bit >= 0; bit--) {
Write(((value >> bit) & 1) != 0);
}
}
void VpxRangeEncoder::Write(bool bit) {
Write(bit, half_probability);
}
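// Standard VP8/VP9 boolean range coder: the range is split in proportion to
// probability (out of 256), low/range are renormalized via norm_lut, and the
// carry loop below rewrites any already-emitted 0xff bytes when the low value
// overflows.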
void VpxRangeEncoder::Write(bool bit, s32 probability) {
u32 local_range = range;
const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8);
local_range = split;
if (bit) {
low_value += split;
local_range = range - split;
}
s32 shift = norm_lut[local_range];
local_range <<= shift;
count += shift;
if (count >= 0) {
const s32 offset = shift - count;
if (((low_value << (offset - 1)) >> 31) != 0) {
const s32 current_pos = static_cast<s32>(base_stream.GetPosition());
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
while (PeekByte() == 0xff) {
base_stream.WriteByte(0);
base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos);
}
base_stream.WriteByte(static_cast<u8>((PeekByte() + 1)));
base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin);
}
base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset))));
low_value <<= offset;
shift = count;
low_value &= 0xffffff;
count -= 8;
}
low_value <<= shift;
range = local_range;
}
void VpxRangeEncoder::End() {
for (std::size_t index = 0; index < 32; ++index) {
Write(false);
}
}
u8 VpxRangeEncoder::PeekByte() {
const u8 value = base_stream.ReadByte();
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
return value;
}
VpxBitStreamWriter::VpxBitStreamWriter() = default;
VpxBitStreamWriter::~VpxBitStreamWriter() = default;
void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
WriteBits(value, value_size);
}
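// Signed values are coded sign-magnitude, matching su(n) in the VP9 spec: the
// magnitude in value_size bits followed by a single sign bit.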
void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
const bool sign = value < 0;
if (sign) {
value = -value;
}
WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1);
}
void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
const bool delta_coded = value != 0;
WriteBit(delta_coded);
if (delta_coded) {
WriteBits(value, 4);
}
}
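// Packs bits MSB-first into the 8-bit accumulator, flushing a byte to byte_array
// whenever the accumulator fills.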
void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
s32 value_pos = 0;
s32 remaining = static_cast<s32>(bit_count);
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free = GetFreeBufferBits();
if (copy_size > free) {
copy_size = free;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void VpxBitStreamWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
s32 VpxBitStreamWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void VpxBitStreamWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
return byte_array;
}
} // namespace Tegra::Decoder

View File

@ -0,0 +1,197 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <vector>
#include "common/common_types.h"
#include "common/stream.h"
#include "video_core/command_classes/codecs/vp9_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
enum class FrameType { KeyFrame = 0, InterFrame = 1 };
namespace Decoder {
/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
class VpxRangeEncoder {
public:
VpxRangeEncoder();
~VpxRangeEncoder();
VpxRangeEncoder(const VpxRangeEncoder&) = delete;
VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete;
VpxRangeEncoder(VpxRangeEncoder&&) = default;
VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default;
/// Writes the rightmost value_size bits from value into the stream
void Write(s32 value, s32 value_size);
/// Writes a single bit with half probability
void Write(bool bit);
/// Writes a bit to the base_stream encoded with probability
void Write(bool bit, s32 probability);
/// Signal the end of the bitstream
void End();
[[nodiscard]] std::vector<u8>& GetBuffer() {
return base_stream.GetBuffer();
}
[[nodiscard]] const std::vector<u8>& GetBuffer() const {
return base_stream.GetBuffer();
}
private:
u8 PeekByte();
Common::Stream base_stream{};
u32 low_value{};
u32 range{0xff};
s32 count{-24};
s32 half_probability{128};
};
class VpxBitStreamWriter {
public:
VpxBitStreamWriter();
~VpxBitStreamWriter();
VpxBitStreamWriter(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter(VpxBitStreamWriter&&) = default;
VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default;
/// Write an unsigned integer value
void WriteU(u32 value, u32 value_size);
/// Write a signed integer value
void WriteS(s32 value, u32 value_size);
/// Based on 6.2.10 of VP9 Spec, writes a delta coded value
void WriteDeltaQ(u32 value);
/// Write a single bit.
void WriteBit(bool state);
/// Pushes current buffer into byte_array, resets buffer
void Flush();
/// Returns byte_array
[[nodiscard]] std::vector<u8>& GetByteArray();
/// Returns const byte_array
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
/// Write bit_count bits from value into buffer
void WriteBits(u32 value, u32 bit_count);
/// Gets next available position in buffer, invokes Flush() if buffer is full
s32 GetFreeBufferBits();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class VP9 {
public:
explicit VP9(GPU& gpu_);
~VP9();
VP9(const VP9&) = delete;
VP9& operator=(const VP9&) = delete;
VP9(VP9&&) = default;
VP9& operator=(VP9&&) = delete;
/// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
/// documentation
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state);
/// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const {
return hidden;
}
private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Generates compressed header probability updates in the bitstream writer
/// If probs are not equal, WriteProbabilityDelta is invoked
void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Generates compressed header probability deltas in the bitstream writer
void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Inverse of 6.3.4 Decode term subexp
void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
/// Writes if the value is less than the test value
bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
/// Writes probability updates for the Coef probabilities
void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob);
/// Write probabilities for 4-byte aligned structures
template <typename T, std::size_t N>
void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Returns VP9 information from NVDEC provided offset and size
[[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
[[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
/// Use NVDEC provided information to compose the headers for the current frame
[[nodiscard]] std::vector<u8> ComposeCompressedHeader();
[[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
GPU& gpu;
std::vector<u8> frame;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
bool hidden = false;
s64 current_frame_number = -2; // since we buffer 2 frames
s32 grace_period = 6; // frame offsets need to stabilize
std::array<FrameContexts, 4> frame_ctxs{};
Vp9FrameContainer next_frame{};
Vp9FrameContainer next_next_frame{};
bool swap_next_golden{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
s32 diff_update_probability = 252;
s32 frame_sync_code = 0x498342;
};
} // namespace Decoder
} // namespace Tegra

View File

@ -0,0 +1,302 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstring>
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
namespace Decoder {
struct Vp9FrameDimensions {
s16 width{};
s16 height{};
s16 luma_pitch{};
s16 chroma_pitch{};
};
static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9FrameDimensions is an invalid size");
enum FrameFlags : u32 {
IsKeyFrame = 1 << 0,
LastFrameIsKeyFrame = 1 << 1,
FrameSizeChanged = 1 << 2,
ErrorResilientMode = 1 << 3,
LastShowFrame = 1 << 4,
IntraOnly = 1 << 5,
};
enum class TxSize {
Tx4x4 = 0, // 4x4 transform
Tx8x8 = 1, // 8x8 transform
Tx16x16 = 2, // 16x16 transform
Tx32x32 = 3, // 32x32 transform
TxSizes = 4
};
enum class TxMode {
Only4X4 = 0, // Only 4x4 transform used
Allow8X8 = 1, // Allow block transform size up to 8x8
Allow16X16 = 2, // Allow block transform size up to 16x16
Allow32X32 = 3, // Allow block transform size up to 32x32
TxModeSelect = 4, // Transform specified for each block
TxModes = 5
};
struct Segmentation {
u8 enabled{};
u8 update_map{};
u8 temporal_update{};
u8 abs_delta{};
std::array<u32, 8> feature_mask{};
std::array<std::array<s16, 4>, 8> feature_data{};
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
struct LoopFilter {
u8 mode_ref_delta_enabled{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
};
static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
struct Vp9EntropyProbs {
std::array<u8, 36> y_mode_prob{};
std::array<u8, 64> partition_prob{};
std::array<u8, 1728> coef_probs{};
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 3> skip_probs{};
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 20> classes{};
std::array<u8, 2> class_0{};
std::array<u8, 20> prob_bits{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
};
static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size");
struct Vp9PictureInfo {
bool is_key_frame{};
bool intra_only{};
bool last_frame_was_key{};
bool frame_size_changed{};
bool error_resilient_mode{};
bool last_frame_shown{};
bool show_frame{};
std::array<s8, 4> ref_frame_sign_bias{};
s32 base_q_index{};
s32 y_dc_delta_q{};
s32 uv_dc_delta_q{};
s32 uv_ac_delta_q{};
bool lossless{};
s32 transform_mode{};
bool allow_high_precision_mv{};
s32 interp_filter{};
s32 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
s32 log2_tile_cols{};
s32 log2_tile_rows{};
bool segment_enabled{};
bool segment_map_update{};
bool segment_map_temporal_update{};
s32 segment_abs_delta{};
std::array<u32, 8> segment_feature_enable{};
std::array<std::array<s16, 4>, 8> segment_feature_data{};
bool mode_ref_delta_enabled{};
bool use_prev_in_find_mv_refs{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
Vp9EntropyProbs entropy{};
Vp9FrameDimensions frame_size{};
u8 first_level{};
u8 sharpness_level{};
u32 bitstream_size{};
std::array<u64, 4> frame_offsets{};
std::array<bool, 4> refresh_frame{};
};
struct Vp9FrameContainer {
Vp9PictureInfo info{};
std::vector<u8> bit_stream;
};
struct PictureInfo {
INSERT_PADDING_WORDS(12);
u32 bitstream_size{};
INSERT_PADDING_WORDS(5);
Vp9FrameDimensions last_frame_size{};
Vp9FrameDimensions golden_frame_size{};
Vp9FrameDimensions alt_frame_size{};
Vp9FrameDimensions current_frame_size{};
u32 vp9_flags{};
std::array<s8, 4> ref_frame_sign_bias{};
u8 first_level{};
u8 sharpness_level{};
u8 base_q_index{};
u8 y_dc_delta_q{};
u8 uv_ac_delta_q{};
u8 uv_dc_delta_q{};
u8 lossless{};
u8 tx_mode{};
u8 allow_high_precision_mv{};
u8 interp_filter{};
u8 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
u8 log2_tile_cols{};
u8 log2_tile_rows{};
Segmentation segmentation{};
LoopFilter loop_filter{};
INSERT_PADDING_BYTES(5);
u32 surface_params{};
INSERT_PADDING_WORDS(3);
[[nodiscard]] Vp9PictureInfo Convert() const {
return {
.is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
.intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
.last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
.frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
.error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
.last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
.ref_frame_sign_bias = ref_frame_sign_bias,
.base_q_index = base_q_index,
.y_dc_delta_q = y_dc_delta_q,
.uv_dc_delta_q = uv_dc_delta_q,
.uv_ac_delta_q = uv_ac_delta_q,
.lossless = lossless != 0,
.transform_mode = tx_mode,
.allow_high_precision_mv = allow_high_precision_mv != 0,
.interp_filter = interp_filter,
.reference_mode = reference_mode,
.comp_fixed_ref = comp_fixed_ref,
.comp_var_ref = comp_var_ref,
.log2_tile_cols = log2_tile_cols,
.log2_tile_rows = log2_tile_rows,
.segment_enabled = segmentation.enabled != 0,
.segment_map_update = segmentation.update_map != 0,
.segment_map_temporal_update = segmentation.temporal_update != 0,
.segment_abs_delta = segmentation.abs_delta,
.segment_feature_enable = segmentation.feature_mask,
.segment_feature_data = segmentation.feature_data,
.mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
.use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
                            (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
                            (vp9_flags & FrameFlags::IntraOnly) == 0 &&
                            (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
                            (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
.ref_deltas = loop_filter.ref_deltas,
.mode_deltas = loop_filter.mode_deltas,
.frame_size = current_frame_size,
.first_level = first_level,
.sharpness_level = sharpness_level,
.bitstream_size = bitstream_size,
};
}
};
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
INSERT_PADDING_BYTES(1024);
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
INSERT_PADDING_BYTES(80);
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> y_mode_prob_e8{};
std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
INSERT_PADDING_BYTES(64);
std::array<u8, 64> partition_prob{};
INSERT_PADDING_BYTES(10);
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 3> skip_probs{};
INSERT_PADDING_BYTES(1);
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 2> class_0{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
std::array<u8, 20> classes{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 20> pred_bits{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
INSERT_PADDING_BYTES(17);
std::array<u8, 2304> coef_probs{};
void Convert(Vp9EntropyProbs& fc) {
fc.inter_mode_prob = inter_mode_prob;
fc.intra_inter_prob = intra_inter_prob;
fc.tx_8x8_prob = tx_8x8_prob;
fc.tx_16x16_prob = tx_16x16_prob;
fc.tx_32x32_prob = tx_32x32_prob;
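// y_mode_prob arrives split: entries 0-7 of each of the four block-size contexts
// sit in y_mode_prob_e0e7 and entry 8 in y_mode_prob_e8; recombine them into the
// flat 36-entry array.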
for (std::size_t i = 0; i < 4; i++) {
for (std::size_t j = 0; j < 9; j++) {
fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
}
}
fc.partition_prob = partition_prob;
fc.switchable_interp_prob = switchable_interp_prob;
fc.comp_inter_prob = comp_inter_prob;
fc.skip_probs = skip_probs;
fc.joints = joints;
fc.sign = sign;
fc.class_0 = class_0;
fc.fr = fr;
fc.class_0_hp = class_0_hp;
fc.high_precision = high_precision;
fc.classes = classes;
fc.class_0_fr = class_0_fr;
fc.prob_bits = pred_bits;
fc.single_ref_prob = single_ref_prob;
fc.comp_ref_prob = comp_ref_prob;
// Skip every fourth element, as it goes unused
for (std::size_t i = 0; i < coef_probs.size(); i += 4) {
const std::size_t j = i - i / 4;
fc.coef_probs[j] = coef_probs[i];
fc.coef_probs[j + 1] = coef_probs[i + 1];
fc.coef_probs[j + 2] = coef_probs[i + 2];
}
}
};
static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
enum class Ref { Last, Golden, AltRef };
struct RefPoolElement {
s64 frame{};
Ref ref{};
bool refresh{};
};
struct FrameContexts {
s64 from{};
bool adapted{};
Vp9EntropyProbs probs{};
};
} // namespace Decoder
} // namespace Tegra

View File

@ -0,0 +1,30 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/host1x.h"
#include "video_core/gpu.h"
Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
Tegra::Host1x::~Host1x() = default;
void Tegra::Host1x::ProcessMethod(Method method, u32 argument) {
switch (method) {
case Method::LoadSyncptPayload32:
syncpoint_value = argument;
break;
case Method::WaitSyncpt:
case Method::WaitSyncpt32:
Execute(argument);
break;
default:
UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
break;
}
}
void Tegra::Host1x::Execute(u32 data) {
gpu.WaitFence(data, syncpoint_value);
}

View File

@ -0,0 +1,37 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
class Nvdec;
class Host1x {
public:
enum class Method : u32 {
WaitSyncpt = 0x8,
LoadSyncptPayload32 = 0x4e,
WaitSyncpt32 = 0x50,
};
explicit Host1x(GPU& gpu);
~Host1x();
/// Writes the method into the state, invoking Execute() when a wait method is encountered
void ProcessMethod(Method method, u32 argument);
private:
/// For Host1x, Execute() waits on the syncpoint value previously written into the state
void Execute(u32 data);
u32 syncpoint_value{};
GPU& gpu;
};
} // namespace Tegra

View File

@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/gpu.h"
namespace Tegra {
Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
Nvdec::~Nvdec() = default;
void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
if (method == Method::SetVideoCodec) {
codec->StateWrite(static_cast<u32>(method), arguments[0]);
} else {
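// Anything other than the codec id holds a GPU address, which NVDEC appears to
// receive shifted right by 8 bits; restore the full address before storing it.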
codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
}
switch (method) {
case Method::SetVideoCodec:
codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
break;
case Method::Execute:
Execute();
break;
}
}
AVFramePtr Nvdec::GetFrame() {
return codec->GetCurrentFrame();
}
void Nvdec::Execute() {
switch (codec->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::Vp9:
codec->Decode();
break;
default:
UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
break;
}
}
} // namespace Tegra

View File

@ -0,0 +1,38 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/common_types.h"
#include "video_core/command_classes/codecs/codec.h"
namespace Tegra {
class GPU;
class Nvdec {
public:
enum class Method : u32 {
SetVideoCodec = 0x80,
Execute = 0xc0,
};
explicit Nvdec(GPU& gpu);
~Nvdec();
/// Writes the method into the state, invoking Execute() when the Execute method is encountered
void ProcessMethod(Method method, const std::vector<u32>& arguments);
/// Return most recently decoded frame
[[nodiscard]] AVFramePtr GetFrame();
private:
/// Invoke codec to decode a frame
void Execute();
GPU& gpu;
std::unique_ptr<Codec> codec;
};
} // namespace Tegra

View File

@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra::NvdecCommon {
struct NvdecRegisters {
INSERT_PADDING_WORDS(256);
u64 set_codec_id{};
INSERT_PADDING_WORDS(254);
u64 set_platform_id{};
u64 picture_info_offset{};
u64 frame_bitstream_offset{};
u64 frame_number{};
u64 h264_slice_data_offsets{};
u64 h264_mv_dump_offset{};
INSERT_PADDING_WORDS(6);
u64 frame_stats_offset{};
u64 h264_last_surface_luma_offset{};
u64 h264_last_surface_chroma_offset{};
std::array<u64, 17> surface_luma_offset{};
std::array<u64, 17> surface_chroma_offset{};
INSERT_PADDING_WORDS(132);
u64 vp9_entropy_probs_offset{};
u64 vp9_backward_updates_offset{};
u64 vp9_last_frame_segmap_offset{};
u64 vp9_curr_frame_segmap_offset{};
INSERT_PADDING_WORDS(2);
u64 vp9_last_frame_mvs_offset{};
u64 vp9_curr_frame_mvs_offset{};
INSERT_PADDING_WORDS(2);
};
static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
enum class VideoCodec : u32 {
None = 0x0,
H264 = 0x3,
Vp8 = 0x5,
H265 = 0x7,
Vp9 = 0x9,
};
} // namespace Tegra::NvdecCommon

View File

@ -0,0 +1,60 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <algorithm>
#include "sync_manager.h"
#include "video_core/gpu.h"
namespace Tegra {
SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
SyncptIncrManager::~SyncptIncrManager() = default;
void SyncptIncrManager::Increment(u32 id) {
increments.emplace_back(0, 0, id, true);
IncrementAllDone();
}
u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
const u32 handle = current_id++;
increments.emplace_back(handle, class_id, id);
return handle;
}
void SyncptIncrManager::SignalDone(u32 handle) {
const auto done_incr =
std::find_if(increments.begin(), increments.end(),
[handle](const SyncptIncr& incr) { return incr.id == handle; });
if (done_incr != increments.cend()) {
done_incr->complete = true;
}
IncrementAllDone();
}
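// Syncpoints are incremented strictly in submission order: the scan stops at the
// first incomplete entry, so a later completion waits for earlier ones.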
void SyncptIncrManager::IncrementAllDone() {
std::size_t done_count = 0;
for (; done_count < increments.size(); ++done_count) {
if (!increments[done_count].complete) {
break;
}
gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
}
increments.erase(increments.begin(), increments.begin() + done_count);
}
} // namespace Tegra

View File

@ -0,0 +1,64 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
namespace Tegra {
class GPU;
struct SyncptIncr {
u32 id;
u32 class_id;
u32 syncpt_id;
bool complete;
SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
: id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
};
class SyncptIncrManager {
public:
explicit SyncptIncrManager(GPU& gpu);
~SyncptIncrManager();
/// Adds a completed increment for the given syncpoint id, then increments all done
void Increment(u32 id);
/// Returns a handle to increment later
u32 IncrementWhenDone(u32 class_id, u32 id);
/// Marks the increment with the given handle as complete, then increments all done
void SignalDone(u32 handle);
/// Increment all sequential pending increments that are already done.
void IncrementAllDone();
private:
std::vector<SyncptIncr> increments;
std::mutex increment_lock;
u32 current_id{};
GPU& gpu;
};
} // namespace Tegra

View File

@ -0,0 +1,175 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/command_classes/vic.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"
extern "C" {
#include <libswscale/swscale.h>
}
namespace Tegra {
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
Vic::~Vic() = default;
void Vic::VicStateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
VicStateWrite(static_cast<u32>(method), arguments[0]);
const u64 arg = static_cast<u64>(arguments[0]) << 8;
switch (method) {
case Method::Execute:
Execute();
break;
case Method::SetConfigStructOffset:
config_struct_address = arg;
break;
case Method::SetOutputSurfaceLumaOffset:
output_surface_luma_address = arg;
break;
case Method::SetOutputSurfaceChromaUOffset:
output_surface_chroma_u_address = arg;
break;
case Method::SetOutputSurfaceChromaVOffset:
output_surface_chroma_v_address = arg;
break;
default:
break;
}
}
void Vic::Execute() {
if (output_surface_luma_address == 0) {
LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}",
vic_state.output_surface.luma_offset);
return;
}
const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
const auto* frame = frame_ptr.get();
if (!frame || frame->width == 0 || frame->height == 0) {
return;
}
const VideoPixelFormat pixel_format =
static_cast<VideoPixelFormat>(config.pixel_format.Value());
switch (pixel_format) {
case VideoPixelFormat::BGRA8:
case VideoPixelFormat::RGBA8: {
LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
if (scaler_ctx == nullptr || frame->width != scaler_width ||
frame->height != scaler_height) {
const AVPixelFormat target_format =
(pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
sws_freeContext(scaler_ctx);
scaler_ctx = nullptr;
// FFmpeg returns all frames in YUV420; convert to the expected format.
scaler_ctx =
sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
frame->height, target_format, 0, nullptr, nullptr, nullptr);
scaler_width = frame->width;
scaler_height = frame->height;
}
// Get Converted frame
const std::size_t linear_size = frame->width * frame->height * 4;
using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
const int converted_stride{frame->width * 4};
u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
&converted_frame_buf_addr, &converted_stride);
const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
if (blk_kind != 0) {
// swizzle pitch linear to block linear
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
block_height, 0);
std::vector<u8> swizzled_data(size);
Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
frame->width, 4, swizzled_data.data(),
converted_frame_buffer.get(), block_height, 0, 0);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
gpu.Maxwell3D().OnMemoryWrite();
} else {
// send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size);
gpu.Maxwell3D().OnMemoryWrite();
}
break;
}
case VideoPixelFormat::Yuv420: {
LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
const std::size_t surface_width = config.surface_width_minus1 + 1;
const std::size_t surface_height = config.surface_height_minus1 + 1;
const std::size_t half_width = surface_width / 2;
const std::size_t half_height = surface_height / 2;
const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
const auto* luma_ptr = frame->data[0];
const auto* chroma_b_ptr = frame->data[1];
const auto* chroma_r_ptr = frame->data[2];
const auto stride = frame->linesize[0];
const auto half_stride = frame->linesize[1];
std::vector<u8> luma_buffer(aligned_width * surface_height);
std::vector<u8> chroma_buffer(aligned_width * half_height);
// Populate luma buffer
for (std::size_t y = 0; y < surface_height - 1; ++y) {
const std::size_t src = y * stride;
const std::size_t dst = y * aligned_width;
for (std::size_t offset = 0; offset < surface_width; ++offset) {
luma_buffer[dst + offset] = luma_ptr[src + offset];
}
}
gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
luma_buffer.size());
// Populate the chroma buffer from both half-resolution planes, interleaving U and
// V (NV12-style semi-planar layout).
for (std::size_t y = 0; y < half_height; ++y) {
const std::size_t src = y * half_stride;
const std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < half_width; ++x) {
chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
}
}
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size());
gpu.Maxwell3D().OnMemoryWrite();
break;
}
default:
UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
break;
}
}
} // namespace Tegra

View File

@ -0,0 +1,110 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
struct PlaneOffsets {
u32 luma_offset{};
u32 chroma_u_offset{};
u32 chroma_v_offset{};
};
struct VicRegisters {
INSERT_PADDING_WORDS(64);
u32 nop{};
INSERT_PADDING_WORDS(15);
u32 pm_trigger{};
INSERT_PADDING_WORDS(47);
u32 set_application_id{};
u32 set_watchdog_timer{};
INSERT_PADDING_WORDS(17);
u32 context_save_area{};
u32 context_switch{};
INSERT_PADDING_WORDS(43);
u32 execute{};
INSERT_PADDING_WORDS(63);
std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
u32 picture_index{};
u32 control_params{};
u32 config_struct_offset{};
u32 filter_struct_offset{};
u32 palette_offset{};
u32 hist_offset{};
u32 context_id{};
u32 fce_ucode_size{};
PlaneOffsets output_surface{};
u32 fce_ucode_offset{};
INSERT_PADDING_WORDS(4);
std::array<u32, 8> slot_context_id{};
INSERT_PADDING_WORDS(16);
};
static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
class Vic {
public:
enum class Method : u32 {
Execute = 0xc0,
SetControlParams = 0x1c1,
SetConfigStructOffset = 0x1c2,
SetOutputSurfaceLumaOffset = 0x1c8,
SetOutputSurfaceChromaUOffset = 0x1c9,
SetOutputSurfaceChromaVOffset = 0x1ca
};
explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
~Vic();
/// Write to the device state.
void ProcessMethod(Method method, const std::vector<u32>& arguments);
private:
void Execute();
void VicStateWrite(u32 offset, u32 arguments);
VicRegisters vic_state{};
enum class VideoPixelFormat : u64_le {
RGBA8 = 0x1f,
BGRA8 = 0x20,
Yuv420 = 0x44,
};
union VicConfig {
u64_le raw{};
BitField<0, 7, u64_le> pixel_format;
BitField<7, 2, u64_le> chroma_loc_horiz;
BitField<9, 2, u64_le> chroma_loc_vert;
BitField<11, 4, u64_le> block_linear_kind;
BitField<15, 4, u64_le> block_linear_height_log2;
BitField<19, 3, u64_le> reserved0;
BitField<22, 10, u64_le> reserved1;
BitField<32, 14, u64_le> surface_width_minus1;
BitField<46, 14, u64_le> surface_height_minus1;
};
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
GPUVAddr config_struct_address{};
GPUVAddr output_surface_luma_address{};
GPUVAddr output_surface_chroma_u_address{};
GPUVAddr output_surface_chroma_v_address{};
SwsContext* scaler_ctx{};
s32 scaler_width{};
s32 scaler_height{};
};
} // namespace Tegra

View File

@ -3,33 +3,33 @@
// Refer to the license.txt file included.
#include <array>
#include <bitset>
#include <cstddef>
#include "common/common_types.h"
#include "video_core/compatible_formats.h"
#include "video_core/surface.h"
namespace VideoCore::Surface {
namespace {
using Table = std::array<std::array<u64, 2>, MaxPixelFormat>;
// Compatibility table taken from Table 3.X.2 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
constexpr std::array VIEW_CLASS_128_BITS = {
constexpr std::array VIEW_CLASS_128_BITS{
PixelFormat::R32G32B32A32_FLOAT,
PixelFormat::R32G32B32A32_UINT,
PixelFormat::R32G32B32A32_SINT,
};
constexpr std::array VIEW_CLASS_96_BITS = {
constexpr std::array VIEW_CLASS_96_BITS{
PixelFormat::R32G32B32_FLOAT,
};
// Missing formats:
// PixelFormat::RGB32UI,
// PixelFormat::RGB32I,
constexpr std::array VIEW_CLASS_64_BITS = {
constexpr std::array VIEW_CLASS_64_BITS{
PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT,
PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
@ -38,7 +38,7 @@ constexpr std::array VIEW_CLASS_64_BITS = {
// TODO: How should we handle 48 bits?
constexpr std::array VIEW_CLASS_32_BITS = {
constexpr std::array VIEW_CLASS_32_BITS{
PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT,
PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT,
PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM,
@ -50,43 +50,105 @@ constexpr std::array VIEW_CLASS_32_BITS = {
// TODO: How should we handle 24 bits?
constexpr std::array VIEW_CLASS_16_BITS = {
constexpr std::array VIEW_CLASS_16_BITS{
PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT,
PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM,
PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT,
};
constexpr std::array VIEW_CLASS_8_BITS = {
constexpr std::array VIEW_CLASS_8_BITS{
PixelFormat::R8_UINT,
PixelFormat::R8_UNORM,
PixelFormat::R8_SINT,
PixelFormat::R8_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC1_RED = {
constexpr std::array VIEW_CLASS_RGTC1_RED{
PixelFormat::BC4_UNORM,
PixelFormat::BC4_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC2_RG = {
constexpr std::array VIEW_CLASS_RGTC2_RG{
PixelFormat::BC5_UNORM,
PixelFormat::BC5_SNORM,
};
constexpr std::array VIEW_CLASS_BPTC_UNORM = {
constexpr std::array VIEW_CLASS_BPTC_UNORM{
PixelFormat::BC7_UNORM,
PixelFormat::BC7_SRGB,
};
constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
constexpr std::array VIEW_CLASS_BPTC_FLOAT{
PixelFormat::BC6H_SFLOAT,
PixelFormat::BC6H_UFLOAT,
};
constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{
PixelFormat::ASTC_2D_4X4_UNORM,
PixelFormat::ASTC_2D_4X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{
PixelFormat::ASTC_2D_5X4_UNORM,
PixelFormat::ASTC_2D_5X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{
PixelFormat::ASTC_2D_5X5_UNORM,
PixelFormat::ASTC_2D_5X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{
PixelFormat::ASTC_2D_6X5_UNORM,
PixelFormat::ASTC_2D_6X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{
PixelFormat::ASTC_2D_6X6_UNORM,
PixelFormat::ASTC_2D_6X6_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{
PixelFormat::ASTC_2D_8X5_UNORM,
PixelFormat::ASTC_2D_8X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{
PixelFormat::ASTC_2D_8X8_UNORM,
PixelFormat::ASTC_2D_8X8_SRGB,
};
// Missing formats:
// PixelFormat::ASTC_2D_10X5_UNORM
// PixelFormat::ASTC_2D_10X5_SRGB
// Missing formats:
// PixelFormat::ASTC_2D_10X6_UNORM
// PixelFormat::ASTC_2D_10X6_SRGB
constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{
PixelFormat::ASTC_2D_10X8_UNORM,
PixelFormat::ASTC_2D_10X8_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{
PixelFormat::ASTC_2D_10X10_UNORM,
PixelFormat::ASTC_2D_10X10_SRGB,
};
// Missing formats
// ASTC_2D_12X10_UNORM,
// ASTC_2D_12X10_SRGB,
constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{
PixelFormat::ASTC_2D_12X12_UNORM,
PixelFormat::ASTC_2D_12X12_SRGB,
};
// Compatibility table taken from Table 4.X.1 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
constexpr std::array COPY_CLASS_128_BITS = {
constexpr std::array COPY_CLASS_128_BITS{
PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT,
PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM,
PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM,
@ -97,7 +159,7 @@ constexpr std::array COPY_CLASS_128_BITS = {
// PixelFormat::RGBA32I
// COMPRESSED_RG_RGTC2
constexpr std::array COPY_CLASS_64_BITS = {
constexpr std::array COPY_CLASS_64_BITS{
PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT,
@ -110,32 +172,36 @@ constexpr std::array COPY_CLASS_64_BITS = {
// COMPRESSED_RGBA_S3TC_DXT1_EXT
// COMPRESSED_SIGNED_RED_RGTC1
void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) {
compatiblity[format_a][format_b] = true;
compatiblity[format_b][format_a] = true;
constexpr void Enable(Table& table, size_t format_a, size_t format_b) {
table[format_a][format_b / 64] |= u64(1) << (format_b % 64);
table[format_b][format_a / 64] |= u64(1) << (format_a % 64);
}
void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) {
Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
}
template <typename Range>
void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
constexpr void EnableRange(Table& table, const Range& range) {
for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
for (auto it_b = it_a; it_b != range.end(); ++it_b) {
Enable(compatibility, *it_a, *it_b);
Enable(table, *it_a, *it_b);
}
}
}
} // Anonymous namespace
constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) {
const size_t a = static_cast<size_t>(format_a);
const size_t b = static_cast<size_t>(format_b);
return ((table[a][b / 64] >> (b % 64)) & 1) != 0;
}
FormatCompatibility::FormatCompatibility() {
constexpr Table MakeViewTable() {
Table view{};
for (size_t i = 0; i < MaxPixelFormat; ++i) {
// Identity is allowed
Enable(view, i, i);
}
EnableRange(view, VIEW_CLASS_128_BITS);
EnableRange(view, VIEW_CLASS_96_BITS);
EnableRange(view, VIEW_CLASS_64_BITS);
@ -146,10 +212,39 @@ FormatCompatibility::FormatCompatibility() {
EnableRange(view, VIEW_CLASS_RGTC2_RG);
EnableRange(view, VIEW_CLASS_BPTC_UNORM);
EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA);
return view;
}
copy = view;
constexpr Table MakeCopyTable() {
Table copy = MakeViewTable();
EnableRange(copy, COPY_CLASS_128_BITS);
EnableRange(copy, COPY_CLASS_64_BITS);
return copy;
}
} // Anonymous namespace
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views) {
if (broken_views) {
// If format views are broken, only accept formats that are identical.
return format_a == format_b;
}
static constexpr Table TABLE = MakeViewTable();
return IsSupported(TABLE, format_a, format_b);
}
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) {
static constexpr Table TABLE = MakeCopyTable();
return IsSupported(TABLE, format_a, format_b);
}
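// A minimal usage sketch (illustrative only; the format pairs come from the
// classes defined above):
//   IsViewCompatible(PixelFormat::ASTC_2D_6X5_UNORM,
//                    PixelFormat::ASTC_2D_6X5_SRGB,
//                    /*broken_views=*/false);         // true, same view class
//   IsCopyCompatible(PixelFormat::BC2_UNORM,
//                    PixelFormat::R32G32B32A32_UINT); // true, 128-bit copy class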
} // namespace VideoCore::Surface

View File

@ -4,31 +4,12 @@
#pragma once
#include <array>
#include <bitset>
#include <cstddef>
#include "video_core/surface.h"
namespace VideoCore::Surface {
class FormatCompatibility {
public:
using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views);
explicit FormatCompatibility();
bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
}
bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
}
private:
Table view;
Table copy;
};
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b);
} // namespace VideoCore::Surface

View File

@ -0,0 +1,32 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <utility>
#include <vector>
namespace VideoCommon {
/// Container to push objects to be destroyed a few ticks in the future
template <typename T, size_t TICKS_TO_DESTROY>
class DelayedDestructionRing {
public:
void Tick() {
index = (index + 1) % TICKS_TO_DESTROY;
elements[index].clear();
}
void Push(T&& object) {
elements[index].push_back(std::move(object));
}
private:
size_t index = 0;
std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
};
} // namespace VideoCommon
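// A minimal usage sketch, assuming a hypothetical Buffer resource type; an
// object pushed into the ring stays alive for TICKS_TO_DESTROY calls to Tick():
//   VideoCommon::DelayedDestructionRing<Buffer, 8> ring;
//   ring.Push(std::move(old_buffer)); // deferred, not destroyed yet
//   ring.Tick(); // called once per tick; the 8th call frees old_buffer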

View File

@ -9,13 +9,16 @@
#include "video_core/dirty_flags.h"
#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32))
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
namespace VideoCommon::Dirty {
using Tegra::Engines::Maxwell3D;
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
static constexpr std::size_t num_per_rt = NUM(rt[0]);
static constexpr std::size_t begin = OFF(rt);
static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@ -23,6 +26,10 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt);
}
FillBlock(tables[1], begin, num, RenderTargets);
FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets);
tables[0][OFF(rt_control)] = RenderTargets;
tables[1][OFF(rt_control)] = RenderTargetControl;
static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
for (std::size_t i = 0; i < std::size(zeta_flags); ++i) {

View File

@ -16,7 +16,10 @@ namespace VideoCommon::Dirty {
enum : u8 {
NullEntry = 0,
Descriptors,
RenderTargets,
RenderTargetControl,
ColorBuffer0,
ColorBuffer1,
ColorBuffer2,

View File

@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/cityhash.h"
#include "common/microprofile.h"
#include "core/core.h"
#include "core/memory.h"
@ -12,7 +13,7 @@
namespace Tegra {
DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {}
DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {}
DmaPusher::~DmaPusher() = default;
@ -45,32 +46,41 @@ bool DmaPusher::Step() {
return false;
}
const CommandList& command_list{dma_pushbuffer.front()};
ASSERT_OR_EXECUTE(!command_list.empty(), {
// Somehow the command_list is empty. To avoid a crash,
// ignore it and assume its size is 0.
CommandList& command_list{dma_pushbuffer.front()};
ASSERT_OR_EXECUTE(
command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
// Somehow the command_list is empty. To avoid a crash,
// ignore it and assume its size is 0.
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
return true;
});
if (command_list.prefetch_command_list.size()) {
// Prefetched command list from nvdrv, used for things like synchronization
command_headers = std::move(command_list.prefetch_command_list);
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
return true;
});
const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
const GPUVAddr dma_get = command_list_header.addr;
} else {
const CommandListHeader command_list_header{
command_list.command_lists[dma_pushbuffer_subindex++]};
const GPUVAddr dma_get = command_list_header.addr;
if (dma_pushbuffer_subindex >= command_list.size()) {
// We've gone through the current list, remove it from the queue
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
// We've gone through the current list, remove it from the queue
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
}
if (command_list_header.size == 0) {
return true;
}
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
command_list_header.size * sizeof(u32));
}
if (command_list_header.size == 0) {
return true;
}
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
command_list_header.size * sizeof(u32));
for (std::size_t index = 0; index < command_headers.size();) {
const CommandHeader& command_header = command_headers[index];
@ -142,7 +152,12 @@ void DmaPusher::SetState(const CommandHeader& command_header) {
void DmaPusher::CallMethod(u32 argument) const {
if (dma_state.method < non_puller_methods) {
gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count});
gpu.CallMethod(GPU::MethodCall{
dma_state.method,
argument,
dma_state.subchannel,
dma_state.method_count,
});
} else {
subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
dma_state.is_last_call);

View File

@ -18,6 +18,8 @@ class System;
namespace Tegra {
class GPU;
enum class SubmissionMode : u32 {
IncreasingOld = 0,
Increasing = 1,
@ -27,6 +29,31 @@ enum class SubmissionMode : u32 {
IncreaseOnce = 5
};
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in the docs. Here we do not multiply by 4.
// As a result, the values you see in the docs are 4 times the values used here.
enum class BufferMethods : u32 {
BindObject = 0x0,
Nop = 0x2,
SemaphoreAddressHigh = 0x4,
SemaphoreAddressLow = 0x5,
SemaphoreSequence = 0x6,
SemaphoreTrigger = 0x7,
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
FenceValue = 0x1C,
FenceAction = 0x1D,
WaitForInterrupt = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
NonPullerMethods = 0x40,
};
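// Worked example of the note above: SemaphoreSequence is 0x6 here but appears
// as 0x6 * 4 == 0x18 in 4-byte-addressed listings; likewise Yield (0x20)
// shows up as 0x80.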
struct CommandListHeader {
union {
u64 raw;
@ -49,9 +76,23 @@ union CommandHeader {
static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
class GPU;
inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
CommandHeader result{};
result.method.Assign(static_cast<u32>(method));
result.arg_count.Assign(arg_count);
result.mode.Assign(mode);
return result;
}
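// Illustrative use when assembling a prefetch command list (the chosen method
// and mode are examples, not taken from a real call site):
//   const CommandHeader header = BuildCommandHeader(
//       BufferMethods::SemaphoreSequence, /*arg_count=*/1,
//       SubmissionMode::Increasing);
// The header would then be followed by one 32-bit word carrying the argument.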
using CommandList = std::vector<Tegra::CommandListHeader>;
struct CommandList final {
CommandList() = default;
explicit CommandList(std::size_t size) : command_lists(size) {}
explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_)
: prefetch_command_list{std::move(prefetch_command_list_)} {}
std::vector<CommandListHeader> command_lists;
std::vector<CommandHeader> prefetch_command_list;
};
/**
* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
@ -60,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>;
* See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
* details on this implementation.
*/
class DmaPusher {
class DmaPusher final {
public:
explicit DmaPusher(Core::System& system, GPU& gpu);
explicit DmaPusher(Core::System& system_, GPU& gpu_);
~DmaPusher();
void Push(CommandList&& entries) {
@ -71,7 +112,7 @@ public:
void DispatchCalls();
void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) {
void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
subchannels[subchannel_id] = engine;
}
@ -104,7 +145,7 @@ private:
bool ib_enable{true}; ///< IB mode enabled
std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{};
std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
GPU& gpu;
Core::System& system;

View File

@ -11,16 +11,16 @@
namespace Tegra::Engines::Upload {
State::State(MemoryManager& memory_manager, Registers& regs)
: regs{regs}, memory_manager{memory_manager} {}
State::State(MemoryManager& memory_manager_, Registers& regs_)
: regs{regs_}, memory_manager{memory_manager_} {}
State::~State() = default;
void State::ProcessExec(const bool is_linear) {
void State::ProcessExec(const bool is_linear_) {
write_offset = 0;
copy_size = regs.line_length_in * regs.line_count;
inner_buffer.resize(copy_size);
this->is_linear = is_linear;
is_linear = is_linear_;
}
void State::ProcessData(const u32 data, const bool is_last_call) {

View File

@ -54,10 +54,10 @@ struct Registers {
class State {
public:
State(MemoryManager& memory_manager, Registers& regs);
explicit State(MemoryManager& memory_manager_, Registers& regs_);
~State();
void ProcessExec(bool is_linear);
void ProcessExec(bool is_linear_);
void ProcessData(u32 data, bool is_last_call);
private:

View File

@ -10,7 +10,11 @@
namespace Tegra::Engines {
Fermi2D::Fermi2D() = default;
Fermi2D::Fermi2D() {
// Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1;
regs.dst.depth = 1;
}
Fermi2D::~Fermi2D() = default;
@ -21,79 +25,43 @@ void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Fermi2D register, increase the size of the Regs structure");
regs.reg_array[method] = method_argument;
switch (method) {
// Trigger the surface copy on the last register write. The last register is blit_src_y,
// which is 64-bit, so trigger on the second 32-bit write.
case FERMI2D_REG_INDEX(blit_src_y) + 1: {
HandleSurfaceCopy();
break;
}
if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) {
Blit();
}
}
void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
for (u32 i = 0; i < amount; ++i) {
CallMethod(method, base_start[i], methods_pending - i <= 1);
}
}
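// For illustration: with amount == 3 and methods_pending == 3, the iterations
// pass is_last_call = false, false, true, since 3 - i <= 1 only holds for the
// final i == 2.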
static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) {
const u32 line_a = src_2 - src_1;
const u32 line_b = dst_2 - dst_1;
const u32 excess = std::max<s32>(0, line_a - src_line + src_1);
return {line_b - (excess * line_b) / line_a, excess};
}
void Fermi2D::Blit() {
LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
regs.src.Address(), regs.dst.Address());
void Fermi2D::HandleSurfaceCopy() {
LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
static_cast<u32>(regs.operation));
UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
// TODO(Subv): Only raw copies are implemented.
ASSERT(regs.operation == Operation::SrcCopy);
const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)};
const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)};
u32 src_blit_x2, src_blit_y2;
if (regs.blit_control.origin == Origin::Corner) {
src_blit_x2 =
static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32);
src_blit_y2 =
static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32);
} else {
src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
}
u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width;
u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height;
const auto [new_dst_w, src_excess_x] =
DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width);
const auto [new_dst_h, src_excess_y] =
DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height);
dst_blit_x2 = new_dst_w + regs.blit_dst_x;
src_blit_x2 = src_blit_x2 - src_excess_x;
dst_blit_y2 = new_dst_h + regs.blit_dst_y;
src_blit_y2 = src_blit_y2 - src_excess_y;
const auto [new_src_w, dst_excess_x] =
DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width);
const auto [new_src_h, dst_excess_y] =
DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height);
src_blit_x2 = new_src_w + src_blit_x1;
dst_blit_x2 = dst_blit_x2 - dst_excess_x;
src_blit_y2 = new_src_h + src_blit_y1;
dst_blit_y2 = dst_blit_y2 - dst_excess_y;
const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2,
dst_blit_y2};
const Config copy_config{
const auto& args = regs.pixels_from_memory;
const Config config{
.operation = regs.operation,
.filter = regs.blit_control.filter,
.src_rect = src_rect,
.dst_rect = dst_rect,
.filter = args.sample_mode.filter,
.dst_x0 = args.dst_x0,
.dst_y0 = args.dst_y0,
.dst_x1 = args.dst_x0 + args.dst_width,
.dst_y1 = args.dst_y0 + args.dst_height,
.src_x0 = static_cast<s32>(args.src_x0 >> 32),
.src_y0 = static_cast<s32>(args.src_y0 >> 32),
.src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
.src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
};
if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) {
if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) {
UNIMPLEMENTED();
}
}
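// The source coordinates above are 32.32 fixed point. For illustration, a blit
// that squeezes a 128-pixel-wide source into a 64-pixel destination would use
// du_dx == (s64{128} << 32) / 64 == s64{2} << 32, so (assuming a zero
// fractional part in src_x0)
// src_x1 == (du_dx * 64 + src_x0) >> 32 == (src_x0 >> 32) + 128.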

View File

@ -53,8 +53,8 @@ public:
};
enum class Filter : u32 {
PointSample = 0, // Nearest
Linear = 1,
Point = 0,
Bilinear = 1,
};
enum class Operation : u32 {
@ -67,88 +67,235 @@ public:
BlendPremult = 6,
};
struct Regs {
static constexpr std::size_t NUM_REGS = 0x258;
enum class MemoryLayout : u32 {
BlockLinear = 0,
Pitch = 1,
};
struct Surface {
RenderTargetFormat format;
BitField<0, 1, u32> linear;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 depth;
u32 layer;
u32 pitch;
u32 width;
u32 height;
u32 address_high;
u32 address_low;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
u32 BlockWidth() const {
return block_width.Value();
}
u32 BlockHeight() const {
return block_height.Value();
}
u32 BlockDepth() const {
return block_depth.Value();
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
enum class CpuIndexWrap : u32 {
Wrap = 0,
NoWrap = 1,
};
struct Surface {
RenderTargetFormat format;
MemoryLayout linear;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 depth;
u32 layer;
u32 pitch;
u32 width;
u32 height;
u32 addr_upper;
u32 addr_lower;
[[nodiscard]] constexpr GPUVAddr Address() const noexcept {
return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
enum class SectorPromotion : u32 {
NoPromotion = 0,
PromoteTo2V = 1,
PromoteTo2H = 2,
PromoteTo4 = 3,
};
enum class NumTpcs : u32 {
All = 0,
One = 1,
};
enum class RenderEnableMode : u32 {
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
enum class ColorKeyFormat : u32 {
A16R56G6B5 = 0,
A1R5G55B5 = 1,
A8R8G8B8 = 2,
A2R10G10B10 = 3,
Y8 = 4,
Y16 = 5,
Y32 = 6,
};
union Beta4 {
BitField<0, 8, u32> b;
BitField<8, 8, u32> g;
BitField<16, 8, u32> r;
BitField<24, 8, u32> a;
};
struct Point {
u32 x;
u32 y;
};
enum class PatternSelect : u32 {
MonoChrome8x8 = 0,
MonoChrome64x1 = 1,
MonoChrome1x64 = 2,
Color = 3,
};
enum class NotifyType : u32 {
WriteOnly = 0,
WriteThenAwaken = 1,
};
enum class MonochromePatternColorFormat : u32 {
A8X8R8G6B5 = 0,
A1R5G5B5 = 1,
A8R8G8B8 = 2,
A8Y8 = 3,
A8X8Y16 = 4,
Y32 = 5,
};
enum class MonochromePatternFormat : u32 {
CGA6_M1 = 0,
LE_M1 = 1,
};
union Regs {
static constexpr std::size_t NUM_REGS = 0x258;
struct {
u32 object;
INSERT_UNION_PADDING_WORDS(0x3F);
u32 no_operation;
NotifyType notify;
INSERT_UNION_PADDING_WORDS(0x2);
u32 wait_for_idle;
INSERT_UNION_PADDING_WORDS(0xB);
u32 pm_trigger;
INSERT_UNION_PADDING_WORDS(0xF);
u32 context_dma_notify;
u32 dst_context_dma;
u32 src_context_dma;
u32 semaphore_context_dma;
INSERT_UNION_PADDING_WORDS(0x1C);
Surface dst;
CpuIndexWrap pixels_from_cpu_index_wrap;
u32 kind2d_check_enable;
Surface src;
SectorPromotion pixels_from_memory_sector_promotion;
INSERT_UNION_PADDING_WORDS(0x1);
NumTpcs num_tpcs;
u32 render_enable_addr_upper;
u32 render_enable_addr_lower;
RenderEnableMode render_enable_mode;
INSERT_UNION_PADDING_WORDS(0x4);
u32 clip_x0;
u32 clip_y0;
u32 clip_width;
u32 clip_height;
BitField<0, 1, u32> clip_enable;
BitField<0, 3, ColorKeyFormat> color_key_format;
u32 color_key;
BitField<0, 1, u32> color_key_enable;
BitField<0, 8, u32> rop;
u32 beta1;
Beta4 beta4;
Operation operation;
union {
BitField<0, 6, u32> x;
BitField<8, 6, u32> y;
} pattern_offset;
BitField<0, 2, PatternSelect> pattern_select;
INSERT_UNION_PADDING_WORDS(0xC);
struct {
INSERT_UNION_PADDING_WORDS(0x80);
Surface dst;
INSERT_UNION_PADDING_WORDS(2);
Surface src;
INSERT_UNION_PADDING_WORDS(0x15);
Operation operation;
INSERT_UNION_PADDING_WORDS(0x177);
BitField<0, 3, MonochromePatternColorFormat> color_format;
BitField<0, 1, MonochromePatternFormat> format;
u32 color0;
u32 color1;
u32 pattern0;
u32 pattern1;
} monochrome_pattern;
struct {
std::array<u32, 0x40> X8R8G8B8;
std::array<u32, 0x20> R5G6B5;
std::array<u32, 0x20> X1R5G5B5;
std::array<u32, 0x10> Y8;
} color_pattern;
INSERT_UNION_PADDING_WORDS(0x10);
struct {
u32 prim_mode;
u32 prim_color_format;
u32 prim_color;
u32 line_tie_break_bits;
INSERT_UNION_PADDING_WORDS(0x14);
u32 prim_point_xy;
INSERT_UNION_PADDING_WORDS(0x7);
std::array<Point, 0x40> prim_point;
} render_solid;
struct {
u32 data_type;
u32 color_format;
u32 index_format;
u32 mono_format;
u32 wrap;
u32 color0;
u32 color1;
u32 mono_opacity;
INSERT_UNION_PADDING_WORDS(0x6);
u32 src_width;
u32 src_height;
u32 dx_du_frac;
u32 dx_du_int;
u32 dx_dv_frac;
u32 dy_dv_int;
u32 dst_x0_frac;
u32 dst_x0_int;
u32 dst_y0_frac;
u32 dst_y0_int;
u32 data;
} pixels_from_cpu;
INSERT_UNION_PADDING_WORDS(0x3);
u32 big_endian_control;
INSERT_UNION_PADDING_WORDS(0x3);
struct {
BitField<0, 3, u32> block_shape;
BitField<0, 5, u32> corral_size;
BitField<0, 1, u32> safe_overlap;
union {
u32 raw;
BitField<0, 1, Origin> origin;
BitField<4, 1, Filter> filter;
} blit_control;
} sample_mode;
INSERT_UNION_PADDING_WORDS(0x8);
u32 blit_dst_x;
u32 blit_dst_y;
u32 blit_dst_width;
u32 blit_dst_height;
u64 blit_du_dx;
u64 blit_dv_dy;
u64 blit_src_x;
u64 blit_src_y;
INSERT_UNION_PADDING_WORDS(0x21);
};
std::array<u32, NUM_REGS> reg_array;
s32 dst_x0;
s32 dst_y0;
s32 dst_width;
s32 dst_height;
s64 du_dx;
s64 dv_dy;
s64 src_x0;
s64 src_y0;
} pixels_from_memory;
};
std::array<u32, NUM_REGS> reg_array;
} regs{};
struct Config {
Operation operation{};
Filter filter{};
Common::Rectangle<u32> src_rect;
Common::Rectangle<u32> dst_rect;
Operation operation;
Filter filter;
s32 dst_x0;
s32 dst_y0;
s32 dst_x1;
s32 dst_y1;
s32 src_x0;
s32 src_y0;
s32 src_x1;
s32 src_y1;
};
private:
@ -156,25 +303,49 @@ private:
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
void HandleSurfaceCopy();
void Blit();
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4, \
static_assert(offsetof(Fermi2D::Regs, field_name) == position, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(dst, 0x80);
ASSERT_REG_POSITION(src, 0x8C);
ASSERT_REG_POSITION(operation, 0xAB);
ASSERT_REG_POSITION(blit_control, 0x223);
ASSERT_REG_POSITION(blit_dst_x, 0x22c);
ASSERT_REG_POSITION(blit_dst_y, 0x22d);
ASSERT_REG_POSITION(blit_dst_width, 0x22e);
ASSERT_REG_POSITION(blit_dst_height, 0x22f);
ASSERT_REG_POSITION(blit_du_dx, 0x230);
ASSERT_REG_POSITION(blit_dv_dy, 0x232);
ASSERT_REG_POSITION(blit_src_x, 0x234);
ASSERT_REG_POSITION(blit_src_y, 0x236);
ASSERT_REG_POSITION(object, 0x0);
ASSERT_REG_POSITION(no_operation, 0x100);
ASSERT_REG_POSITION(notify, 0x104);
ASSERT_REG_POSITION(wait_for_idle, 0x110);
ASSERT_REG_POSITION(pm_trigger, 0x140);
ASSERT_REG_POSITION(context_dma_notify, 0x180);
ASSERT_REG_POSITION(dst_context_dma, 0x184);
ASSERT_REG_POSITION(src_context_dma, 0x188);
ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
ASSERT_REG_POSITION(dst, 0x200);
ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
ASSERT_REG_POSITION(src, 0x230);
ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
ASSERT_REG_POSITION(num_tpcs, 0x260);
ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
ASSERT_REG_POSITION(clip_x0, 0x280);
ASSERT_REG_POSITION(clip_y0, 0x284);
ASSERT_REG_POSITION(clip_width, 0x288);
ASSERT_REG_POSITION(clip_height, 0x28c);
ASSERT_REG_POSITION(clip_enable, 0x290);
ASSERT_REG_POSITION(color_key_format, 0x294);
ASSERT_REG_POSITION(color_key, 0x298);
ASSERT_REG_POSITION(rop, 0x2A0);
ASSERT_REG_POSITION(beta1, 0x2A4);
ASSERT_REG_POSITION(beta4, 0x2A8);
ASSERT_REG_POSITION(operation, 0x2AC);
ASSERT_REG_POSITION(pattern_offset, 0x2B0);
ASSERT_REG_POSITION(pattern_select, 0x2B4);
ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
ASSERT_REG_POSITION(color_pattern, 0x300);
ASSERT_REG_POSITION(render_solid, 0x580);
ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
ASSERT_REG_POSITION(big_endian_control, 0x870);
ASSERT_REG_POSITION(pixels_from_memory, 0x880);
#undef ASSERT_REG_POSITION

View File

@ -58,24 +58,6 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun
}
}
Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value();
ASSERT(cbuf_mask[regs.tex_cb_index]);
const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index];
ASSERT(texinfo.Address() != 0);
const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle);
ASSERT(address < texinfo.Address() + texinfo.size);
const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)};
return GetTextureInfo(tex_handle);
}
Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const {
return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
}
u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
ASSERT(stage == ShaderType::Compute);
const auto& buffer = launch_description.const_buffer_config[const_buffer];
@ -98,9 +80,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}

View File

@ -209,11 +209,6 @@ public:
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
Texture::FullTextureInfo GetTexture(std::size_t offset) const;
/// Given a texture handle, returns the TSC and TIC entries.
Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;

View File

@ -14,8 +14,8 @@
namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
: system{system}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
: system{system_}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::~KeplerMemory() = default;

View File

@ -35,7 +35,7 @@ namespace Tegra::Engines {
class KeplerMemory final : public EngineInterface {
public:
KeplerMemory(Core::System& system, MemoryManager& memory_manager);
explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
~KeplerMemory();
/// Write the value to the register identified by method.

View File

@ -2,7 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cinttypes>
#include <cstring>
#include <optional>
#include "common/assert.h"
@ -124,6 +123,116 @@ void Maxwell3D::InitializeRegisterDefaults() {
mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}
void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
macro_params.insert(macro_params.end(), base_start, base_start + amount);
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
}
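// For illustration: macro methods come in register/argument pairs, so a write
// to an even method such as 0xE24 starts (or selects) a macro, while a write
// to the odd 0xE25 would hit its ARGS register and trips the assert above if
// no macro is executing.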
u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
// Keep track of the register value in shadow_state when requested.
const auto control = shadow_state.shadow_ram_control;
if (control == Regs::ShadowRamControl::Track ||
control == Regs::ShadowRamControl::TrackWithFilter) {
shadow_state.reg_array[method] = argument;
return argument;
}
if (control == Regs::ShadowRamControl::Replay) {
return shadow_state.reg_array[method];
}
return argument;
}
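// Sketch of the three behaviours, assuming a register that previously recorded
// 0x5 while tracking was enabled:
//   Track / TrackWithFilter: a new write of 0x7 is recorded and 0x7 is used.
//   Replay:                  the write of 0x7 is ignored and 0x5 is used.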
void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
if (regs.reg_array[method] == argument) {
return;
}
regs.reg_array[method] = argument;
for (const auto& table : dirty.tables) {
dirty.flags[table[method]] = true;
}
}
void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
bool is_last_call) {
switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle):
return rasterizer->WaitForIdle();
case MAXWELL3D_REG_INDEX(shadow_ram_control):
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
return;
case MAXWELL3D_REG_INDEX(macros.data):
return macro_engine->AddCode(regs.macros.upload_address, argument);
case MAXWELL3D_REG_INDEX(macros.bind):
return ProcessMacroBind(argument);
case MAXWELL3D_REG_INDEX(firmware[4]):
return ProcessFirmwareCall4();
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
return StartCBData(method);
case MAXWELL3D_REG_INDEX(cb_bind[0]):
return ProcessCBBind(0);
case MAXWELL3D_REG_INDEX(cb_bind[1]):
return ProcessCBBind(1);
case MAXWELL3D_REG_INDEX(cb_bind[2]):
return ProcessCBBind(2);
case MAXWELL3D_REG_INDEX(cb_bind[3]):
return ProcessCBBind(3);
case MAXWELL3D_REG_INDEX(cb_bind[4]):
return ProcessCBBind(4);
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
return DrawArrays();
case MAXWELL3D_REG_INDEX(clear_buffers):
return ProcessClearBuffers();
case MAXWELL3D_REG_INDEX(query.query_get):
return ProcessQueryGet();
case MAXWELL3D_REG_INDEX(condition.mode):
return ProcessQueryCondition();
case MAXWELL3D_REG_INDEX(counter_reset):
return ProcessCounterReset();
case MAXWELL3D_REG_INDEX(sync_info):
return ProcessSyncPoint();
case MAXWELL3D_REG_INDEX(exec_upload):
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call);
if (is_last_call) {
OnMemoryWrite();
}
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
return rasterizer->FragmentBarrier();
case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
return rasterizer->TiledCacheBarrier();
}
}
void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
// Reset the current macro.
executing_macro = 0;
@ -157,142 +266,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
// Methods after 0xE00 are special; they are actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
// We're trying to execute a macro
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
macro_params.push_back(method_argument);
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
ProcessMacro(method, &method_argument, 1, is_last_call);
return;
}
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Maxwell3D register, increase the size of the Regs structure");
u32 arg = method_argument;
// Keep track of the register value in shadow_state when requested.
if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track ||
shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) {
shadow_state.reg_array[method] = arg;
} else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) {
arg = shadow_state.reg_array[method];
}
if (regs.reg_array[method] != arg) {
regs.reg_array[method] = arg;
for (const auto& table : dirty.tables) {
dirty.flags[table[method]] = true;
}
}
switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle): {
rasterizer->WaitForIdle();
break;
}
case MAXWELL3D_REG_INDEX(shadow_ram_control): {
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument);
break;
}
case MAXWELL3D_REG_INDEX(macros.data): {
macro_engine->AddCode(regs.macros.upload_address, arg);
break;
}
case MAXWELL3D_REG_INDEX(macros.bind): {
ProcessMacroBind(arg);
break;
}
case MAXWELL3D_REG_INDEX(firmware[4]): {
ProcessFirmwareCall4();
break;
}
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
StartCBData(method);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[0]): {
ProcessCBBind(0);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[1]): {
ProcessCBBind(1);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[2]): {
ProcessCBBind(2);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[3]): {
ProcessCBBind(3);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[4]): {
ProcessCBBind(4);
break;
}
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): {
DrawArrays();
break;
}
case MAXWELL3D_REG_INDEX(clear_buffers): {
ProcessClearBuffers();
break;
}
case MAXWELL3D_REG_INDEX(query.query_get): {
ProcessQueryGet();
break;
}
case MAXWELL3D_REG_INDEX(condition.mode): {
ProcessQueryCondition();
break;
}
case MAXWELL3D_REG_INDEX(counter_reset): {
ProcessCounterReset();
break;
}
case MAXWELL3D_REG_INDEX(sync_info): {
ProcessSyncPoint();
break;
}
case MAXWELL3D_REG_INDEX(exec_upload): {
upload_state.ProcessExec(regs.exec_upload.linear != 0);
break;
}
case MAXWELL3D_REG_INDEX(data_upload): {
upload_state.ProcessData(arg, is_last_call);
if (is_last_call) {
OnMemoryWrite();
}
break;
}
default:
break;
}
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
ProcessMethodCall(method, argument, method_argument, is_last_call);
}
void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
@ -300,23 +283,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
// Methods after 0xE00 are special; they are actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
// We're trying to execute a macro
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
for (std::size_t i = 0; i < amount; i++) {
macro_params.push_back(base_start[i]);
}
// Call the macro when there are no more parameters in the command buffer
if (amount == methods_pending) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
ProcessMacro(method, base_start, amount, amount == methods_pending);
return;
}
switch (method) {
@ -335,15 +302,14 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
ProcessCBMultiData(method, base_start, amount);
break;
}
default: {
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
}
break;
}
}
@ -396,7 +362,7 @@ void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) {
}
void Maxwell3D::FlushMMEInlineDraw() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
@ -541,8 +507,7 @@ void Maxwell3D::ProcessCounterReset() {
rasterizer->ResetCounter(QueryType::SamplesPassed);
break;
default:
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
static_cast<int>(regs.counter_reset));
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.counter_reset);
break;
}
}
@ -557,7 +522,7 @@ void Maxwell3D::ProcessSyncPoint() {
}
void Maxwell3D::DrawArrays() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
@ -595,12 +560,12 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
return 0;
case Regs::QuerySelect::SamplesPassed:
// Deferred.
rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed,
system.GPU().GetTicks());
return std::nullopt;
default:
LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
regs.query.query_get.select.Value());
return 1;
}
}
@ -677,7 +642,7 @@ void Maxwell3D::FinishCBData() {
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};
Texture::TICEntry tic_entry;
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
@ -686,43 +651,19 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
}
Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};
Texture::TSCEntry tsc_entry;
memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
return tsc_entry;
}
Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const {
return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
}
Texture::FullTextureInfo Maxwell3D::GetStageTexture(ShaderType stage, std::size_t offset) const {
const auto stage_index = static_cast<std::size_t>(stage);
const auto& shader = state.shader_stages[stage_index];
const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
const GPUVAddr tex_info_address =
tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
return GetTextureInfo(tex_handle);
}
u32 Maxwell3D::GetRegisterValue(u32 method) const {
ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
return regs.reg_array[method];
}
void Maxwell3D::ProcessClearBuffers() {
ASSERT(regs.clear_buffers.R == regs.clear_buffers.G &&
regs.clear_buffers.R == regs.clear_buffers.B &&
regs.clear_buffers.R == regs.clear_buffers.A);
rasterizer->Clear();
}
@ -730,9 +671,7 @@ u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offse
ASSERT(stage != ShaderType::Compute);
const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
const auto& buffer = shader_stage.const_buffers[const_buffer];
u32 result;
std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32));
return result;
return memory_manager.Read<u32>(buffer.address + offset);
}
SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
@ -750,9 +689,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}

View File

@ -438,16 +438,6 @@ public:
DecrWrapOGL = 0x8508,
};
enum class MemoryLayout : u32 {
Linear = 0,
BlockLinear = 1,
};
enum class InvMemoryLayout : u32 {
BlockLinear = 0,
Linear = 1,
};
enum class CounterReset : u32 {
SampleCnt = 0x01,
Unk02 = 0x02,
@ -589,21 +579,31 @@ public:
NegativeW = 7,
};
enum class SamplerIndex : u32 {
Independently = 0,
ViaHeaderIndex = 1,
};
struct TileMode {
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
BitField<12, 1, u32> is_pitch_linear;
BitField<16, 1, u32> is_3d;
};
};
static_assert(sizeof(TileMode) == 4);
struct RenderTargetConfig {
u32 address_high;
u32 address_low;
u32 width;
u32 height;
Tegra::RenderTargetFormat format;
TileMode tile_mode;
union {
BitField<0, 3, u32> block_width;
BitField<4, 3, u32> block_height;
BitField<8, 3, u32> block_depth;
BitField<12, 1, InvMemoryLayout> type;
BitField<16, 1, u32> is_3d;
} memory_layout;
union {
BitField<0, 16, u32> layers;
BitField<0, 16, u32> depth;
BitField<16, 1, u32> volume;
};
u32 layer_stride;
@ -755,7 +755,11 @@ public:
u32 data_upload;
INSERT_UNION_PADDING_WORDS(0x44);
INSERT_UNION_PADDING_WORDS(0x16);
u32 force_early_fragment_tests;
INSERT_UNION_PADDING_WORDS(0x2D);
struct {
union {
@ -828,7 +832,11 @@ public:
u32 patch_vertices;
INSERT_UNION_PADDING_WORDS(0xC);
INSERT_UNION_PADDING_WORDS(0x4);
u32 fragment_barrier;
INSERT_UNION_PADDING_WORDS(0x7);
std::array<ScissorTest, NumViewports> scissor_test;
@ -838,7 +846,15 @@ public:
u32 stencil_back_mask;
u32 stencil_back_func_mask;
INSERT_UNION_PADDING_WORDS(0xC);
INSERT_UNION_PADDING_WORDS(0x5);
u32 invalidate_texture_data_cache;
INSERT_UNION_PADDING_WORDS(0x1);
u32 tiled_cache_barrier;
INSERT_UNION_PADDING_WORDS(0x4);
u32 color_mask_common;
@ -862,12 +878,7 @@ public:
u32 address_high;
u32 address_low;
Tegra::DepthFormat format;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
BitField<20, 1, InvMemoryLayout> type;
} memory_layout;
TileMode tile_mode;
u32 layer_stride;
GPUVAddr Address() const {
@ -876,7 +887,18 @@ public:
}
} zeta;
INSERT_UNION_PADDING_WORDS(0x41);
struct {
union {
BitField<0, 16, u32> x;
BitField<16, 16, u32> width;
};
union {
BitField<0, 16, u32> y;
BitField<16, 16, u32> height;
};
} render_area;
INSERT_UNION_PADDING_WORDS(0x3F);
union {
BitField<0, 4, u32> stencil;
@ -917,7 +939,7 @@ public:
BitField<25, 3, u32> map_7;
};
u32 GetMap(std::size_t index) const {
u32 Map(std::size_t index) const {
const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3,
map_4, map_5, map_6, map_7};
ASSERT(index < maps.size());
@ -930,11 +952,13 @@ public:
u32 zeta_width;
u32 zeta_height;
union {
BitField<0, 16, u32> zeta_layers;
BitField<0, 16, u32> zeta_depth;
BitField<16, 1, u32> zeta_volume;
};
INSERT_UNION_PADDING_WORDS(0x26);
SamplerIndex sampler_index;
INSERT_UNION_PADDING_WORDS(0x25);
u32 depth_test_enable;
@ -960,6 +984,7 @@ public:
float b;
float a;
} blend_color;
INSERT_UNION_PADDING_WORDS(0x4);
struct {
@ -997,7 +1022,12 @@ public:
float line_width_smooth;
float line_width_aliased;
INSERT_UNION_PADDING_WORDS(0x1F);
INSERT_UNION_PADDING_WORDS(0x1B);
u32 invalidate_sampler_cache_no_wfi;
u32 invalidate_texture_header_cache_no_wfi;
INSERT_UNION_PADDING_WORDS(0x2);
u32 vb_element_base;
u32 vb_base_instance;
@ -1041,13 +1071,13 @@ public:
} condition;
struct {
u32 tsc_address_high;
u32 tsc_address_low;
u32 tsc_limit;
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr TSCAddress() const {
return static_cast<GPUVAddr>(
(static_cast<GPUVAddr>(tsc_address_high) << 32) | tsc_address_low);
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tsc;
@ -1058,13 +1088,13 @@ public:
u32 line_smooth_enable;
struct {
u32 tic_address_high;
u32 tic_address_low;
u32 tic_limit;
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr TICAddress() const {
return static_cast<GPUVAddr>(
(static_cast<GPUVAddr>(tic_address_high) << 32) | tic_address_low);
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tic;
@ -1393,12 +1423,6 @@ public:
void FlushMMEInlineDraw();
/// Given a texture handle, returns the TSC and TIC entries.
Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
/// Returns the texture information for a specific texture in a specific shader stage.
Texture::FullTextureInfo GetStageTexture(ShaderType stage, std::size_t offset) const;
u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
@ -1461,38 +1485,13 @@ public:
private:
void InitializeRegisterDefaults();
Core::System& system;
MemoryManager& memory_manager;
void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call);
VideoCore::RasterizerInterface* rasterizer = nullptr;
u32 ProcessShadowRam(u32 method, u32 argument);
/// Start offsets of each macro in macro_memory
std::array<u32, 0x80> macro_positions = {};
void ProcessDirtyRegisters(u32 method, u32 argument);
std::array<bool, Regs::NUM_REGS> mme_inline{};
/// Macro method that is currently being executed / being fed parameters.
u32 executing_macro = 0;
/// Parameters that have been submitted to the macro call so far.
std::vector<u32> macro_params;
/// Interpreter for the macro codes uploaded to the GPU.
std::unique_ptr<MacroEngine> macro_engine;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct {
std::array<std::array<u32, 0x4000>, 16> buffer;
u32 current{null_cb_data};
u32 id{null_cb_data};
u32 start_pos{};
u32 counter{};
} cb_data_state;
Upload::State upload_state;
bool execute_on{true};
std::array<u8, Regs::NUM_REGS> dirty_pointers{};
void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
@ -1502,8 +1501,8 @@ private:
/**
* Call a macro on this engine.
*
* @param method Method to call
* @param num_parameters Number of arguments
* @param parameters Arguments to the method call
*/
void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
@ -1552,6 +1551,38 @@ private:
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult();
Core::System& system;
MemoryManager& memory_manager;
VideoCore::RasterizerInterface* rasterizer = nullptr;
/// Start offsets of each macro in macro_memory
std::array<u32, 0x80> macro_positions{};
std::array<bool, Regs::NUM_REGS> mme_inline{};
/// Macro method that is currently being executed / being fed parameters.
u32 executing_macro = 0;
/// Parameters that have been submitted to the macro call so far.
std::vector<u32> macro_params;
/// Interpreter for the macro codes uploaded to the GPU.
std::unique_ptr<MacroEngine> macro_engine;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct CBDataState {
std::array<std::array<u32, 0x4000>, 16> buffer;
u32 current{null_cb_data};
u32 id{null_cb_data};
u32 start_pos{};
u32 counter{};
};
CBDataState cb_data_state;
Upload::State upload_state;
bool execute_on{true};
};
#define ASSERT_REG_POSITION(field_name, position) \
@ -1564,6 +1595,7 @@ ASSERT_REG_POSITION(shadow_ram_control, 0x49);
ASSERT_REG_POSITION(upload, 0x60);
ASSERT_REG_POSITION(exec_upload, 0x6C);
ASSERT_REG_POSITION(data_upload, 0x6D);
ASSERT_REG_POSITION(force_early_fragment_tests, 0x84);
ASSERT_REG_POSITION(sync_info, 0xB2);
ASSERT_REG_POSITION(tess_mode, 0xC8);
ASSERT_REG_POSITION(tess_level_outer, 0xC9);
@ -1586,10 +1618,13 @@ ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
ASSERT_REG_POSITION(patch_vertices, 0x373);
ASSERT_REG_POSITION(fragment_barrier, 0x378);
ASSERT_REG_POSITION(scissor_test, 0x380);
ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
ASSERT_REG_POSITION(invalidate_texture_data_cache, 0x3DD);
ASSERT_REG_POSITION(tiled_cache_barrier, 0x3DF);
ASSERT_REG_POSITION(color_mask_common, 0x3E4);
ASSERT_REG_POSITION(depth_bounds, 0x3E7);
ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
@ -1597,6 +1632,7 @@ ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED);
ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE);
ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF);
ASSERT_REG_POSITION(zeta, 0x3F8);
ASSERT_REG_POSITION(render_area, 0x3FD);
ASSERT_REG_POSITION(clear_flags, 0x43E);
ASSERT_REG_POSITION(fill_rectangle, 0x44F);
ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
@ -1605,7 +1641,8 @@ ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E);
ASSERT_REG_POSITION(rt_control, 0x487);
ASSERT_REG_POSITION(zeta_width, 0x48a);
ASSERT_REG_POSITION(zeta_height, 0x48b);
ASSERT_REG_POSITION(zeta_layers, 0x48c);
ASSERT_REG_POSITION(zeta_depth, 0x48c);
ASSERT_REG_POSITION(sampler_index, 0x48D);
ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@ -1629,6 +1666,8 @@ ASSERT_REG_POSITION(frag_color_clamp, 0x4EA);
ASSERT_REG_POSITION(screen_y_control, 0x4EB);
ASSERT_REG_POSITION(line_width_smooth, 0x4EC);
ASSERT_REG_POSITION(line_width_aliased, 0x4ED);
ASSERT_REG_POSITION(invalidate_sampler_cache_no_wfi, 0x509);
ASSERT_REG_POSITION(invalidate_texture_header_cache_no_wfi, 0x50A);
ASSERT_REG_POSITION(vb_element_base, 0x50D);
ASSERT_REG_POSITION(vb_base_instance, 0x50E);
ASSERT_REG_POSITION(clip_distance_enabled, 0x544);

View File

@ -16,8 +16,10 @@ namespace Tegra::Engines {
using namespace Texture;
MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
: system{system}, memory_manager{memory_manager} {}
MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
: system{system_}, memory_manager{memory_manager_} {}
MaxwellDMA::~MaxwellDMA() = default;
void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
@ -94,6 +96,7 @@ void MaxwellDMA::CopyPitchToPitch() {
}
void MaxwellDMA::CopyBlockLinearToPitch() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
@ -114,8 +117,6 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
const u32 block_depth = src_params.block_size.depth;
const size_t src_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
const size_t src_layer_size =
CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
@ -135,6 +136,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
}
void MaxwellDMA::CopyPitchToBlockLinear() {
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
const auto& dst_params = regs.dst_params;
const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
const u32 width = dst_params.width;

View File

@ -72,11 +72,13 @@ public:
struct RenderEnable {
enum class Mode : u32 {
FALSE = 0,
TRUE = 1,
CONDITIONAL = 2,
RENDER_IF_EQUAL = 3,
RENDER_IF_NOT_EQUAL = 4,
// Note: This uses Pascal case in order to avoid the identifiers
// FALSE and TRUE, which are reserved on Darwin.
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
PackedGPUVAddr address;
@ -185,8 +187,8 @@ public:
};
static_assert(sizeof(RemapConst) == 12);
explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
~MaxwellDMA() = default;
explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_);
~MaxwellDMA();
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;

View File

@ -32,31 +32,31 @@ struct Register {
constexpr Register() = default;
constexpr Register(u64 value) : value(value) {}
constexpr Register(u64 value_) : value(value_) {}
constexpr operator u64() const {
[[nodiscard]] constexpr operator u64() const {
return value;
}
template <typename T>
constexpr u64 operator-(const T& oth) const {
[[nodiscard]] constexpr u64 operator-(const T& oth) const {
return value - oth;
}
template <typename T>
constexpr u64 operator&(const T& oth) const {
[[nodiscard]] constexpr u64 operator&(const T& oth) const {
return value & oth;
}
constexpr u64 operator&(const Register& oth) const {
[[nodiscard]] constexpr u64 operator&(const Register& oth) const {
return value & oth.value;
}
constexpr u64 operator~() const {
[[nodiscard]] constexpr u64 operator~() const {
return ~value;
}
u64 GetSwizzledIndex(u64 elem) const {
[[nodiscard]] u64 GetSwizzledIndex(u64 elem) const {
elem = (value + elem) & 3;
return (value & ~3) + elem;
}
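// Worked example (values chosen for illustration): for a register value of 5
// and elem == 2, (5 + 2) & 3 == 3 and (5 & ~3) + 3 == 7, so the access lands
// on R7 inside the aligned group R4-R7.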
@ -75,7 +75,7 @@ enum class AttributeSize : u64 {
union Attribute {
Attribute() = default;
constexpr explicit Attribute(u64 value) : value(value) {}
constexpr explicit Attribute(u64 value_) : value(value_) {}
enum class Index : u64 {
LayerViewportPointSize = 6,
@ -107,7 +107,7 @@ union Attribute {
BitField<31, 1, u64> patch;
BitField<47, 3, AttributeSize> size;
bool IsPhysical() const {
[[nodiscard]] bool IsPhysical() const {
return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0;
}
} fmt20;
@ -124,7 +124,7 @@ union Attribute {
union Sampler {
Sampler() = default;
constexpr explicit Sampler(u64 value) : value(value) {}
constexpr explicit Sampler(u64 value_) : value(value_) {}
enum class Index : u64 {
Sampler_0 = 8,
@ -137,7 +137,7 @@ union Sampler {
union Image {
Image() = default;
constexpr explicit Image(u64 value) : value{value} {}
constexpr explicit Image(u64 value_) : value{value_} {}
BitField<36, 13, u64> index;
u64 value;
@ -505,14 +505,14 @@ struct IpaMode {
IpaInterpMode interpolation_mode;
IpaSampleMode sampling_mode;
bool operator==(const IpaMode& a) const {
[[nodiscard]] bool operator==(const IpaMode& a) const {
return std::tie(interpolation_mode, sampling_mode) ==
std::tie(a.interpolation_mode, a.sampling_mode);
}
bool operator!=(const IpaMode& a) const {
[[nodiscard]] bool operator!=(const IpaMode& a) const {
return !operator==(a);
}
bool operator<(const IpaMode& a) const {
[[nodiscard]] bool operator<(const IpaMode& a) const {
return std::tie(interpolation_mode, sampling_mode) <
std::tie(a.interpolation_mode, a.sampling_mode);
}
@ -658,10 +658,10 @@ union Instruction {
return *this;
}
constexpr Instruction(u64 value) : value{value} {}
constexpr Instruction(u64 value_) : value{value_} {}
constexpr Instruction(const Instruction& instr) : value(instr.value) {}
constexpr bool Bit(u64 offset) const {
[[nodiscard]] constexpr bool Bit(u64 offset) const {
return ((value >> offset) & 1) != 0;
}
@ -746,34 +746,34 @@ union Instruction {
BitField<28, 8, u64> imm_lut28;
BitField<48, 8, u64> imm_lut48;
u32 GetImmLut28() const {
[[nodiscard]] u32 GetImmLut28() const {
return static_cast<u32>(imm_lut28);
}
u32 GetImmLut48() const {
[[nodiscard]] u32 GetImmLut48() const {
return static_cast<u32>(imm_lut48);
}
} lop3;
u16 GetImm20_16() const {
[[nodiscard]] u16 GetImm20_16() const {
return static_cast<u16>(imm20_16);
}
u32 GetImm20_19() const {
[[nodiscard]] u32 GetImm20_19() const {
u32 imm{static_cast<u32>(imm20_19)};
imm <<= 12;
imm |= negate_imm ? 0x80000000 : 0;
return imm;
}
u32 GetImm20_32() const {
[[nodiscard]] u32 GetImm20_32() const {
return static_cast<u32>(imm20_32);
}
s32 GetSignedImm20_20() const {
u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19));
[[nodiscard]] s32 GetSignedImm20_20() const {
const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19));
// Sign extend the 20-bit value.
u32 mask = 1U << (20 - 1);
const auto mask = 1U << (20 - 1);
return static_cast<s32>((immediate ^ mask) - mask);
}
} alu;
@ -857,7 +857,7 @@ union Instruction {
BitField<56, 1, u64> second_negate;
BitField<30, 9, u64> second;
u32 PackImmediates() const {
[[nodiscard]] u32 PackImmediates() const {
// Immediates are half floats shifted.
constexpr u32 imm_shift = 6;
return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift)));
@ -1033,7 +1033,7 @@ union Instruction {
BitField<28, 2, AtomicType> type;
BitField<30, 22, s64> offset;
s32 GetImmediateOffset() const {
[[nodiscard]] s32 GetImmediateOffset() const {
return static_cast<s32>(offset << 2);
}
} atoms;
@ -1215,7 +1215,7 @@ union Instruction {
BitField<39, 4, u64> rounding;
// H0, H1 extract for F16 missing
BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
F2fRoundingOp GetRoundingMode() const {
[[nodiscard]] F2fRoundingOp GetRoundingMode() const {
constexpr u64 rounding_mask = 0x0B;
return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask);
}
@ -1239,15 +1239,15 @@ union Instruction {
BitField<54, 1, u64> aoffi_flag;
BitField<55, 3, TextureProcessMode> process_mode;
bool IsComponentEnabled(std::size_t component) const {
return ((1ull << component) & component_mask) != 0;
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1271,15 +1271,15 @@ union Instruction {
BitField<36, 1, u64> aoffi_flag;
BitField<37, 3, TextureProcessMode> process_mode;
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1299,7 +1299,7 @@ union Instruction {
BitField<31, 4, u64> component_mask;
BitField<49, 1, u64> nodep_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NODEP:
return nodep_flag != 0;
@ -1309,7 +1309,7 @@ union Instruction {
return false;
}
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
} txq;
@ -1321,11 +1321,11 @@ union Instruction {
BitField<35, 1, u64> ndv_flag;
BitField<49, 1, u64> nodep_flag;
bool IsComponentEnabled(std::size_t component) const {
return ((1ull << component) & component_mask) != 0;
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
return ((1ULL << component) & component_mask) != 0;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return (ndv_flag != 0);
@ -1347,7 +1347,7 @@ union Instruction {
BitField<54, 2, u64> offset_mode;
BitField<56, 2, u64> component;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return ndv_flag != 0;
@ -1373,7 +1373,7 @@ union Instruction {
BitField<33, 2, u64> offset_mode;
BitField<37, 2, u64> component;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::NDV:
return ndv_flag != 0;
@ -1399,7 +1399,7 @@ union Instruction {
BitField<52, 2, u64> component;
BitField<55, 1, u64> fp16_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return dc_flag != 0;
@ -1422,24 +1422,27 @@ union Instruction {
BitField<53, 4, u64> texture_info;
BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
[[nodiscard]] TextureType GetTextureType() const {
// The TEXS instruction has a weird encoding for the texture type.
if (texture_info == 0)
if (texture_info == 0) {
return TextureType::Texture1D;
if (texture_info >= 1 && texture_info <= 9)
}
if (texture_info >= 1 && texture_info <= 9) {
return TextureType::Texture2D;
if (texture_info >= 10 && texture_info <= 11)
}
if (texture_info >= 10 && texture_info <= 11) {
return TextureType::Texture3D;
if (texture_info >= 12 && texture_info <= 13)
}
if (texture_info >= 12 && texture_info <= 13) {
return TextureType::TextureCube;
}
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}",
static_cast<u32>(texture_info.Value()));
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value());
UNREACHABLE();
return TextureType::Texture1D;
}
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
switch (texture_info) {
case 0:
case 2:
@ -1458,7 +1461,7 @@ union Instruction {
return TextureProcessMode::None;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::DC:
return (texture_info >= 4 && texture_info <= 6) || texture_info == 9;
@ -1470,16 +1473,16 @@ union Instruction {
return false;
}
bool IsArrayTexture() const {
[[nodiscard]] bool IsArrayTexture() const {
// TEXS only supports Texture2D arrays.
return texture_info >= 7 && texture_info <= 9;
}
bool HasTwoDestinations() const {
[[nodiscard]] bool HasTwoDestinations() const {
return gpr28.Value() != Register::ZeroIndex;
}
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
{},
{0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
@ -1506,7 +1509,7 @@ union Instruction {
BitField<54, 1, u64> cl;
BitField<55, 1, u64> process_mode;
TextureProcessMode GetTextureProcessMode() const {
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL;
}
} tld;
@ -1516,7 +1519,7 @@ union Instruction {
BitField<53, 4, u64> texture_info;
BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
[[nodiscard]] TextureType GetTextureType() const {
// The TLDS instruction has a weird encoding for the texture type.
if (texture_info <= 1) {
return TextureType::Texture1D;
@ -1529,19 +1532,19 @@ union Instruction {
return TextureType::Texture3D;
}
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}",
static_cast<u32>(texture_info.Value()));
LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value());
UNREACHABLE();
return TextureType::Texture1D;
}
TextureProcessMode GetTextureProcessMode() const {
if (texture_info == 1 || texture_info == 5 || texture_info == 12)
[[nodiscard]] TextureProcessMode GetTextureProcessMode() const {
if (texture_info == 1 || texture_info == 5 || texture_info == 12) {
return TextureProcessMode::LL;
}
return TextureProcessMode::LZ;
}
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::AOFFI:
return texture_info == 12 || texture_info == 4;
@ -1555,7 +1558,7 @@ union Instruction {
return false;
}
bool IsArrayTexture() const {
[[nodiscard]] bool IsArrayTexture() const {
// TLDS only supports Texture2D arrays.
return texture_info == 8;
}
@ -1567,7 +1570,7 @@ union Instruction {
BitField<35, 1, u64> aoffi_flag;
BitField<49, 1, u64> nodep_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
[[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
case TextureMiscMode::AOFFI:
return aoffi_flag != 0;
@ -1591,7 +1594,7 @@ union Instruction {
BitField<20, 3, StoreType> store_data_layout;
BitField<20, 4, u64> component_mask_selector;
bool IsComponentEnabled(std::size_t component) const {
[[nodiscard]] bool IsComponentEnabled(std::size_t component) const {
ASSERT(mode == SurfaceDataMode::P);
constexpr u8 R = 0b0001;
constexpr u8 G = 0b0010;
@ -1604,7 +1607,7 @@ union Instruction {
return std::bitset<4>{mask.at(component_mask_selector)}.test(component);
}
StoreType GetStoreDataLayout() const {
[[nodiscard]] StoreType GetStoreDataLayout() const {
ASSERT(mode == SurfaceDataMode::D_BA);
return store_data_layout;
}
@ -1622,14 +1625,15 @@ union Instruction {
BitField<20, 24, u64> target;
BitField<5, 1, u64> constant_buffer;
s32 GetBranchTarget() const {
[[nodiscard]] s32 GetBranchTarget() const {
// Sign extend the branch target offset
u32 mask = 1U << (24 - 1);
u32 value = static_cast<u32>(target);
const auto mask = 1U << (24 - 1);
const auto target_value = static_cast<u32>(target);
constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction));
// The branch offset is relative to the next instruction and is stored in bytes, so
// divide it by the size of an instruction and add 1 to it.
return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) +
1;
return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1;
}
} bra;
@ -1637,14 +1641,15 @@ union Instruction {
BitField<20, 24, u64> target;
BitField<5, 1, u64> constant_buffer;
s32 GetBranchExtend() const {
[[nodiscard]] s32 GetBranchExtend() const {
// Sign extend the branch target offset
u32 mask = 1U << (24 - 1);
u32 value = static_cast<u32>(target);
const auto mask = 1U << (24 - 1);
const auto target_value = static_cast<u32>(target);
constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction));
// The branch offset is relative to the next instruction and is stored in bytes, so
// divide it by the size of an instruction and add 1 to it.
return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) +
1;
return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1;
}
} brx;
@ -1697,7 +1702,7 @@ union Instruction {
BitField<50, 1, u64> is_op_b_register;
BitField<51, 3, VmnmxOperation> operation;
VmnmxType SourceFormatA() const {
[[nodiscard]] VmnmxType SourceFormatA() const {
switch (src_format_a) {
case 0b11:
return VmnmxType::Bits32;
@ -1708,7 +1713,7 @@ union Instruction {
}
}
VmnmxType SourceFormatB() const {
[[nodiscard]] VmnmxType SourceFormatB() const {
switch (src_format_b) {
case 0b11:
return VmnmxType::Bits32;
@ -1739,7 +1744,7 @@ union Instruction {
BitField<20, 14, u64> shifted_offset;
BitField<34, 5, u64> index;
u64 GetOffset() const {
[[nodiscard]] u64 GetOffset() const {
return shifted_offset * 4;
}
} cbuf34;
@ -1748,7 +1753,7 @@ union Instruction {
BitField<20, 16, s64> offset;
BitField<36, 5, u64> index;
s64 GetOffset() const {
[[nodiscard]] s64 GetOffset() const {
return offset;
}
} cbuf36;
@ -1893,6 +1898,7 @@ public:
ICMP_IMM,
FCMP_RR,
FCMP_RC,
FCMP_IMMR,
MUFU, // Multi-Function Operator
RRO_C, // Range Reduction Operator
RRO_R,
@ -1996,29 +2002,29 @@ public:
/// Returns whether an opcode has an execution predicate field or not (i.e., whether it can be
/// conditionally executed).
static bool IsPredicatedInstruction(Id opcode) {
[[nodiscard]] static bool IsPredicatedInstruction(Id opcode) {
// TODO(Subv): Add the rest of unpredicated instructions.
return opcode != Id::SSY && opcode != Id::PBK;
}
class Matcher {
public:
constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type)
: name{name}, mask{mask}, expected{expected}, id{id}, type{type} {}
constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_)
: name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {}
constexpr const char* GetName() const {
[[nodiscard]] constexpr const char* GetName() const {
return name;
}
constexpr u16 GetMask() const {
[[nodiscard]] constexpr u16 GetMask() const {
return mask;
}
constexpr Id GetId() const {
[[nodiscard]] constexpr Id GetId() const {
return id;
}
constexpr Type GetType() const {
[[nodiscard]] constexpr Type GetType() const {
return type;
}
@ -2027,7 +2033,7 @@ public:
* @param instruction The instruction to test
* @returns true if the given instruction matches.
*/
constexpr bool Matches(u16 instruction) const {
[[nodiscard]] constexpr bool Matches(u16 instruction) const {
return (instruction & mask) == expected;
}
@ -2039,7 +2045,8 @@ public:
Type type;
};
static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) {
using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>;
[[nodiscard]] static DecodeResult Decode(Instruction instr) {
static const auto table{GetDecodeTable()};
const auto matches_instruction = [instr](const auto& matcher) {
@ -2061,7 +2068,7 @@ private:
* A '0' in a bitstring indicates that a zero must be present at that bit position.
* A '1' in a bitstring indicates that a one must be present at that bit position.
*/
static constexpr auto GetMaskAndExpect(const char* const bitstring) {
[[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) {
u16 mask = 0, expect = 0;
for (std::size_t i = 0; i < opcode_bitsize; i++) {
const std::size_t bit_position = opcode_bitsize - i - 1;
@ -2083,14 +2090,14 @@ private:
public:
/// Creates a matcher that can match and parse instructions based on bitstring.
static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type,
const char* const name) {
[[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op,
Type type, const char* const name) {
const auto [mask, expected] = GetMaskAndExpect(bitstring);
return Matcher(name, mask, expected, op, type);
}
};
static std::vector<Matcher> GetDecodeTable() {
[[nodiscard]] static std::vector<Matcher> GetDecodeTable() {
std::vector<Matcher> table = {
#define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name)
INST("111000110011----", Id::KIL, Type::Flow, "KIL"),
@ -2205,6 +2212,7 @@ private:
INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),

View File

@ -41,30 +41,30 @@ struct Header {
BitField<26, 1, u32> does_load_or_store;
BitField<27, 1, u32> does_fp64;
BitField<28, 4, u32> stream_out_mask;
} common0{};
} common0;
union {
BitField<0, 24, u32> shader_local_memory_low_size;
BitField<24, 8, u32> per_patch_attribute_count;
} common1{};
} common1;
union {
BitField<0, 24, u32> shader_local_memory_high_size;
BitField<24, 8, u32> threads_per_input_primitive;
} common2{};
} common2;
union {
BitField<0, 24, u32> shader_local_memory_crs_size;
BitField<24, 4, OutputTopology> output_topology;
BitField<28, 4, u32> reserved;
} common3{};
} common3;
union {
BitField<0, 12, u32> max_output_vertices;
BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
BitField<20, 4, u32> reserved;
BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
} common4{};
} common4;
union {
struct {
@ -145,7 +145,7 @@ struct Header {
}
} ps;
std::array<u32, 0xF> raw{};
std::array<u32, 0xF> raw;
};
u64 GetLocalMemorySize() const {
@ -153,7 +153,6 @@ struct Header {
(common2.shader_local_memory_high_size << 24));
}
};
static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
} // namespace Tegra::Shader
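
GetLocalMemorySize above splices the 24-bit low field (common1) with the 8-bit high field (common2) into a single byte count; a self-checking sketch of the same arithmetic:

    constexpr u64 SpliceLocalMemorySize(u32 low24, u32 high8) {
        return low24 | (static_cast<u64>(high8) << 24);
    }
    static_assert(SpliceLocalMemorySize(0x345678, 0x12) == 0x12345678);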

View File

@ -9,6 +9,7 @@
#include "common/common_types.h"
#include "core/core.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
@ -17,11 +18,11 @@ namespace VideoCommon {
class FenceBase {
public:
FenceBase(u32 payload, bool is_stubbed)
: address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {}
explicit FenceBase(u32 payload_, bool is_stubbed_)
: address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {}
FenceBase(GPUVAddr address, u32 payload, bool is_stubbed)
: address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {}
explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {}
GPUVAddr GetAddress() const {
return address;
@ -47,6 +48,11 @@ protected:
template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
class FenceManager {
public:
/// Notify the fence manager about a new frame
void TickFrame() {
delayed_destruction_ring.Tick();
}
void SignalSemaphore(GPUVAddr addr, u32 value) {
TryReleasePendingFences();
const bool should_flush = ShouldFlush();
@ -86,7 +92,7 @@ public:
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
fences.pop();
PopFence();
}
}
@ -132,7 +138,7 @@ private:
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
fences.pop();
PopFence();
}
}
@ -158,7 +164,14 @@ private:
query_cache.CommitAsyncFlushes();
}
void PopFence() {
delayed_destruction_ring.Push(std::move(fences.front()));
fences.pop();
}
std::queue<TFence> fences;
DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
};
} // namespace VideoCommon
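
PopFence above parks retired fences in a DelayedDestructionRing so they outlive in-flight GPU work by a fixed number of frames. A minimal sketch of what such a ring could look like (the real header is not shown in this diff; the shape is inferred from the Tick() and Push() call sites):

    #include <array>
    #include <cstddef>
    #include <utility>
    #include <vector>

    template <typename T, std::size_t TICKS_TO_DESTROY>
    class DelayedDestructionRing {
    public:
        // Advance one frame; destroy objects that survived the whole ring.
        void Tick() {
            index = (index + 1) % TICKS_TO_DESTROY;
            elements[index].clear();
        }
        // Take ownership; the object dies TICKS_TO_DESTROY ticks from now.
        void Push(T&& object) {
            elements[index].push_back(std::move(object));
        }

    private:
        std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
        std::size_t index = 0;
    };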

View File

@ -0,0 +1,31 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
namespace Tegra {
/**
* Struct describing framebuffer configuration
*/
struct FramebufferConfig {
enum class PixelFormat : u32 {
A8B8G8R8_UNORM = 1,
RGB565_UNORM = 4,
B8G8R8A8_UNORM = 5,
};
VAddr address{};
u32 offset{};
u32 width{};
u32 height{};
u32 stride{};
PixelFormat pixel_format{};
using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags{};
Common::Rectangle<int> crop_rect;
};
} // namespace Tegra
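
A hypothetical example of filling the struct for a 1280x720 ABGR8 frame; note that stride is the row pitch and may exceed width when rows are padded:

    Tegra::FramebufferConfig config{};
    config.address = 0xDEADBEEF; // placeholder guest VAddr, for illustration
    config.width = 1280;
    config.height = 720;
    config.stride = 1280;        // row pitch; >= width
    config.pixel_format = Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM;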

View File

@ -10,6 +10,7 @@
#include "core/core_timing.h"
#include "core/core_timing_util.h"
#include "core/frontend/emu_window.h"
#include "core/hardware_interrupt_manager.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/fermi_2d.h"
@ -27,15 +28,17 @@ namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
GPU::GPU(Core::System& system_, bool is_async_)
GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
: system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
fermi_2d{std::make_unique<Engines::Fermi2D>()},
kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {}
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
gpu_thread{system_, is_async_} {}
GPU::~GPU() = default;
@ -77,31 +80,46 @@ DmaPusher& GPU::DmaPusher() {
return *dma_pusher;
}
Tegra::CDmaPusher& GPU::CDmaPusher() {
return *cdma_pusher;
}
const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher;
}
const Tegra::CDmaPusher& GPU::CDmaPusher() const {
return *cdma_pusher;
}
void GPU::WaitFence(u32 syncpoint_id, u32 value) {
// A synchronous GPU is always in sync
if (!is_async) {
return;
}
if (syncpoint_id == UINT32_MAX) {
// TODO: Research what this does.
LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
return;
}
MICROPROFILE_SCOPE(GPU_wait);
std::unique_lock lock{sync_mutex};
sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; });
sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; });
}
void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
syncpoints[syncpoint_id]++;
auto& syncpoint = syncpoints.at(syncpoint_id);
syncpoint++;
std::lock_guard lock{sync_mutex};
sync_cv.notify_all();
if (!syncpt_interrupts[syncpoint_id].empty()) {
u32 value = syncpoints[syncpoint_id].load();
auto it = syncpt_interrupts[syncpoint_id].begin();
while (it != syncpt_interrupts[syncpoint_id].end()) {
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
if (!interrupt.empty()) {
u32 value = syncpoint.load();
auto it = interrupt.begin();
while (it != interrupt.end()) {
if (value >= *it) {
TriggerCpuInterrupt(syncpoint_id, *it);
it = syncpt_interrupts[syncpoint_id].erase(it);
it = interrupt.erase(it);
continue;
}
it++;
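
WaitFence and IncrementSyncPoint above form a standard condition-variable handshake around an atomic counter; reduced to its core for a single syncpoint (illustrative, u32 being the codebase's alias):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    std::atomic<u32> syncpoint{0};
    std::mutex sync_mutex;
    std::condition_variable sync_cv;

    void Wait(u32 value) { // mirrors GPU::WaitFence
        std::unique_lock lock{sync_mutex};
        sync_cv.wait(lock, [&] { return syncpoint.load() >= value; });
    }

    void Increment() { // mirrors GPU::IncrementSyncPoint
        ++syncpoint;
        std::lock_guard lock{sync_mutex};
        sync_cv.notify_all();
    }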
@ -110,22 +128,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
}
u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
return syncpoints[syncpoint_id].load();
return syncpoints.at(syncpoint_id).load();
}
void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
auto& interrupt = syncpt_interrupts[syncpoint_id];
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
bool contains = std::any_of(interrupt.begin(), interrupt.end(),
[value](u32 in_value) { return in_value == value; });
if (contains) {
return;
}
syncpt_interrupts[syncpoint_id].emplace_back(value);
interrupt.emplace_back(value);
}
bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
std::lock_guard lock{sync_mutex};
auto& interrupt = syncpt_interrupts[syncpoint_id];
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
const auto iter =
std::find_if(interrupt.begin(), interrupt.end(),
[value](u32 interrupt_value) { return value == interrupt_value; });
@ -182,34 +200,6 @@ void GPU::SyncGuestHost() {
renderer->Rasterizer().SyncGuestHost();
}
void GPU::OnCommandListEnd() {
renderer->Rasterizer().ReleaseFences();
}
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written multiplied by 4 in the docs. We do not multiply by 4 here, so
// the values you see in the docs may be 4 times larger than the ones used below.
enum class BufferMethods {
BindObject = 0x0,
Nop = 0x2,
SemaphoreAddressHigh = 0x4,
SemaphoreAddressLow = 0x5,
SemaphoreSequence = 0x6,
SemaphoreTrigger = 0x7,
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
FenceValue = 0x1C,
FenceAction = 0x1D,
Unk78 = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
NonPullerMethods = 0x40,
};
enum class GpuSemaphoreOperation {
AcquireEqual = 0x1,
WriteLong = 0x2,
@ -240,8 +230,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32
CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
} else {
for (std::size_t i = 0; i < amount; i++) {
CallPullerMethod(
{method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)});
CallPullerMethod(MethodCall{
method,
base_start[i],
subchannel,
methods_pending - static_cast<u32>(i),
});
}
}
}
@ -268,7 +262,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::UnkCacheFlush:
case BufferMethods::WrcacheFlush:
case BufferMethods::FenceValue:
break;
case BufferMethods::FenceAction:
ProcessFenceActionMethod();
break;
case BufferMethods::WaitForInterrupt:
ProcessWaitForInterruptMethod();
break;
case BufferMethods::SemaphoreTrigger: {
ProcessSemaphoreTriggerMethod();
@ -298,8 +297,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
break;
}
default:
LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented",
static_cast<u32>(method));
LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
break;
}
}
@ -378,10 +376,28 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) {
dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id));
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
}
}
void GPU::ProcessFenceActionMethod() {
switch (regs.fence_action.op) {
case FenceOperation::Acquire:
WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
break;
case FenceOperation::Increment:
IncrementSyncPoint(regs.fence_action.syncpoint_id);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
}
}
void GPU::ProcessWaitForInterruptMethod() {
// TODO(bunnei) ImplementMe
LOG_WARNING(HW_GPU, "(STUBBED) called");
}
void GPU::ProcessSemaphoreTriggerMethod() {
const auto semaphoreOperationMask = 0xF;
const auto op =
@ -443,4 +459,75 @@ void GPU::ProcessSemaphoreAcquire() {
}
}
void GPU::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent();
}
void GPU::ObtainContext() {
cpu_context->MakeCurrent();
}
void GPU::ReleaseContext() {
cpu_context->DoneCurrent();
}
void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
// This condition fires when a video stream ends; clear all intermediary data
if (entries[0].raw == 0xDEADB33F) {
cdma_pusher.reset();
return;
}
if (!cdma_pusher) {
cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
}
// SubmitCommandBuffer would make the nvdec operations asynchronous; this is not currently working
// TODO(ameerj): RE proper async nvdec operation
// gpu_thread.SubmitCommandBuffer(std::move(entries));
cdma_pusher->Push(std::move(entries));
cdma_pusher->DispatchCalls();
}
void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer);
}
void GPU::FlushRegion(VAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
void GPU::InvalidateRegion(VAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
void GPU::WaitIdle() const {
gpu_thread.WaitIdle();
}
void GPU::OnCommandListEnd() {
if (is_async) {
// This command only applies to asynchronous GPU mode
gpu_thread.OnCommandListEnd();
}
}
} // namespace Tegra

View File

@ -13,14 +13,17 @@
#include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
#include "video_core/dma_pusher.h"
#include "video_core/framebuffer_config.h"
#include "video_core/gpu_thread.h"
using CacheAddr = std::uintptr_t;
inline CacheAddr ToCacheAddr(const void* host_ptr) {
[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
return reinterpret_cast<CacheAddr>(host_ptr);
}
inline u8* FromCacheAddr(CacheAddr cache_addr) {
[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
return reinterpret_cast<u8*>(cache_addr);
}
@ -100,28 +103,6 @@ enum class DepthFormat : u32 {
struct CommandListHeader;
class DebugContext;
/**
* Struct describing framebuffer configuration
*/
struct FramebufferConfig {
enum class PixelFormat : u32 {
A8B8G8R8_UNORM = 1,
RGB565_UNORM = 4,
B8G8R8A8_UNORM = 5,
};
VAddr address;
u32 offset;
u32 width;
u32 height;
u32 stride;
PixelFormat pixel_format;
using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags;
Common::Rectangle<int> crop_rect;
};
namespace Engines {
class Fermi2D;
class Maxwell3D;
@ -140,7 +121,7 @@ enum class EngineID {
class MemoryManager;
class GPU {
class GPU final {
public:
struct MethodCall {
u32 method{};
@ -148,17 +129,17 @@ public:
u32 subchannel{};
u32 method_count{};
bool IsLastCall() const {
explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0)
: method(method_), argument(argument_), subchannel(subchannel_),
method_count(method_count_) {}
[[nodiscard]] bool IsLastCall() const {
return method_count <= 1;
}
MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0)
: method(method), argument(argument), subchannel(subchannel),
method_count(method_count) {}
};
explicit GPU(Core::System& system, bool is_async);
virtual ~GPU();
explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
~GPU();
/// Binds a renderer to the GPU.
void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
@ -175,13 +156,13 @@ public:
/// Synchronizes CPU writes with Host GPU memory.
void SyncGuestHost();
/// Signal the ending of command list.
virtual void OnCommandListEnd();
void OnCommandListEnd();
/// Request a host GPU memory flush from the CPU.
u64 RequestFlush(VAddr addr, std::size_t size);
[[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
/// Obtains current flush request fence id.
u64 CurrentFlushRequestFence() const {
[[nodiscard]] u64 CurrentFlushRequestFence() const {
return current_flush_fence.load(std::memory_order_relaxed);
}
@ -189,68 +170,100 @@ public:
void TickWork();
/// Returns a reference to the Maxwell3D GPU engine.
Engines::Maxwell3D& Maxwell3D();
[[nodiscard]] Engines::Maxwell3D& Maxwell3D();
/// Returns a const reference to the Maxwell3D GPU engine.
const Engines::Maxwell3D& Maxwell3D() const;
[[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const;
/// Returns a reference to the KeplerCompute GPU engine.
Engines::KeplerCompute& KeplerCompute();
[[nodiscard]] Engines::KeplerCompute& KeplerCompute();
/// Returns a reference to the KeplerCompute GPU engine.
const Engines::KeplerCompute& KeplerCompute() const;
[[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const;
/// Returns a reference to the GPU memory manager.
Tegra::MemoryManager& MemoryManager();
[[nodiscard]] Tegra::MemoryManager& MemoryManager();
/// Returns a const reference to the GPU memory manager.
const Tegra::MemoryManager& MemoryManager() const;
[[nodiscard]] const Tegra::MemoryManager& MemoryManager() const;
/// Returns a reference to the GPU DMA pusher.
Tegra::DmaPusher& DmaPusher();
[[nodiscard]] Tegra::DmaPusher& DmaPusher();
VideoCore::RendererBase& Renderer() {
/// Returns a const reference to the GPU DMA pusher.
[[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
/// Returns a reference to the GPU CDMA pusher.
[[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
/// Returns a const reference to the GPU CDMA pusher.
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
[[nodiscard]] VideoCore::RendererBase& Renderer() {
return *renderer;
}
const VideoCore::RendererBase& Renderer() const {
/// Returns a const reference to the underlying renderer.
[[nodiscard]] const VideoCore::RendererBase& Renderer() const {
return *renderer;
}
VideoCore::ShaderNotify& ShaderNotify() {
/// Returns a reference to the shader notifier.
[[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
return *shader_notify;
}
const VideoCore::ShaderNotify& ShaderNotify() const {
/// Returns a const reference to the shader notifier.
[[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
return *shader_notify;
}
/// Waits for the GPU to finish working
virtual void WaitIdle() const = 0;
void WaitIdle() const;
/// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
void WaitFence(u32 syncpoint_id, u32 value);
void IncrementSyncPoint(u32 syncpoint_id);
u32 GetSyncpointValue(u32 syncpoint_id) const;
[[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
[[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
u64 GetTicks() const;
[[nodiscard]] u64 GetTicks() const;
std::unique_lock<std::mutex> LockSync() {
[[nodiscard]] std::unique_lock<std::mutex> LockSync() {
return std::unique_lock{sync_mutex};
}
bool IsAsync() const {
[[nodiscard]] bool IsAsync() const {
return is_async;
}
/// Returns a const reference to the GPU DMA pusher.
const Tegra::DmaPusher& DmaPusher() const;
[[nodiscard]] bool UseNvdec() const {
return use_nvdec;
}
enum class FenceOperation : u32 {
Acquire = 0,
Increment = 1,
};
union FenceAction {
u32 raw;
BitField<0, 1, FenceOperation> op;
BitField<8, 24, u32> syncpoint_id;
[[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
FenceAction result{};
result.op.Assign(op);
result.syncpoint_id.Assign(syncpoint_id);
return {result.raw};
}
};
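
Build above packs an operation and a syncpoint id into the raw register layout so callers can emit it as a command argument; hypothetical use (syncpoint id chosen arbitrarily):

    // Encode "increment syncpoint 5" for submission through the puller.
    const CommandHeader action = FenceAction::Build(FenceOperation::Increment, 5);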
struct Regs {
static constexpr size_t NUM_REGS = 0x40;
@ -262,7 +275,7 @@ public:
u32 address_high;
u32 address_low;
GPUVAddr SemaphoreAddress() const {
[[nodiscard]] GPUVAddr SemaphoreAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
@ -280,10 +293,7 @@ public:
u32 semaphore_acquire;
u32 semaphore_release;
u32 fence_value;
union {
BitField<4, 4, u32> operation;
BitField<8, 8, u32> id;
} fence_action;
FenceAction fence_action;
INSERT_UNION_PADDING_WORDS(0xE2);
// Puller state
@ -300,34 +310,39 @@ public:
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
virtual void Start() = 0;
void Start();
/// Obtain the CPU Context
virtual void ObtainContext() = 0;
void ObtainContext();
/// Release the CPU Context
virtual void ReleaseContext() = 0;
void ReleaseContext();
/// Push GPU command entries to be processed
virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
void PushGPUEntries(Tegra::CommandList&& entries);
/// Push GPU command buffer entries to be processed
void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
/// Swap buffers (render frame)
virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
virtual void FlushRegion(VAddr addr, u64 size) = 0;
void FlushRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated
virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
void InvalidateRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
void FlushAndInvalidateRegion(VAddr addr, u64 size);
protected:
virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
private:
void ProcessBindMethod(const MethodCall& method_call);
void ProcessFenceActionMethod();
void ProcessWaitForInterruptMethod();
void ProcessSemaphoreTriggerMethod();
void ProcessSemaphoreRelease();
void ProcessSemaphoreAcquire();
@ -343,13 +358,15 @@ private:
u32 methods_pending);
/// Determines where the method should be executed.
bool ExecuteMethodOnEngine(u32 method);
[[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
protected:
Core::System& system;
std::unique_ptr<Tegra::MemoryManager> memory_manager;
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
std::unique_ptr<VideoCore::RendererBase> renderer;
const bool use_nvdec;
private:
/// Mapping of command subchannels to their bound engine ids
@ -372,12 +389,13 @@ private:
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex;
std::mutex device_mutex;
std::condition_variable sync_cv;
struct FlushRequest {
FlushRequest(u64 fence, VAddr addr, std::size_t size)
: fence{fence}, addr{addr}, size{size} {}
explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
: fence{fence_}, addr{addr_}, size{size_} {}
u64 fence;
VAddr addr;
std::size_t size;
@ -389,6 +407,9 @@ private:
std::mutex flush_request_mutex;
const bool is_async;
VideoCommon::GPUThread::ThreadManager gpu_thread;
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
};
#define ASSERT_REG_POSITION(field_name, position) \

View File

@ -1,64 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "core/core.h"
#include "core/hardware_interrupt_manager.h"
#include "video_core/gpu_asynch.h"
#include "video_core/gpu_thread.h"
#include "video_core/renderer_base.h"
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {}
GPUAsynch::~GPUAsynch() = default;
void GPUAsynch::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent();
}
void GPUAsynch::ObtainContext() {
cpu_context->MakeCurrent();
}
void GPUAsynch::ReleaseContext() {
cpu_context->DoneCurrent();
}
void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer);
}
void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
void GPUAsynch::WaitIdle() const {
gpu_thread.WaitIdle();
}
void GPUAsynch::OnCommandListEnd() {
gpu_thread.OnCommandListEnd();
}
} // namespace VideoCommon

View File

@ -1,46 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
namespace Core::Frontend {
class GraphicsContext;
}
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU asynchronously
class GPUAsynch final : public Tegra::GPU {
public:
explicit GPUAsynch(Core::System& system);
~GPUAsynch() override;
void Start() override;
void ObtainContext() override;
void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitIdle() const override;
void OnCommandListEnd() override;
protected:
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
private:
GPUThread::ThreadManager gpu_thread;
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
};
} // namespace VideoCommon

View File

@ -1,45 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "video_core/gpu_synch.h"
#include "video_core/renderer_base.h"
namespace VideoCommon {
GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {}
GPUSynch::~GPUSynch() = default;
void GPUSynch::Start() {}
void GPUSynch::ObtainContext() {
renderer->Context().MakeCurrent();
}
void GPUSynch::ReleaseContext() {
renderer->Context().DoneCurrent();
}
void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
dma_pusher->Push(std::move(entries));
dma_pusher->DispatchCalls();
}
void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
renderer->SwapBuffers(framebuffer);
}
void GPUSynch::FlushRegion(VAddr addr, u64 size) {
renderer->Rasterizer().FlushRegion(addr, size);
}
void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
renderer->Rasterizer().InvalidateRegion(addr, size);
}
void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
renderer->Rasterizer().FlushAndInvalidateRegion(addr, size);
}
} // namespace VideoCommon

View File

@ -1,40 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "video_core/gpu.h"
namespace Core::Frontend {
class GraphicsContext;
}
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon {
/// Implementation of GPU interface that runs the GPU synchronously
class GPUSynch final : public Tegra::GPU {
public:
explicit GPUSynch(Core::System& system);
~GPUSynch() override;
void Start() override;
void ObtainContext() override;
void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
void InvalidateRegion(VAddr addr, u64 size) override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitIdle() const override {}
protected:
void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
[[maybe_unused]] u32 value) const override {}
};
} // namespace VideoCommon

View File

@ -4,6 +4,7 @@
#include "common/assert.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "common/thread.h"
#include "core/core.h"
#include "core/frontend/emu_window.h"
@ -18,9 +19,11 @@ namespace VideoCommon::GPUThread {
/// Runs the GPU thread
static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
SynchState& state) {
SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
std::string name = "yuzu:GPU";
MicroProfileOnThreadCreate(name.c_str());
SCOPE_EXIT({ MicroProfileOnThreadExit(); });
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
system.RegisterHostThread();
@ -39,19 +42,23 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
CommandDataContainer next;
while (state.is_running) {
next = state.queue.PopWait();
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
} else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
// NVDEC
cdma_pusher.Push(std::move(command_list->entries));
cdma_pusher.DispatchCalls();
} else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
renderer.Rasterizer().ReleaseFences();
} else if (std::holds_alternative<GPUTickCommand>(next.data)) {
system.GPU().TickWork();
} else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(data->addr, data->size);
} else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(data->addr, data->size);
} else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
} else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size);
} else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
return;
} else {
@ -61,7 +68,8 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
}
}
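
RunThread above drains a queue of std::variant commands and dispatches on the active alternative; the pattern in isolation, with the command set reduced for illustration:

    #include <variant>

    struct SubmitList { /* command list entries */ };
    struct SwapBuffers { /* framebuffer description */ };
    using CommandData = std::variant<SubmitList, SwapBuffers>;

    void Dispatch(CommandData& next) {
        if (auto* submit = std::get_if<SubmitList>(&next)) {
            // Forward submit's entries to the DMA pusher.
        } else if (std::holds_alternative<SwapBuffers>(next)) {
            // Present the pending frame.
        }
    }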
ThreadManager::ThreadManager(Core::System& system) : system{system} {}
ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
: system{system_}, is_async{is_async_} {}
ThreadManager::~ThreadManager() {
if (!thread.joinable()) {
@ -75,33 +83,48 @@ ThreadManager::~ThreadManager() {
void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher) {
thread = std::thread{RunThread, std::ref(system), std::ref(renderer),
std::ref(context), std::ref(dma_pusher), std::ref(state)};
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
PushCommand(SubmitListCommand(std::move(entries)));
}
void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
PushCommand(SubmitChCommandEntries(std::move(entries)));
}
void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
}
void ThreadManager::FlushRegion(VAddr addr, u64 size) {
if (!Settings::IsGPULevelHigh()) {
if (!is_async) {
// Always flush in synchronous GPU mode
PushCommand(FlushRegionCommand(addr, size));
return;
}
if (!Settings::IsGPULevelExtreme()) {
return;
}
if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) {
// Asynchronous GPU mode
switch (Settings::values.gpu_accuracy.GetValue()) {
case Settings::GPUAccuracy::Normal:
PushCommand(FlushRegionCommand(addr, size));
break;
case Settings::GPUAccuracy::High:
// TODO(bunnei): Is this right? Preserving existing behavior for now
break;
case Settings::GPUAccuracy::Extreme: {
auto& gpu = system.GPU();
u64 fence = gpu.RequestFlush(addr, size);
PushCommand(GPUTickCommand());
while (fence > gpu.CurrentFlushRequestFence()) {
}
break;
}
default:
UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue());
}
}
@ -115,7 +138,8 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
}
void ThreadManager::WaitIdle() const {
while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) {
while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
system.IsPoweredOn()) {
}
}
@ -126,6 +150,12 @@ void ThreadManager::OnCommandListEnd() {
u64 ThreadManager::PushCommand(CommandData&& command_data) {
const u64 fence{++state.last_fence};
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
if (!is_async) {
// In synchronous GPU mode, block the caller until the command has executed
WaitIdle();
}
return fence;
}

View File

@ -10,8 +10,9 @@
#include <optional>
#include <thread>
#include <variant>
#include "common/threadsafe_queue.h"
#include "video_core/gpu.h"
#include "video_core/framebuffer_config.h"
namespace Tegra {
struct FramebufferConfig;
@ -25,6 +26,10 @@ class GraphicsContext;
class System;
} // namespace Core
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon::GPUThread {
/// Command to signal to the GPU thread that processing has ended
@ -32,22 +37,30 @@ struct EndProcessingCommand final {};
/// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
Tegra::CommandList entries;
};
/// Command to signal to the GPU thread that a cdma command list is ready for processing
struct SubmitChCommandEntries final {
explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_)
: entries{std::move(entries_)} {}
Tegra::ChCommandHeaderList entries;
};
/// Command to signal to the GPU thread that a swap buffers is pending
struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
: framebuffer{std::move(framebuffer)} {}
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_)
: framebuffer{std::move(framebuffer_)} {}
std::optional<Tegra::FramebufferConfig> framebuffer;
};
/// Command to signal to the GPU thread to flush a region
struct FlushRegionCommand final {
explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -55,7 +68,7 @@ struct FlushRegionCommand final {
/// Command to signal to the GPU thread to invalidate a region
struct InvalidateRegionCommand final {
explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -63,8 +76,8 @@ struct InvalidateRegionCommand final {
/// Command to signal to the GPU thread to flush and invalidate a region
struct FlushAndInvalidateRegionCommand final {
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
: addr{addr}, size{size} {}
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_)
: addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
@ -77,15 +90,15 @@ struct OnCommandListEndCommand final {};
struct GPUTickCommand final {};
using CommandData =
std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
GPUTickCommand>;
std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
struct CommandDataContainer {
CommandDataContainer() = default;
CommandDataContainer(CommandData&& data, u64 next_fence)
: data{std::move(data)}, fence{next_fence} {}
explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
: data{std::move(data_)}, fence{next_fence_} {}
CommandData data;
u64 fence{};
@ -104,16 +117,19 @@ struct SynchState final {
/// Class used to manage the GPU thread
class ThreadManager final {
public:
explicit ThreadManager(Core::System& system);
explicit ThreadManager(Core::System& system_, bool is_async_);
~ThreadManager();
/// Creates and starts the GPU thread.
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher);
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
/// Push GPU command entries to be processed
void SubmitList(Tegra::CommandList&& entries);
/// Push GPU CDMA command buffer entries to be processed
void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
/// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
@ -135,11 +151,11 @@ private:
/// Pushes a command to be executed by the GPU thread
u64 PushCommand(CommandData&& command_data);
private:
SynchState state;
Core::System& system;
std::thread thread;
std::thread::id thread_id;
const bool is_async;
};
} // namespace VideoCommon::GPUThread

View File

@ -19,8 +19,8 @@ namespace VideoCore {
class GuestDriverProfile {
public:
explicit GuestDriverProfile() = default;
explicit GuestDriverProfile(std::optional<u32> texture_handler_size)
: texture_handler_size{texture_handler_size} {}
explicit GuestDriverProfile(std::optional<u32> texture_handler_size_)
: texture_handler_size{texture_handler_size_} {}
void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);

View File

@ -1,18 +1,29 @@
set(SHADER_FILES
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
convert_depth_to_float.frag
convert_float_to_depth.frag
full_screen_triangle.vert
opengl_copy_bc4.comp
opengl_present.frag
opengl_present.vert
pitch_unswizzle.comp
vulkan_blit_color_float.frag
vulkan_blit_depth_stencil.frag
vulkan_present.frag
vulkan_present.vert
vulkan_quad_array.comp
vulkan_quad_indexed.comp
vulkan_uint8.comp
)
find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED)
set(GLSL_FLAGS "")
set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders)
add_custom_command(
OUTPUT
${SHADER_DIR}
COMMAND
${CMAKE_COMMAND} -E make_directory ${SHADER_DIR}
)
set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in)
set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
@ -20,19 +31,36 @@ set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
foreach(FILENAME IN ITEMS ${SHADER_FILES})
string(REPLACE "." "_" SHADER_NAME ${FILENAME})
set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME})
set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
add_custom_command(
OUTPUT
${HEADER_FILE}
COMMAND
${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
DEPENDS
${HEADER_GENERATOR}
${INPUT_FILE}
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE})
# Skip generating source headers for Vulkan-exclusive files
if (NOT ${FILENAME} MATCHES "vulkan.*")
set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
add_custom_command(
OUTPUT
${SOURCE_HEADER_FILE}
COMMAND
${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
DEPENDS
${INPUT_FILE}
# HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE})
endif()
# Skip compiling OpenGL-exclusive files to SPIR-V
if (NOT ${FILENAME} MATCHES "opengl.*")
string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME)
set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h)
add_custom_command(
OUTPUT
${SPIRV_HEADER_FILE}
COMMAND
${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE})
endif()
endforeach()
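
Hypothetical consumption of the headers this loop generates (names derived from the rules above; the exact constant type comes from source_shader.h.in, which is not shown): opengl_present.frag becomes opengl_present_frag.h exposing the GLSL text as OPENGL_PRESENT_FRAG, while vulkan_present.frag becomes vulkan_present_frag_spv.h exposing a SPIR-V word array VULKAN_PRESENT_FRAG_SPV.

    #include "video_core/host_shaders/opengl_present_frag.h"
    #include "video_core/host_shaders/vulkan_present_frag_spv.h"

    // e.g. hand OPENGL_PRESENT_FRAG to glShaderSource(), and
    // VULKAN_PRESENT_FRAG_SPV to VkShaderModuleCreateInfo::pCode.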
add_custom_target(host_shaders

View File

@ -8,4 +8,6 @@ string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME)
file(READ ${SOURCE_FILE} CONTENTS)
get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY)
make_directory(${OUTPUT_DIR})
configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY)

View File

@ -0,0 +1,122 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint layer_stride;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read the swizzle table as soon as possible to hide its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += pos.z * layer_stride;
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}
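The offset arithmetic in main decodes the block-linear layout: texels live in 64-byte-wide, 8-row GOBs, GOBs stack 2^block_height high into blocks, and the swizzle table resolves the byte position inside a GOB. Note also the HAS_EXTENDED_TYPES fallback in ReadTexel: without GL_NV_gpu_shader5 it loads the containing 32-bit word and carves out the 8- or 16-bit lane with bitfieldExtract, using (offset * 8) & 24 (or & 16) as the bit position. A host-side C++ mirror of the offset math, handy for sanity checking (a sketch; the names are illustrative):

    #include <cstdint>

    // Byte offset of a texel in a 2D block-linear image, mirroring the shader.
    // swizzle is the SwizzleTable lookup for (x_bytes, y), and x_bytes is the
    // x coordinate already scaled to bytes (pos.x <<= bytes_per_block_log2).
    uint32_t BlockLinearOffset2D(uint32_t x_bytes, uint32_t y, uint32_t z,
                                 uint32_t layer_stride, uint32_t block_size,
                                 uint32_t x_shift, uint32_t block_height,
                                 uint32_t block_height_mask, uint32_t swizzle) {
        constexpr uint32_t GOB_SIZE_X_SHIFT = 6; // 64-byte GOB rows
        constexpr uint32_t GOB_SIZE_Y_SHIFT = 3; // 8 rows per GOB
        constexpr uint32_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
        const uint32_t block_y = y >> GOB_SIZE_Y_SHIFT;
        uint32_t offset = z * layer_stride;                        // whole layers
        offset += (block_y >> block_height) * block_size;          // blocks above
        offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; // GOB within block
        offset += (x_bytes >> GOB_SIZE_X_SHIFT) << x_shift;        // GOB column
        return offset + swizzle;                                   // byte within GOB
    }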

View File

@ -0,0 +1,125 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image;
layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read the swizzle table as soon as possible to hide its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += (pos.z >> block_depth) * slice_size;
offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height);
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}
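Relative to the 2D variant above, only the z decomposition is new: slices group into blocks of 2^block_depth, so the offset gains (pos.z >> block_depth) * slice_size for whole slice blocks plus (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height) for the slice inside the current block; the x and y terms are unchanged.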

View File

@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D depth_texture;
layout(location = 0) out float output_color;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
output_color = texelFetch(depth_texture, coord, 0).r;
}

View File

@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D color_texture;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
float color = texelFetch(color_texture, coord, 0).r;
gl_FragDepth = color;
}

View File

@ -0,0 +1,29 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#ifdef VULKAN
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) vec2 tex_scale;
UNIFORM(1) vec2 tex_offset;
END_PUSH_CONSTANTS
layout(location = 0) out vec2 texcoord;
void main() {
float x = float((gl_VertexIndex & 1) << 2);
float y = float((gl_VertexIndex & 2) << 1);
gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
}
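This is the classic oversized-triangle trick: for gl_VertexIndex 0, 1 and 2 the bit tests yield (x, y) = (0, 0), (4, 0) and (0, 4), hence clip-space positions (-1, -1), (3, -1) and (-1, 3). That single triangle's interior covers the entire [-1, 1] viewport without the diagonal seam of a two-triangle quad, while texcoord is remapped through tex_scale and tex_offset.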

View File

@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
#extension GL_ARB_gpu_shader_int64 : require
layout (local_size_x = 4, local_size_y = 4) in;
layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input;
layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output;
layout(location = 0) uniform uvec3 src_offset;
layout(location = 1) uniform uvec3 dst_offset;
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
uint DecompressBlock(uint64_t bits, uvec2 coord) {
const uint code_offset = 16 + 3 * (4 * coord.y + coord.x);
const uint code = uint(bits >> code_offset) & 7;
const uint red0 = uint(bits >> 0) & 0xff;
const uint red1 = uint(bits >> 8) & 0xff;
if (red0 > red1) {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (6 * red0 + 1 * red1) / 7;
case 3:
return (5 * red0 + 2 * red1) / 7;
case 4:
return (4 * red0 + 3 * red1) / 7;
case 5:
return (3 * red0 + 4 * red1) / 7;
case 6:
return (2 * red0 + 5 * red1) / 7;
case 7:
return (1 * red0 + 6 * red1) / 7;
}
} else {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (4 * red0 + 1 * red1) / 5;
case 3:
return (3 * red0 + 2 * red1) / 5;
case 4:
return (2 * red0 + 3 * red1) / 5;
case 5:
return (1 * red0 + 4 * red1) / 5;
case 6:
return 0;
case 7:
return 0xff;
}
}
return 0;
}
void main() {
uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg;
uint64_t bits = packUint2x32(packed_bits);
uint red = DecompressBlock(bits, gl_LocalInvocationID.xy);
uvec4 color = uvec4(red & 0xff, 0, 0, 0xff);
imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color);
}
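A worked example of the decoder above: each 8-byte block stores two endpoint bytes plus sixteen 3-bit selector codes (hence the 16 + 3 * (4 * coord.y + coord.x) bit offset). With red0 = 200, red1 = 64 (so red0 > red1) and code 3, the texel decodes to (5 * 200 + 2 * 64) / 7 = 161; in the red0 <= red1 branch, codes 6 and 7 snap to the implicit endpoints 0 and 255 instead of interpolating.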

View File

@ -1,3 +1,7 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
layout (location = 0) in vec2 frag_tex_coord;

View File

@ -1,3 +1,7 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
out gl_PerVertex {

View File

@ -0,0 +1,86 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec2 origin;
UNIFORM(1) ivec2 destination;
UNIFORM(2) uint bytes_per_block;
UNIFORM(3) uint pitch;
END_PUSH_CONSTANTS
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block) {
#if HAS_EXTENDED_TYPES
case 1:
return uvec4(u8data[offset], 0, 0, 0);
case 2:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 2:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 4:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 8:
return uvec4(u64data[offset / 8], 0, 0);
case 16:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec2 pos = gl_GlobalInvocationID.xy + origin;
uint offset = 0;
offset += pos.x * bytes_per_block;
offset += pos.y * pitch;
const uvec4 texel = ReadTexel(offset);
const ivec2 coord = ivec2(gl_GlobalInvocationID.xy) + destination;
imageStore(output_image, coord, texel);
}
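Pitch-linear addressing needs no swizzle table: the byte offset is simply pos.x * bytes_per_block + pos.y * pitch. For instance, with 4 bytes per block and a pitch of 1024 bytes, texel (3, 2) lands at 3 * 4 + 2 * 1024 = 2060.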

View File

@ -0,0 +1,14 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D tex;
layout(location = 0) in vec2 texcoord;
layout(location = 0) out vec4 color;
void main() {
color = textureLod(tex, texcoord, 0);
}

View File

@ -0,0 +1,16 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#extension GL_ARB_shader_stencil_export : require
layout(binding = 0) uniform sampler2D depth_tex;
layout(binding = 1) uniform isampler2D stencil_tex;
layout(location = 0) in vec2 texcoord;
void main() {
gl_FragDepth = textureLod(depth_tex, texcoord, 0).r;
gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r;
}

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 frag_tex_coord;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 vert_position;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (local_size_x = 1024) in;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V quad_indexed.comp -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (local_size_x = 1024) in;

View File

@ -2,15 +2,6 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

View File

@ -85,7 +85,7 @@ constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
{0x0217920100488FF7, &HLE_0217920100488FF7},
}};
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
HLEMacro::~HLEMacro() = default;
std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
@ -99,8 +99,8 @@ std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) co
HLEMacroImpl::~HLEMacroImpl() = default;
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
: maxwell3d(maxwell3d), func(func) {}
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
: maxwell3d{maxwell3d_}, func{func_} {}
void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
func(maxwell3d, parameters);

View File

@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u3
class HLEMacro {
public:
explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
~HLEMacro();
std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;

View File

@ -11,29 +11,29 @@
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
: MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
}
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
const std::vector<u32>& code)
: maxwell3d(maxwell3d), code(code) {}
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_,
const std::vector<u32>& code_)
: maxwell3d{maxwell3d_}, code{code_} {}
void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
MICROPROFILE_SCOPE(MacroInterp);
Reset();
registers[1] = parameters[0];
num_parameters = parameters.size();
registers[1] = params[0];
num_parameters = params.size();
if (num_parameters > parameters_capacity) {
parameters_capacity = num_parameters;
this->parameters = std::make_unique<u32[]>(num_parameters);
parameters = std::make_unique<u32[]>(num_parameters);
}
std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
// Execute the code until we hit an exit condition.
bool keep_executing = true;
@ -133,8 +133,7 @@ bool MacroInterpreterImpl::Step(bool is_delay_slot) {
break;
}
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}",
static_cast<u32>(opcode.operation.Value()));
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
}
// An instruction with the Exit flag will not actually
@ -182,7 +181,7 @@ u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a,
return ~(src_a & src_b);
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation));
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
return 0;
}
}
@ -230,7 +229,7 @@ void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 r
Send((result >> 12) & 0b111111);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<u32>(operation));
UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
}
}
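Dropping the static_cast from these UNIMPLEMENTED_MSG calls only compiles if fmt knows how to print the enum types directly, presumably via formatter specializations added elsewhere in this merge. One plausible shape for such a specialization, assumed rather than taken from this diff:

    #include <fmt/format.h>

    template <>
    struct fmt::formatter<Tegra::Macro::ALUOperation> {
        constexpr auto parse(fmt::format_parse_context& ctx) {
            return ctx.begin();
        }
        template <typename FormatContext>
        auto format(Tegra::Macro::ALUOperation op, FormatContext& ctx) {
            // Print the underlying value, matching the old static_cast output.
            return fmt::format_to(ctx.out(), "{}", static_cast<u32>(op));
        }
    };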

View File

@ -17,7 +17,7 @@ class Maxwell3D;
class MacroInterpreter final : public MacroEngine {
public:
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
@ -28,8 +28,8 @@ private:
class MacroInterpreterImpl : public CachedMacro {
public:
MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
void Execute(const std::vector<u32>& parameters, u32 method) override;
explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
void Execute(const std::vector<u32>& params, u32 method) override;
private:
/// Resets the execution engine state, zeroing registers, etc.
@ -38,9 +38,9 @@ private:
/**
* Executes a single macro instruction located at the current program counter. Returns whether
* the interpreter should keep running.
* @param offset Offset to start execution at.
*
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
* previous instruction.
*/
bool Step(bool is_delay_slot);

View File

@ -28,15 +28,15 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
BRANCH_HOLDER,
});
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
: MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
}
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
: Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
Compile();
}
@ -165,8 +165,7 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
}
break;
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
static_cast<std::size_t>(opcode.alu_operation.Value()));
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
break;
}
Compile_ProcessResult(opcode.result_operation, opcode.dst);
@ -553,15 +552,15 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
}
void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
// Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
// register.
if (reg == 0) {
if (reg_index == 0) {
return;
}
mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
};
const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
switch (operation) {
case Macro::ResultOperation::IgnoreAndFetch:
@ -604,7 +603,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3
Compile_Send(RESULT);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
}
}

View File

@ -23,7 +23,7 @@ constexpr size_t MAX_CODE_SIZE = 0x10000;
class MacroJITx64 final : public MacroEngine {
public:
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
@ -34,7 +34,7 @@ private:
class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
public:
MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
~MacroJITx64Impl();
void Execute(const std::vector<u32>& parameters, u32 method) override;

View File

@ -11,6 +11,7 @@
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
namespace Tegra {
@ -44,13 +45,22 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_
return Map(cpu_addr, *FindFreeRange(size, align), size);
}
GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true);
ASSERT(gpu_addr);
return Map(cpu_addr, *gpu_addr, size);
}
void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
if (!size) {
return;
}
// Flush and invalidate through the GPU interface, to be asynchronous if possible.
system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size);
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
rasterizer->UnmapMemory(*cpu_addr, size);
UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
}
@ -108,7 +118,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s
page_table[PageEntryIndex(gpu_addr)] = page_entry;
}
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const {
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address) const {
if (!align) {
align = page_size;
} else {
@ -116,7 +127,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size
}
u64 available_size{};
GPUVAddr gpu_addr{address_space_start};
GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
while (gpu_addr + available_size < address_space_size) {
if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
available_size += page_size;

View File

@ -28,7 +28,7 @@ public:
};
constexpr PageEntry() = default;
constexpr PageEntry(State state) : state{state} {}
constexpr PageEntry(State state_) : state{state_} {}
constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {}
[[nodiscard]] constexpr bool IsUnmapped() const {
@ -68,7 +68,7 @@ static_assert(sizeof(PageEntry) == 4, "PageEntry is too large");
class MemoryManager final {
public:
explicit MemoryManager(Core::System& system);
explicit MemoryManager(Core::System& system_);
~MemoryManager();
/// Binds a renderer to the memory manager.
@ -116,6 +116,7 @@ public:
[[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
[[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
void Unmap(GPUVAddr gpu_addr, std::size_t size);
@ -124,7 +125,8 @@ private:
[[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const;
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address = false) const;
void TryLockPage(PageEntry page_entry, std::size_t size);
void TryUnlockPage(PageEntry page_entry, std::size_t size);
@ -135,6 +137,7 @@ private:
static constexpr u64 address_space_size = 1ULL << 40;
static constexpr u64 address_space_start = 1ULL << 32;
static constexpr u64 address_space_start_low = 1ULL << 16;
static constexpr u64 page_bits{16};
static constexpr u64 page_size{1 << page_bits};
static constexpr u64 page_mask{page_size - 1};
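In concrete numbers: page_bits = 16 gives 64 KiB pages, address_space_size spans 1 TiB, regular allocations begin at the 4 GiB mark (1 << 32), and the new address_space_start_low puts the alternative scan origin at 64 KiB, so FindFreeRange(size, align, true) lets MapAllocate32 hand out GPU addresses that still fit in 32 bits.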

View File

@ -1,250 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <cstring>
#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/morton.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
namespace VideoCore {
using Surface::GetBytesPerPixel;
using Surface::PixelFormat;
using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, u8*);
using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>;
template <bool morton_to_linear, PixelFormat format>
static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth,
u32 tile_width_spacing, u8* buffer, u8* addr) {
constexpr u32 bytes_per_pixel = GetBytesPerPixel(format);
// With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
// pixel values.
constexpr u32 tile_size_x{GetDefaultBlockWidth(format)};
constexpr u32 tile_size_y{GetDefaultBlockHeight(format)};
if constexpr (morton_to_linear) {
Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel,
stride, height, depth, block_height, block_depth,
tile_width_spacing);
} else {
Tegra::Texture::CopySwizzledData((stride + tile_size_x - 1) / tile_size_x,
(height + tile_size_y - 1) / tile_size_y, depth,
bytes_per_pixel, bytes_per_pixel, addr, buffer, false,
block_height, block_depth, tile_width_spacing);
}
}
static constexpr ConversionArray morton_to_linear_fns = {
MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>,
MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>,
MortonCopy<true, PixelFormat::A8B8G8R8_SINT>,
MortonCopy<true, PixelFormat::A8B8G8R8_UINT>,
MortonCopy<true, PixelFormat::R5G6B5_UNORM>,
MortonCopy<true, PixelFormat::B5G6R5_UNORM>,
MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>,
MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>,
MortonCopy<true, PixelFormat::A2B10G10R10_UINT>,
MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>,
MortonCopy<true, PixelFormat::R8_UNORM>,
MortonCopy<true, PixelFormat::R8_SNORM>,
MortonCopy<true, PixelFormat::R8_SINT>,
MortonCopy<true, PixelFormat::R8_UINT>,
MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>,
MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>,
MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>,
MortonCopy<true, PixelFormat::R16G16B16A16_SINT>,
MortonCopy<true, PixelFormat::R16G16B16A16_UINT>,
MortonCopy<true, PixelFormat::B10G11R11_FLOAT>,
MortonCopy<true, PixelFormat::R32G32B32A32_UINT>,
MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>,
MortonCopy<true, PixelFormat::BC2_UNORM>,
MortonCopy<true, PixelFormat::BC3_UNORM>,
MortonCopy<true, PixelFormat::BC4_UNORM>,
MortonCopy<true, PixelFormat::BC4_SNORM>,
MortonCopy<true, PixelFormat::BC5_UNORM>,
MortonCopy<true, PixelFormat::BC5_SNORM>,
MortonCopy<true, PixelFormat::BC7_UNORM>,
MortonCopy<true, PixelFormat::BC6H_UFLOAT>,
MortonCopy<true, PixelFormat::BC6H_SFLOAT>,
MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>,
MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>,
MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>,
MortonCopy<true, PixelFormat::R32G32B32A32_SINT>,
MortonCopy<true, PixelFormat::R32G32_FLOAT>,
MortonCopy<true, PixelFormat::R32G32_SINT>,
MortonCopy<true, PixelFormat::R32_FLOAT>,
MortonCopy<true, PixelFormat::R16_FLOAT>,
MortonCopy<true, PixelFormat::R16_UNORM>,
MortonCopy<true, PixelFormat::R16_SNORM>,
MortonCopy<true, PixelFormat::R16_UINT>,
MortonCopy<true, PixelFormat::R16_SINT>,
MortonCopy<true, PixelFormat::R16G16_UNORM>,
MortonCopy<true, PixelFormat::R16G16_FLOAT>,
MortonCopy<true, PixelFormat::R16G16_UINT>,
MortonCopy<true, PixelFormat::R16G16_SINT>,
MortonCopy<true, PixelFormat::R16G16_SNORM>,
MortonCopy<true, PixelFormat::R32G32B32_FLOAT>,
MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>,
MortonCopy<true, PixelFormat::R8G8_UNORM>,
MortonCopy<true, PixelFormat::R8G8_SNORM>,
MortonCopy<true, PixelFormat::R8G8_SINT>,
MortonCopy<true, PixelFormat::R8G8_UINT>,
MortonCopy<true, PixelFormat::R32G32_UINT>,
MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>,
MortonCopy<true, PixelFormat::R32_UINT>,
MortonCopy<true, PixelFormat::R32_SINT>,
MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>,
MortonCopy<true, PixelFormat::B8G8R8A8_SRGB>,
MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>,
MortonCopy<true, PixelFormat::BC2_SRGB>,
MortonCopy<true, PixelFormat::BC3_SRGB>,
MortonCopy<true, PixelFormat::BC7_SRGB>,
MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>,
MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>,
MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>,
MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>,
MortonCopy<true, PixelFormat::D32_FLOAT>,
MortonCopy<true, PixelFormat::D16_UNORM>,
MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>,
MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>,
MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>,
};
static constexpr ConversionArray linear_to_morton_fns = {
MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>,
MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>,
MortonCopy<false, PixelFormat::A8B8G8R8_SINT>,
MortonCopy<false, PixelFormat::A8B8G8R8_UINT>,
MortonCopy<false, PixelFormat::R5G6B5_UNORM>,
MortonCopy<false, PixelFormat::B5G6R5_UNORM>,
MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>,
MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>,
MortonCopy<false, PixelFormat::A2B10G10R10_UINT>,
MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>,
MortonCopy<false, PixelFormat::R8_UNORM>,
MortonCopy<false, PixelFormat::R8_SNORM>,
MortonCopy<false, PixelFormat::R8_SINT>,
MortonCopy<false, PixelFormat::R8_UINT>,
MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>,
MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>,
MortonCopy<false, PixelFormat::R16G16B16A16_SINT>,
MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>,
MortonCopy<false, PixelFormat::R16G16B16A16_UINT>,
MortonCopy<false, PixelFormat::B10G11R11_FLOAT>,
MortonCopy<false, PixelFormat::R32G32B32A32_UINT>,
MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>,
MortonCopy<false, PixelFormat::BC2_UNORM>,
MortonCopy<false, PixelFormat::BC3_UNORM>,
MortonCopy<false, PixelFormat::BC4_UNORM>,
MortonCopy<false, PixelFormat::BC4_SNORM>,
MortonCopy<false, PixelFormat::BC5_UNORM>,
MortonCopy<false, PixelFormat::BC5_SNORM>,
MortonCopy<false, PixelFormat::BC7_UNORM>,
MortonCopy<false, PixelFormat::BC6H_UFLOAT>,
MortonCopy<false, PixelFormat::BC6H_SFLOAT>,
// TODO(Subv): Swizzling ASTC formats are not supported
nullptr,
MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>,
MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>,
MortonCopy<false, PixelFormat::R32G32B32A32_SINT>,
MortonCopy<false, PixelFormat::R32G32_FLOAT>,
MortonCopy<false, PixelFormat::R32G32_SINT>,
MortonCopy<false, PixelFormat::R32_FLOAT>,
MortonCopy<false, PixelFormat::R16_FLOAT>,
MortonCopy<false, PixelFormat::R16_UNORM>,
MortonCopy<false, PixelFormat::R16_SNORM>,
MortonCopy<false, PixelFormat::R16_UINT>,
MortonCopy<false, PixelFormat::R16_SINT>,
MortonCopy<false, PixelFormat::R16G16_UNORM>,
MortonCopy<false, PixelFormat::R16G16_FLOAT>,
MortonCopy<false, PixelFormat::R16G16_UINT>,
MortonCopy<false, PixelFormat::R16G16_SINT>,
MortonCopy<false, PixelFormat::R16G16_SNORM>,
MortonCopy<false, PixelFormat::R32G32B32_FLOAT>,
MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>,
MortonCopy<false, PixelFormat::R8G8_UNORM>,
MortonCopy<false, PixelFormat::R8G8_SNORM>,
MortonCopy<false, PixelFormat::R8G8_SINT>,
MortonCopy<false, PixelFormat::R8G8_UINT>,
MortonCopy<false, PixelFormat::R32G32_UINT>,
MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>,
MortonCopy<false, PixelFormat::R32_UINT>,
MortonCopy<false, PixelFormat::R32_SINT>,
nullptr,
nullptr,
nullptr,
MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>,
MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>,
MortonCopy<false, PixelFormat::BC2_SRGB>,
MortonCopy<false, PixelFormat::BC3_SRGB>,
MortonCopy<false, PixelFormat::BC7_SRGB>,
MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>,
MortonCopy<false, PixelFormat::D32_FLOAT>,
MortonCopy<false, PixelFormat::D16_UNORM>,
MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>,
MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>,
MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>,
};
static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) {
switch (mode) {
case MortonSwizzleMode::MortonToLinear:
return morton_to_linear_fns[static_cast<std::size_t>(format)];
case MortonSwizzleMode::LinearToMorton:
return linear_to_morton_fns[static_cast<std::size_t>(format)];
}
UNREACHABLE();
return morton_to_linear_fns[static_cast<std::size_t>(format)];
}
void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
u8* buffer, u8* addr) {
GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth,
tile_width_spacing, buffer, addr);
}
} // namespace VideoCore

View File

@ -1,18 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
#include "video_core/surface.h"
namespace VideoCore {
enum class MortonSwizzleMode { MortonToLinear, LinearToMorton };
void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride,
u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
u8* buffer, u8* addr);
} // namespace VideoCore

View File

@ -28,8 +28,8 @@ namespace VideoCommon {
template <class QueryCache, class HostCounter>
class CounterStreamBase {
public:
explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type)
: cache{cache}, type{type} {}
explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_)
: cache{cache_}, type{type_} {}
/// Updates the state of the stream, enabling or disabling as needed.
void Update(bool enabled) {
@ -334,8 +334,8 @@ private:
template <class HostCounter>
class CachedQueryBase {
public:
explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
: cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_)
: cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {}
virtual ~CachedQueryBase() = default;
CachedQueryBase(CachedQueryBase&&) noexcept = default;

View File

@ -32,7 +32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size
class RasterizerInterface {
public:
virtual ~RasterizerInterface() {}
virtual ~RasterizerInterface() = default;
/// Dispatches a draw invocation
virtual void Draw(bool is_indexed, bool is_instanced) = 0;
@ -76,6 +76,9 @@ public:
/// Sync memory between guest and host.
virtual void SyncGuestHost() = 0;
/// Unmap memory range
virtual void UnmapMemory(VAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
@ -83,6 +86,12 @@ public:
/// Notify the host renderer to wait for previous primitive and compute operations.
virtual void WaitForIdle() = 0;
/// Notify the host renderer to wait for reads and writes to render targets and flush caches.
virtual void FragmentBarrier() = 0;
/// Notify the host renderer to make previous render target writes available.
virtual void TiledCacheBarrier() = 0;
/// Notify the rasterizer to send all written commands to the host GPU.
virtual void FlushCommands() = 0;
@ -90,15 +99,15 @@ public:
virtual void TickFrame() = 0;
/// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
[[nodiscard]] virtual bool AccelerateSurfaceCopy(
const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
return false;
}
/// Attempt to use a faster method to display the framebuffer to screen
virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) {
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
return false;
}
@ -110,12 +119,12 @@ public:
const DiskResourceLoadCallback& callback) {}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
GuestDriverProfile& AccessGuestDriverProfile() {
[[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() {
return guest_driver_profile;
}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
const GuestDriverProfile& AccessGuestDriverProfile() const {
[[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const {
return guest_driver_profile;
}

View File

@ -38,7 +38,7 @@ public:
virtual ~RendererBase();
/// Initialize the renderer
virtual bool Init() = 0;
[[nodiscard]] virtual bool Init() = 0;
/// Shutdown the renderer
virtual void ShutDown() = 0;
@ -49,43 +49,43 @@ public:
// Getter/setter functions:
// ------------------------
f32 GetCurrentFPS() const {
[[nodiscard]] f32 GetCurrentFPS() const {
return m_current_fps;
}
int GetCurrentFrame() const {
[[nodiscard]] int GetCurrentFrame() const {
return m_current_frame;
}
RasterizerInterface& Rasterizer() {
[[nodiscard]] RasterizerInterface& Rasterizer() {
return *rasterizer;
}
const RasterizerInterface& Rasterizer() const {
[[nodiscard]] const RasterizerInterface& Rasterizer() const {
return *rasterizer;
}
Core::Frontend::GraphicsContext& Context() {
[[nodiscard]] Core::Frontend::GraphicsContext& Context() {
return *context;
}
const Core::Frontend::GraphicsContext& Context() const {
[[nodiscard]] const Core::Frontend::GraphicsContext& Context() const {
return *context;
}
Core::Frontend::EmuWindow& GetRenderWindow() {
[[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() {
return render_window;
}
const Core::Frontend::EmuWindow& GetRenderWindow() const {
[[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const {
return render_window;
}
RendererSettings& Settings() {
[[nodiscard]] RendererSettings& Settings() {
return renderer_settings;
}
const RendererSettings& Settings() const {
[[nodiscard]] const RendererSettings& Settings() const {
return renderer_settings;
}

View File

@ -39,8 +39,8 @@ using Operation = const OperationNode&;
constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
char Swizzle(std::size_t component) {
ASSERT(component < 4);
return component["xyzw"];
static constexpr std::string_view SWIZZLE{"xyzw"};
return SWIZZLE.at(component);
}
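The replaced return statement relied on the C identity a[i] == *(a + i) == i[a], so component["xyzw"] was merely an obscure spelling of "xyzw"[component]; the new std::string_view version fetches the same character with bounds checking through at().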
constexpr bool IsGenericAttribute(Attribute::Index index) {
@ -71,7 +71,7 @@ std::string_view GetInputFlags(PixelImap attribute) {
case PixelImap::Unused:
break;
}
UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute);
return {};
}
@ -123,7 +123,7 @@ std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::Primitive
case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
return "TRIANGLES_ADJACENCY";
default:
UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
UNIMPLEMENTED_MSG("topology={}", topology);
return "POINTS";
}
}
@ -137,7 +137,7 @@ std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
case Tegra::Shader::OutputTopology::TriangleStrip:
return "TRIANGLE_STRIP";
default:
UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
UNIMPLEMENTED_MSG("Unknown output topology: {}", topology);
return "points";
}
}
@ -187,8 +187,8 @@ std::string TextureType(const MetaTexture& meta) {
class ARBDecompiler final {
public:
explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier);
explicit ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier);
std::string Code() const {
return shader_source;
@ -224,7 +224,7 @@ private:
std::string Visit(const Node& node);
std::pair<std::string, std::size_t> BuildCoords(Operation);
std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation);
std::string BuildAoffi(Operation);
std::string GlobalMemoryPointer(const GmemNode& gmem);
void Exit();
@ -376,9 +376,11 @@ private:
std::string temporary = AllocTemporary();
std::string address;
std::string_view opname;
bool robust = false;
if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
address = GlobalMemoryPointer(*gmem);
opname = "ATOM";
robust = true;
} else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
opname = "ATOMS";
@ -386,7 +388,15 @@ private:
UNREACHABLE();
return "{0, 0, 0, 0}";
}
if (robust) {
AddLine("IF NE.x;");
}
AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
if (robust) {
AddLine("ELSE;");
AddLine("MOV.S {}, 0;", temporary);
AddLine("ENDIF;");
}
return temporary;
}
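The robust path brackets global-memory atomics with the condition code produced by GlobalMemoryPointer, approximating robustBufferAccess semantics: an out-of-bounds atomic is skipped and yields 0. A C++ analogue of the emitted ARB sequence (illustrative only; the function name is invented):

    #include <atomic>
    #include <cstdint>

    uint32_t RobustAtomicAdd(uint32_t* ptr, uint32_t offset, uint32_t length,
                             uint32_t value) {
        if (offset < length) { // IF NE.x; with CC from the bounds check
            // ATOM.ADD returns the old value, like fetch_add.
            return std::atomic_ref<uint32_t>{*ptr}.fetch_add(value);
        }
        return 0; // ELSE; MOV.S temporary, 0; ENDIF;
    }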
@ -792,9 +802,9 @@ private:
};
};
ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier)
: device{device}, ir{ir}, registry{registry}, stage{stage} {
ARBDecompiler::ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier)
: device{device_}, ir{ir_}, registry{registry_}, stage{stage_} {
DefineGlobalMemory();
AddLine("TEMP RC;");
@ -980,10 +990,9 @@ void ARBDecompiler::DeclareLocalMemory() {
}
void ARBDecompiler::DeclareGlobalMemory() {
const std::size_t num_entries = ir.GetGlobalMemory().size();
const size_t num_entries = ir.GetGlobalMemory().size();
if (num_entries > 0) {
const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2;
AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1);
AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1);
}
}
@ -1125,44 +1134,44 @@ void ARBDecompiler::VisitAST(const ASTNode& node) {
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
} else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(ast->condition);
} else if (const auto if_then = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(if_then->condition);
ResetTemporaries();
AddLine("MOVC.U RC.x, {};", condition);
AddLine("IF NE.x;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = if_then->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
AddLine("ENDIF;");
} else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
} else if (const auto if_else = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
AddLine("ELSE;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = if_else->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
} else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
VisitBlock(ast->nodes);
} else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
} else if (const auto decoded = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
VisitBlock(decoded->nodes);
} else if (const auto var_set = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition));
ResetTemporaries();
} else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(ast->condition);
} else if (const auto do_while = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
const std::string condition = VisitExpression(do_while->condition);
ResetTemporaries();
AddLine("REP;");
for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
for (ASTNode current = do_while->nodes.GetFirst(); current; current = current->GetNext()) {
VisitAST(current);
}
AddLine("MOVC.U RC.x, {};", condition);
AddLine("BRK (NE.x);");
AddLine("ENDREP;");
} else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
const bool is_true = ExprIsTrue(ast->condition);
} else if (const auto ast_return = std::get_if<ASTReturn>(&*node->GetInnerData())) {
const bool is_true = ExprIsTrue(ast_return->condition);
if (!is_true) {
AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
AddLine("MOVC.U RC.x, {};", VisitExpression(ast_return->condition));
AddLine("IF NE.x;");
ResetTemporaries();
}
if (ast->kills) {
if (ast_return->kills) {
AddLine("KIL TR;");
} else {
Exit();
@ -1170,11 +1179,11 @@ void ARBDecompiler::VisitAST(const ASTNode& node) {
if (!is_true) {
AddLine("ENDIF;");
}
} else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
if (ExprIsTrue(ast->condition)) {
} else if (const auto ast_break = std::get_if<ASTBreak>(&*node->GetInnerData())) {
if (ExprIsTrue(ast_break->condition)) {
AddLine("BRK;");
} else {
AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
AddLine("MOVC.U RC.x, {};", VisitExpression(ast_break->condition));
AddLine("BRK (NE.x);");
ResetTemporaries();
}
@ -1342,7 +1351,7 @@ std::string ARBDecompiler::Visit(const Node& node) {
GetGenericAttributeIndex(index), swizzle);
}
}
UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
UNIMPLEMENTED_MSG("Unimplemented input attribute={}", index);
break;
}
return "{0, 0, 0, 0}.x";
@ -1363,7 +1372,8 @@ std::string ARBDecompiler::Visit(const Node& node) {
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
std::string temporary = AllocTemporary();
AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem));
AddLine("MOV {}, 0;", temporary);
AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem));
return temporary;
}
@ -1406,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) {
return {};
}
std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
UNIMPLEMENTED_IF(meta.sampler.is_indexed);
UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array &&
meta.sampler.type == Tegra::Shader::TextureType::TextureCube;
const std::size_t count = operation.GetOperandsCount();
std::string temporary = AllocVectorTemporary();
std::size_t i = 0;
@ -1419,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operati
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
}
if (meta.sampler.is_array) {
AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array));
++i;
}
if (meta.sampler.is_shadow) {
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
std::string compare = Visit(meta.depth_compare);
if (is_extended) {
ASSERT(i == 4);
std::string extra_coord = AllocVectorTemporary();
AddLine("MOV.F {}.x, {};", extra_coord, compare);
return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0};
}
AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare);
++i;
}
return {std::move(temporary), i};
return {temporary, temporary, i};
}
std::string ARBDecompiler::BuildAoffi(Operation operation) {
@ -1441,18 +1460,21 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) {
}
std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) {
// Read a bindless SSBO, return its address and set CC accordingly
// address = c[binding].xy
// length = c[binding].z
const u32 binding = global_memory_names.at(gmem.GetDescriptor());
const char result_swizzle = binding % 2 == 0 ? 'x' : 'y';
const std::string pointer = AllocLongVectorTemporary();
std::string temporary = AllocTemporary();
const u32 local_index = binding / 2;
AddLine("PK64.U {}, c[{}];", pointer, local_index);
AddLine("PK64.U {}, c[{}];", pointer, binding);
AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()),
Visit(gmem.GetBaseAddress()));
AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary);
AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer);
AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer);
// Compare offset to length and set CC
AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding);
return fmt::format("{}.x", pointer);
}
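Spelled out, the contract of this routine is: pack the 64-bit SSBO base from c[binding].xy, convert the 32-bit offset (real address minus base address) to 64 bits, add it into pointer.x, and leave CC holding the result of offset < c[binding].z. The predicated LOAD.U32 (NE.x) earlier in Visit, the IF NE.x around STORE.U32 in Assign, and the atomic guard above all branch on that same flag.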
@ -1463,9 +1485,7 @@ void ARBDecompiler::Exit() {
}
const auto safe_get_register = [this](u32 reg) -> std::string {
// TODO(Rodrigo): Replace with contains once C++20 releases
const auto& used_registers = ir.GetRegisters();
if (used_registers.find(reg) != used_registers.end()) {
if (ir.GetRegisters().contains(reg)) {
return fmt::format("R{}.x", reg);
}
return "{0, 0, 0, 0}.x";
@ -1552,7 +1572,9 @@ std::string ARBDecompiler::Assign(Operation operation) {
ResetTemporaries();
return {};
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
AddLine("IF NE.x;");
AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem));
AddLine("ENDIF;");
ResetTemporaries();
return {};
} else {
@ -1844,7 +1866,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
std::string ARBDecompiler::Texture(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
std::string_view opcode = "TEX";
std::string extra;
@ -1873,7 +1895,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
}
}
AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id,
TextureType(meta), BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
@ -1882,7 +1904,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
std::string ARBDecompiler::TextureGather(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
std::string comp;
if (!meta.sampler.is_shadow) {
@ -1892,7 +1914,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) {
AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
TextureType(meta), BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
}
@ -1930,13 +1952,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) {
std::string ARBDecompiler::TexelFetch(Operation operation) {
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const auto [temporary, swizzle] = BuildCoords(operation);
const auto [coords, temporary, swizzle] = BuildCoords(operation);
if (!meta.sampler.is_buffer) {
ASSERT(swizzle < 4);
AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
}
AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta),
BuildAoffi(operation));
AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
return fmt::format("{}.x", temporary);
@ -1947,7 +1969,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) {
const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
const std::string ddx = AllocVectorTemporary();
const std::string ddy = AllocVectorTemporary();
const std::string coord = BuildCoords(operation).first;
const std::string coord = std::get<1>(BuildCoords(operation));
const std::size_t num_components = meta.derivates.size() / 2;
for (std::size_t index = 0; index < num_components; ++index) {

View File

@ -22,11 +22,11 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
: VideoCommon::BufferBlock{cpu_addr, size} {
Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
: BufferBlock{cpu_addr_, size_} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}
@ -34,14 +34,14 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
Buffer::~Buffer() = default;
void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
data);
void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(data_size), data);
}
void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
const GLintptr gl_offset = static_cast<GLintptr>(offset);
if (read_buffer.handle == 0) {
read_buffer.Create();
@ -54,17 +54,16 @@ void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
}
void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t size) {
std::size_t copy_size) {
glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size));
}
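For readers unfamiliar with the DSA (direct state access) calls this file leans
on: glNamedBufferData, glNamedBufferSubData, and glCopyNamedBufferSubData act
on buffer names directly, with no bind-to-target step. A self-contained sketch
under that assumption; the helper name is illustrative:

    #include <vector>
    #include <glad/glad.h>

    // Create two buffers, upload into one, and copy across, all without binding.
    GLuint MakeAndCopy(const std::vector<unsigned char>& src_data) {
        GLuint src = 0;
        GLuint dst = 0;
        glCreateBuffers(1, &src);
        glCreateBuffers(1, &dst);
        const auto size = static_cast<GLsizeiptr>(src_data.size());
        glNamedBufferData(src, size, src_data.data(), GL_DYNAMIC_DRAW);
        glNamedBufferData(dst, size, nullptr, GL_DYNAMIC_DRAW);
        glCopyNamedBufferSubData(src, dst, 0, 0, size);  // same call Buffer::CopyFrom uses
        glDeleteBuffers(1, &src);
        return dst;
    }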
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device_, std::size_t stream_size)
: GenericBufferCache{rasterizer, gpu_memory, cpu_memory,
std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
device{device_} {
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
const Device& device_, OGLStreamBuffer& stream_buffer_,
StateTracker& state_tracker)
: GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
if (!device.HasFastBufferSubData()) {
return;
}

View File

@ -22,18 +22,19 @@ namespace OpenGL {
class Device;
class OGLStreamBuffer;
class RasterizerOpenGL;
class StateTracker;
class Buffer : public VideoCommon::BufferBlock {
public:
explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
~Buffer();
void Upload(std::size_t offset, std::size_t size, const u8* data);
void Upload(std::size_t offset, std::size_t data_size, const u8* data);
void Download(std::size_t offset, std::size_t size, u8* data);
void Download(std::size_t offset, std::size_t data_size, u8* data);
void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t size);
std::size_t copy_size);
GLuint Handle() const noexcept {
return gl_buffer.handle;
@ -54,7 +55,8 @@ class OGLBufferCache final : public GenericBufferCache {
public:
explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device, std::size_t stream_size);
const Device& device, OGLStreamBuffer& stream_buffer,
StateTracker& state_tracker);
~OGLBufferCache();
BufferInfo GetEmptyBuffer(std::size_t) override;

View File

@ -5,9 +5,11 @@
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <optional>
#include <span>
#include <vector>
#include <glad/glad.h>
@ -27,27 +29,29 @@ constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
constexpr std::array LimitUBOs = {
constexpr std::array LIMIT_UBOS = {
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
constexpr std::array LimitSSBOs = {
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS,
};
constexpr std::array LIMIT_SSBOS = {
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
constexpr std::array LimitImages = {
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS,
};
constexpr std::array LIMIT_SAMPLERS = {
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS,
};
constexpr std::array LIMIT_IMAGES = {
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS,
};
template <typename T>
T GetInteger(GLenum pname) {
@ -76,8 +80,8 @@ std::vector<std::string_view> GetExtensions() {
return extensions;
}
bool HasExtension(const std::vector<std::string_view>& images, std::string_view extension) {
return std::find(images.begin(), images.end(), extension) != images.end();
bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) {
return std::ranges::find(extensions, extension) != extensions.end();
}
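Two C++20 touches land in this hunk: std::span lets the helper accept any
contiguous sequence of names without templates or copies, and std::ranges::find
drops the begin()/end() pair. A minimal sketch outside the diff:

    #include <algorithm>
    #include <span>
    #include <string_view>
    #include <vector>

    bool Contains(std::span<const std::string_view> haystack, std::string_view needle) {
        return std::ranges::find(haystack, needle) != haystack.end();
    }

    int main() {
        const std::vector<std::string_view> exts{"GL_KHR_debug", "GL_EXT_debug_tool"};
        return Contains(exts, "GL_EXT_debug_tool") ? 0 : 1;
    }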
u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
@ -91,8 +95,8 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
std::ranges::transform(LIMIT_UBOS, max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
return max;
}
@ -115,9 +119,10 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
bindings[stage] = {
Extract(base_ubo, num_ubos, total_ubos / NumStages, LimitUBOs[stage]),
Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LimitSSBOs[stage]),
Extract(base_samplers, num_samplers, total_samplers / NumStages, LimitSamplers[stage])};
Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]),
Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]),
Extract(base_samplers, num_samplers, total_samplers / NumStages,
LIMIT_SAMPLERS[stage])};
}
u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
@ -130,7 +135,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
// Reserve at least 4 image bindings on the fragment stage.
bindings[4].image =
Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]);
// This is guaranteed to be at least 1.
const u32 total_extracted_images = num_images / (NumStages - 1);
@ -142,7 +147,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
continue;
}
bindings[stage].image =
Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]);
}
// Compute doesn't care about any of this.
@ -188,17 +193,22 @@ bool IsASTCSupported() {
return true;
}
[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) {
const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
}
} // Anonymous namespace
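The new IsDebugToolAttached helper keys off two environment variables plus an
extension query. One detail worth a sketch: std::getenv returns nullptr when a
variable is unset, so presence alone is the signal (variable names taken from
the diff):

    #include <cstdlib>

    bool LaunchedUnderNsight() {
        // Either variable being present implies the Nsight injector is active.
        return std::getenv("NVTX_INJECTION64_PATH") != nullptr ||
               std::getenv("NSIGHT_LAUNCHED") != nullptr;
    }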
Device::Device()
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
const bool is_intel = vendor == "Intel";
bool disable_fast_buffer_sub_data = false;
if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
@ -207,9 +217,8 @@ Device::Device()
"Beta driver 443.24 is known to have issues. There might be performance issues.");
disable_fast_buffer_sub_data = true;
}
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
@ -223,8 +232,10 @@ Device::Device()
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
has_broken_texture_view_formats = is_amd || is_intel;
has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
has_debugging_tool_attached = IsDebugToolAttached(extensions);
// At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
// uniform buffers as "push constants"
@ -239,6 +250,8 @@ Device::Device()
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
LOG_INFO(Render_OpenGL, "Renderer_BrokenTextureViewFormats: {}",
has_broken_texture_view_formats);
if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");

View File

@ -36,11 +36,11 @@ public:
return GetBaseBindings(static_cast<std::size_t>(shader_type));
}
std::size_t GetUniformBufferAlignment() const {
size_t GetUniformBufferAlignment() const {
return uniform_buffer_alignment;
}
std::size_t GetShaderStorageBufferAlignment() const {
size_t GetShaderStorageBufferAlignment() const {
return shader_storage_alignment;
}
@ -96,6 +96,10 @@ public:
return has_precise_bug;
}
bool HasBrokenTextureViewFormats() const {
return has_broken_texture_view_formats;
}
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
@ -104,6 +108,10 @@ public:
return has_nv_viewport_array2;
}
bool HasDebuggingToolAttached() const {
return has_debugging_tool_attached;
}
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
@ -118,8 +126,8 @@ private:
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
size_t uniform_buffer_alignment{};
size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
u32 max_varyings{};
u32 max_compute_shared_memory_size{};
@ -133,8 +141,10 @@ private:
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
bool has_broken_texture_view_formats{};
bool has_fast_buffer_sub_data{};
bool has_nv_viewport_array2{};
bool has_debugging_tool_attached{};
bool use_assembly_shaders{};
bool use_asynchronous_shaders{};
};

View File

@ -11,10 +11,10 @@
namespace OpenGL {
GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {}
GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {}
GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed)
: FenceBase(address, payload, is_stubbed) {}
GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: FenceBase{address_, payload_, is_stubbed_} {}
GLInnerFence::~GLInnerFence() = default;
@ -45,10 +45,10 @@ void GLInnerFence::Wait() {
glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED);
}
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
TextureCacheOpenGL& texture_cache,
OGLBufferCache& buffer_cache, QueryCache& query_cache)
: GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {}
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
Tegra::GPU& gpu_, TextureCache& texture_cache_,
OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
return std::make_shared<GLInnerFence>(value, is_stubbed);

View File

@ -17,8 +17,8 @@ namespace OpenGL {
class GLInnerFence : public VideoCommon::FenceBase {
public:
GLInnerFence(u32 payload, bool is_stubbed);
GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed);
explicit GLInnerFence(u32 payload_, bool is_stubbed_);
explicit GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_);
~GLInnerFence();
void Queue();
@ -33,13 +33,13 @@ private:
using Fence = std::shared_ptr<GLInnerFence>;
using GenericFenceManager =
VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>;
VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
class FenceManagerOpenGL final : public GenericFenceManager {
public:
explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache,
QueryCache& query_cache);
explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
TextureCache& texture_cache_, OGLBufferCache& buffer_cache_,
QueryCache& query_cache_);
protected:
Fence CreateFence(u32 value, bool is_stubbed) override;

View File

@ -1,85 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <tuple>
#include <unordered_map>
#include <utility>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
namespace OpenGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using VideoCore::Surface::SurfaceType;
FramebufferCacheOpenGL::FramebufferCacheOpenGL() = default;
FramebufferCacheOpenGL::~FramebufferCacheOpenGL() = default;
GLuint FramebufferCacheOpenGL::GetFramebuffer(const FramebufferCacheKey& key) {
const auto [entry, is_cache_miss] = cache.try_emplace(key);
auto& framebuffer{entry->second};
if (is_cache_miss) {
framebuffer = CreateFramebuffer(key);
}
return framebuffer.handle;
}
OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheKey& key) {
OGLFramebuffer framebuffer;
framebuffer.Create();
// TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs.
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle);
if (key.zeta) {
const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil;
const GLenum attach_target = stencil ? GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT;
key.zeta->Attach(attach_target, GL_DRAW_FRAMEBUFFER);
}
std::size_t num_buffers = 0;
std::array<GLenum, Maxwell::NumRenderTargets> targets;
for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
if (!key.colors[index]) {
targets[index] = GL_NONE;
continue;
}
const GLenum attach_target = GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index);
key.colors[index]->Attach(attach_target, GL_DRAW_FRAMEBUFFER);
const u32 attachment = (key.color_attachments >> (BitsPerAttachment * index)) & 0b1111;
targets[index] = GL_COLOR_ATTACHMENT0 + attachment;
num_buffers = index + 1;
}
if (num_buffers > 0) {
glDrawBuffers(static_cast<GLsizei>(num_buffers), std::data(targets));
} else {
glDrawBuffer(GL_NONE);
}
return framebuffer;
}
std::size_t FramebufferCacheKey::Hash() const noexcept {
std::size_t hash = std::hash<View>{}(zeta);
for (const auto& color : colors) {
hash ^= std::hash<View>{}(color);
}
hash ^= static_cast<std::size_t>(color_attachments) << 16;
return hash;
}
bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const noexcept {
return std::tie(colors, zeta, color_attachments) ==
std::tie(rhs.colors, rhs.zeta, rhs.color_attachments);
}
} // namespace OpenGL

View File

@ -1,68 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <unordered_map>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
namespace OpenGL {
constexpr std::size_t BitsPerAttachment = 4;
struct FramebufferCacheKey {
View zeta;
std::array<View, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors;
u32 color_attachments = 0;
std::size_t Hash() const noexcept;
bool operator==(const FramebufferCacheKey& rhs) const noexcept;
bool operator!=(const FramebufferCacheKey& rhs) const noexcept {
return !operator==(rhs);
}
void SetAttachment(std::size_t index, u32 attachment) {
color_attachments |= attachment << (BitsPerAttachment * index);
}
};
} // namespace OpenGL
namespace std {
template <>
struct hash<OpenGL::FramebufferCacheKey> {
std::size_t operator()(const OpenGL::FramebufferCacheKey& k) const noexcept {
return k.Hash();
}
};
} // namespace std
namespace OpenGL {
class FramebufferCacheOpenGL {
public:
FramebufferCacheOpenGL();
~FramebufferCacheOpenGL();
GLuint GetFramebuffer(const FramebufferCacheKey& key);
private:
OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key);
std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache;
};
} // namespace OpenGL

View File

@ -30,11 +30,9 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) {
} // Anonymous namespace
QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
Tegra::MemoryManager& gpu_memory)
: VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>(
rasterizer, maxwell3d, gpu_memory),
gl_rasterizer{rasterizer} {}
QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_,
Tegra::MemoryManager& gpu_memory_)
: QueryCacheBase(rasterizer_, maxwell3d_, gpu_memory_), gl_rasterizer{rasterizer_} {}
QueryCache::~QueryCache() = default;
@ -59,10 +57,11 @@ bool QueryCache::AnyCommandQueued() const noexcept {
return gl_rasterizer.AnyCommandQueued();
}
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type)
: VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
type{type}, query{cache.AllocateQuery(type)} {
HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_)
: HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, query{
cache.AllocateQuery(
type)} {
glBeginQuery(GetTarget(type), query.handle);
}
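For context, the GL query round trip that HostCounter splits across its
constructor (glBeginQuery above) and its EndQuery/BlockingQuery methods, shown
here as one standalone sketch:

    #include <cstdint>
    #include <glad/glad.h>

    std::uint64_t CountSamplesPassed(void (*draw)()) {
        GLuint query = 0;
        glCreateQueries(GL_SAMPLES_PASSED, 1, &query);
        glBeginQuery(GL_SAMPLES_PASSED, query);
        draw();                         // issue the work to be measured
        glEndQuery(GL_SAMPLES_PASSED);
        GLuint64 result = 0;
        glGetQueryObjectui64v(query, GL_QUERY_RESULT, &result);  // blocks until ready
        glDeleteQueries(1, &query);
        return static_cast<std::uint64_t>(result);
    }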
@ -86,13 +85,14 @@ u64 HostCounter::BlockingQuery() const {
return static_cast<u64>(value);
}
CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
: VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
CachedQuery::CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_,
u8* host_ptr_)
: CachedQueryBase{cpu_addr_, host_ptr_}, cache{&cache_}, type{type_} {}
CachedQuery::~CachedQuery() = default;
CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
: VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
: CachedQueryBase(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
cache = rhs.cache;

View File

@ -29,8 +29,8 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
class QueryCache final
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> {
public:
explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d,
Tegra::MemoryManager& gpu_memory);
explicit QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_,
Tegra::MemoryManager& gpu_memory_);
~QueryCache();
OGLQuery AllocateQuery(VideoCore::QueryType type);
@ -46,8 +46,8 @@ private:
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
public:
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type);
explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_);
~HostCounter();
void EndQuery();
@ -62,8 +62,8 @@ private:
class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
public:
explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
u8* host_ptr);
explicit CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_,
u8* host_ptr_);
~CachedQuery() override;
CachedQuery(CachedQuery&& rhs) noexcept;

View File

@ -25,12 +25,15 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/shader_cache.h"
#include "video_core/texture_cache/texture_cache.h"
namespace OpenGL {
@ -55,18 +58,32 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
namespace {
constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
constexpr size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
constexpr size_t TOTAL_CONST_BUFFER_BYTES =
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
constexpr size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
constexpr size_t MAX_TEXTURES = 192;
constexpr size_t MAX_IMAGES = 48;
struct TextureHandle {
constexpr TextureHandle(u32 data, bool via_header_index) {
const Tegra::Texture::TextureHandle handle{data};
image = handle.tic_id;
sampler = via_header_index ? image : handle.tsc_id.Value();
}
u32 image;
u32 sampler;
};
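A hedged note on what TextureHandle unpacks: a Maxwell texture handle packs the
TIC (texture descriptor) index and TSC (sampler descriptor) index into one
32-bit word, and via-header-index sampling reuses the TIC index as the sampler
index. Assuming the usual 20/12 bit split (the authoritative layout lives in
Tegra::Texture::TextureHandle):

    #include <cstdint>

    struct UnpackedHandle {
        std::uint32_t image;    // TIC id: which texture descriptor
        std::uint32_t sampler;  // TSC id: which sampler descriptor
    };

    UnpackedHandle Unpack(std::uint32_t raw, bool via_header_index) {
        const std::uint32_t tic = raw & 0xfffffu;        // assumed bits [0, 20)
        const std::uint32_t tsc = (raw >> 20) & 0xfffu;  // assumed bits [20, 32)
        return {tic, via_header_index ? tic : tsc};
    }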
template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
ShaderType shader_type, std::size_t index = 0) {
TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry,
ShaderType shader_type, size_t index = 0) {
if constexpr (std::is_same_v<Entry, SamplerEntry>) {
if (entry.is_separated) {
const u32 buffer_1 = entry.buffer;
@ -75,21 +92,16 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
const u32 offset_2 = entry.secondary_offset;
const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
return engine.GetTextureInfo(handle_1 | handle_2);
return TextureHandle(handle_1 | handle_2, via_header_index);
}
}
if (entry.is_bindless) {
const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
return engine.GetTextureInfo(handle);
}
const auto& gpu_profile = engine.AccessGuestDriverProfile();
const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
return engine.GetStageTexture(shader_type, offset);
} else {
return engine.GetTexture(offset);
const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
return TextureHandle(raw, via_header_index);
}
const u32 buffer = engine.GetBoundBuffer();
const u64 offset = (entry.offset + index) * sizeof(u32);
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
}
std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
@ -97,7 +109,6 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
if (!entry.IsIndirect()) {
return entry.GetSize();
}
if (buffer.size > Maxwell::MaxConstBufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
Maxwell::MaxConstBufferSize);
@ -131,7 +142,7 @@ std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
case 43:
return {GL_BACK_SECONDARY_COLOR_NV, 0};
}
UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
UNIMPLEMENTED_MSG("index={}", index);
return {GL_POSITION, 0};
}
@ -139,35 +150,68 @@ void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) {
if (num_entries == 0) {
void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
if (num_ssbos == 0) {
return;
}
if (num_entries % 2 == 1) {
pointers[num_entries] = 0;
glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
reinterpret_cast<const GLuint*>(ssbos));
}
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) {
return ImageViewType::Buffer;
}
const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2);
glProgramLocalParametersI4uivNV(target, 0, num_vectors,
reinterpret_cast<const GLuint*>(pointers));
switch (entry.type) {
case Tegra::Shader::TextureType::Texture1D:
return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D;
case Tegra::Shader::TextureType::Texture2D:
return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D;
case Tegra::Shader::TextureType::Texture3D:
return ImageViewType::e3D;
case Tegra::Shader::TextureType::TextureCube:
return entry.is_array ? ImageViewType::CubeArray : ImageViewType::Cube;
}
UNREACHABLE();
return ImageViewType::e2D;
}
ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {
switch (entry.type) {
case Tegra::Shader::ImageType::Texture1D:
return ImageViewType::e1D;
case Tegra::Shader::ImageType::Texture1DArray:
return ImageViewType::e1DArray;
case Tegra::Shader::ImageType::Texture2D:
return ImageViewType::e2D;
case Tegra::Shader::ImageType::Texture2DArray:
return ImageViewType::e2DArray;
case Tegra::Shader::ImageType::Texture3D:
return ImageViewType::e3D;
case Tegra::Shader::ImageType::TextureBuffer:
return ImageViewType::Buffer;
}
UNREACHABLE();
return ImageViewType::e2D;
}
} // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory, const Device& device_,
RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_)
: RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
: RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker),
shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device),
stream_buffer(device, state_tracker),
texture_cache_runtime(device, program_manager, state_tracker),
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
query_cache(*this, maxwell3d, gpu_memory),
buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE),
buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
async_shaders(emu_window) {
CheckExtensions();
async_shaders(emu_window_) {
unified_uniform_buffer.Create();
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
@ -178,7 +222,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra:
nullptr, 0);
}
}
if (device.UseAsynchronousShaders()) {
async_shaders.AllocateWorkers();
}
@ -190,14 +233,6 @@ RasterizerOpenGL::~RasterizerOpenGL() {
}
}
void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
LOG_WARNING(
Render_OpenGL,
"Anisotropic filter is not supported! This can cause graphical issues in some games.");
}
}
void RasterizerOpenGL::SetupVertexFormat() {
auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexFormats]) {
@ -320,10 +355,16 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
return info.offset;
}
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
void RasterizerOpenGL::SetupShaders() {
MICROPROFILE_SCOPE(OpenGL_Shader);
u32 clip_distances = 0;
std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
image_view_indices.clear();
sampler_handles.clear();
texture_cache.SynchronizeGraphicsDescriptors();
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = maxwell3d.regs.shader_config[index];
const auto program{static_cast<Maxwell::ShaderProgram>(index)};
@ -342,7 +383,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
continue;
}
// Currently these stages are not supported in the OpenGL backend.
// TODO(Blinkhawk): Port tessellation shaders from Vulkan to OpenGL
if (program == Maxwell::ShaderProgram::TesselationControl ||
@ -351,7 +391,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
}
Shader* const shader = shader_cache.GetStageProgram(program, async_shaders);
const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0;
switch (program) {
case Maxwell::ShaderProgram::VertexA:
@ -367,14 +406,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
default:
UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index,
shader_config.enable.Value(), shader_config.offset);
break;
}
// Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1;
const size_t stage = index == 0 ? 0 : index - 1;
shaders[stage] = shader;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(stage, shader);
SetupDrawImages(stage, shader);
SetupDrawTextures(shader, stage);
SetupDrawImages(shader, stage);
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
@ -388,9 +430,23 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
++index;
}
}
SyncClipEnabled(clip_distances);
maxwell3d.dirty.flags[Dirty::Shaders] = false;
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
size_t image_view_index = 0;
size_t texture_index = 0;
size_t image_index = 0;
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
const Shader* const shader = shaders[stage];
if (shader) {
const auto base = device.GetBaseBindings(stage);
BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
texture_index, image_index);
}
}
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@ -421,98 +477,6 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
shader_cache.LoadDiskCache(title_id, stop_loading, callback);
}
void RasterizerOpenGL::ConfigureFramebuffers() {
MICROPROFILE_SCOPE(OpenGL_Framebuffer);
if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) {
return;
}
maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false;
texture_cache.GuardRenderTargets(true);
View depth_surface = texture_cache.GetDepthBufferSurface(true);
const auto& regs = maxwell3d.regs;
UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0);
// Bind the framebuffer surfaces
FramebufferCacheKey key;
const auto colors_count = static_cast<std::size_t>(regs.rt_control.count);
for (std::size_t index = 0; index < colors_count; ++index) {
View color_surface{texture_cache.GetColorBufferSurface(index, true)};
if (!color_surface) {
continue;
}
// Assume that a surface will be written to if it is used as a framebuffer, even
// if the shader doesn't actually write to it.
texture_cache.MarkColorBufferInUse(index);
key.SetAttachment(index, regs.rt_control.GetMap(index));
key.colors[index] = std::move(color_surface);
}
if (depth_surface) {
// Assume that a surface will be written to if it is used as a framebuffer, even if
// the shader doesn't actually write to it.
texture_cache.MarkDepthBufferInUse();
key.zeta = std::move(depth_surface);
}
texture_cache.GuardRenderTargets(false);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) {
const auto& regs = maxwell3d.regs;
texture_cache.GuardRenderTargets(true);
View color_surface;
if (using_color) {
// Determine if we have to preserve the contents.
// First we have to make sure all clear masks are enabled.
bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G ||
!regs.clear_buffers.B || !regs.clear_buffers.A;
const std::size_t index = regs.clear_buffers.RT;
if (regs.clear_flags.scissor) {
// Then we have to confirm scissor testing clears the whole image.
const auto& scissor = regs.scissor_test[0];
preserve_contents |= scissor.min_x > 0;
preserve_contents |= scissor.min_y > 0;
preserve_contents |= scissor.max_x < regs.rt[index].width;
preserve_contents |= scissor.max_y < regs.rt[index].height;
}
color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents);
texture_cache.MarkColorBufferInUse(index);
}
View depth_surface;
if (using_depth_stencil) {
bool preserve_contents = false;
if (regs.clear_flags.scissor) {
// For depth stencil clears we only have to confirm scissor test covers the whole image.
const auto& scissor = regs.scissor_test[0];
preserve_contents |= scissor.min_x > 0;
preserve_contents |= scissor.min_y > 0;
preserve_contents |= scissor.max_x < regs.zeta_width;
preserve_contents |= scissor.max_y < regs.zeta_height;
}
depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents);
texture_cache.MarkDepthBufferInUse();
}
texture_cache.GuardRenderTargets(false);
FramebufferCacheKey key;
key.colors[0] = std::move(color_surface);
key.zeta = std::move(depth_surface);
state_tracker.NotifyFramebuffer();
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key));
}
void RasterizerOpenGL::Clear() {
if (!maxwell3d.ShouldExecute()) {
return;
@ -527,8 +491,9 @@ void RasterizerOpenGL::Clear() {
regs.clear_buffers.A) {
use_color = true;
state_tracker.NotifyColorMask0();
glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0,
const GLuint index = regs.clear_buffers.RT;
state_tracker.NotifyColorMask(index);
glColorMaski(index, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0,
regs.clear_buffers.B != 0, regs.clear_buffers.A != 0);
// TODO(Rodrigo): Determine if clamping is used on clears
@ -561,15 +526,17 @@ void RasterizerOpenGL::Clear() {
state_tracker.NotifyScissor0();
glDisablei(GL_SCISSOR_TEST, 0);
}
UNIMPLEMENTED_IF(regs.clear_flags.viewport);
ConfigureClearFramebuffer(use_color, use_depth || use_stencil);
if (use_color) {
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
{
auto lock = texture_cache.AcquireLock();
texture_cache.UpdateRenderTargets(true);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
}
if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
}
if (use_depth && use_stencil) {
glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
} else if (use_depth) {
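The clear now honors regs.clear_buffers.RT instead of hardcoding draw buffer 0.
As a standalone reference for the glClearBuffer* family used here:

    #include <glad/glad.h>

    // glClearBufferfv clears one color attachment by draw-buffer index;
    // glClearBufferfi clears depth and stencil together (its drawbuffer must be 0).
    void ClearAttachment(GLint rt_index, const GLfloat color[4], GLfloat depth, GLint stencil) {
        glClearBufferfv(GL_COLOR, rt_index, color);
        glClearBufferfi(GL_DEPTH_STENCIL, 0, depth, stencil);
    }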
@ -626,16 +593,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
const bool invalidated = buffer_cache.Map(buffer_size);
if (invalidated) {
// When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
auto& dirty = maxwell3d.dirty.flags;
dirty[Dirty::VertexBuffers] = true;
for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
dirty[index] = true;
}
}
buffer_cache.Map(buffer_size);
// Prepare vertex array format.
SetupVertexFormat();
@ -659,22 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
SetupShaders(primitive_mode);
texture_cache.GuardSamplers(false);
ConfigureFramebuffers();
auto lock = texture_cache.AcquireLock();
SetupShaders();
// Signal the buffer cache that we are not going to upload more things.
buffer_cache.Unmap();
texture_cache.UpdateRenderTargets(false);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
program_manager.BindGraphicsPipeline();
if (texture_cache.TextureBarrier()) {
glTextureBarrier();
}
const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology);
BeginTransformFeedback(primitive_mode);
const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance);
@ -726,15 +678,13 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Acquire();
current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr);
program_manager.BindCompute(kernel->GetHandle());
Shader* const kernel = shader_cache.GetComputeKernel(code_addr);
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
auto lock = texture_cache.AcquireLock();
BindComputeTextures(kernel);
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);
SetupComputeConstBuffers(kernel);
@ -743,7 +693,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Unmap();
const auto& launch_desc = kepler_compute.launch_description;
program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
@ -764,7 +713,10 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.FlushRegion(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.DownloadMemory(addr, size);
}
buffer_cache.FlushRegion(addr, size);
query_cache.FlushRegion(addr, size);
}
@ -773,7 +725,8 @@ bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
if (!Settings::IsGPULevelHigh()) {
return buffer_cache.MustFlushRegion(addr, size);
}
return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size);
return texture_cache.IsRegionGpuModified(addr, size) ||
buffer_cache.MustFlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@ -781,7 +734,10 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.InvalidateRegion(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.WriteMemory(addr, size);
}
shader_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
@ -792,18 +748,29 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
texture_cache.OnCPUWrite(addr, size);
{
auto lock = texture_cache.AcquireLock();
texture_cache.WriteMemory(addr, size);
}
shader_cache.OnCPUWrite(addr, size);
buffer_cache.OnCPUWrite(addr, size);
}
void RasterizerOpenGL::SyncGuestHost() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
texture_cache.SyncGuestHost();
buffer_cache.SyncGuestHost();
shader_cache.SyncGuestHost();
}
void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
{
auto lock = texture_cache.AcquireLock();
texture_cache.UnmapMemory(addr, size);
}
buffer_cache.OnCPUWrite(addr, size);
shader_cache.OnCPUWrite(addr, size);
}
void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
if (!gpu.IsAsync()) {
gpu_memory.Write<u32>(addr, value);
@ -845,6 +812,14 @@ void RasterizerOpenGL::WaitForIdle() {
GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
}
void RasterizerOpenGL::FragmentBarrier() {
glMemoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT);
}
void RasterizerOpenGL::TiledCacheBarrier() {
glTextureBarrier();
}
void RasterizerOpenGL::FlushCommands() {
// Only flush when we have commands queued to OpenGL.
if (num_queued_commands == 0) {
@ -858,53 +833,103 @@ void RasterizerOpenGL::TickFrame() {
// Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
num_queued_commands = 0;
fence_manager.TickFrame();
buffer_cache.TickFrame();
{
auto lock = texture_cache.AcquireLock();
texture_cache.TickFrame();
}
}
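The brace-scoped AcquireLock pattern recurs throughout this file: the
texture-cache lock is held only for the cache call and released before
unrelated work. A minimal sketch of the idiom, assuming AcquireLock returns an
RAII lock over an internal mutex:

    #include <mutex>

    void TickSketch(std::mutex& texture_cache_mutex) {
        {
            // The lock lives only inside these braces, mirroring AcquireLock's usage.
            std::scoped_lock lock{texture_cache_mutex};
            // ... touch the texture cache here ...
        }
        // ... frame work that must not hold the lock continues here ...
    }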
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
texture_cache.DoFermiCopy(src, dst, copy_config);
auto lock = texture_cache.AcquireLock();
texture_cache.BlitImage(dst, src, copy_config);
return true;
}
bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
if (!framebuffer_addr) {
return {};
if (framebuffer_addr == 0) {
return false;
}
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)};
if (!surface) {
return {};
auto lock = texture_cache.AcquireLock();
ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
if (!image_view) {
return false;
}
// Verify that the cached surface is the same size and format as the requested framebuffer
const auto& params{surface->GetSurfaceParams()};
const auto& pixel_format{
VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)};
ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
if (params.pixel_format != pixel_format) {
LOG_DEBUG(Render_OpenGL, "Framebuffer pixel_format is different");
}
screen_info.display_texture = surface->GetTexture();
screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion;
// ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different");
// ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different");
screen_info.display_texture = image_view->Handle(ImageViewType::e2D);
screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format);
return true;
}
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT = {
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
void RasterizerOpenGL::BindComputeTextures(Shader* kernel) {
image_view_indices.clear();
sampler_handles.clear();
texture_cache.SynchronizeComputeDescriptors();
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillComputeImageViews(indices_span, image_view_ids);
program_manager.BindCompute(kernel->GetHandle());
size_t image_view_index = 0;
size_t texture_index = 0;
size_t image_index = 0;
BindTextures(kernel->GetEntries(), 0, 0, image_view_index, texture_index, image_index);
}
void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_texture,
GLuint base_image, size_t& image_view_index,
size_t& texture_index, size_t& image_index) {
const GLuint* const samplers = sampler_handles.data() + texture_index;
const GLuint* const textures = texture_handles.data() + texture_index;
const GLuint* const images = image_handles.data() + image_index;
const size_t num_samplers = entries.samplers.size();
for (const auto& sampler : entries.samplers) {
for (size_t i = 0; i < sampler.size; ++i) {
const ImageViewId image_view_id = image_view_ids[image_view_index++];
const ImageView& image_view = texture_cache.GetImageView(image_view_id);
const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(sampler));
texture_handles[texture_index++] = handle;
}
}
const size_t num_images = entries.images.size();
for (size_t unit = 0; unit < num_images; ++unit) {
// TODO: Mark as modified
const ImageViewId image_view_id = image_view_ids[image_view_index++];
const ImageView& image_view = texture_cache.GetImageView(image_view_id);
const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(entries.images[unit]));
image_handles[image_index] = handle;
++image_index;
}
if (num_samplers > 0) {
glBindSamplers(base_texture, static_cast<GLsizei>(num_samplers), samplers);
glBindTextures(base_texture, static_cast<GLsizei>(num_samplers), textures);
}
if (num_images > 0) {
glBindImageTextures(base_image, static_cast<GLsizei>(num_images), images);
}
}
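The rewritten binding path batches everything through ARB_multi_bind: one call
binds a contiguous range of units instead of a per-unit glBindTextureUnit loop.
A self-contained sketch with illustrative handles:

    #include <array>
    #include <glad/glad.h>

    void BindRange(GLuint first_unit) {
        const std::array<GLuint, 3> textures{1, 2, 3};  // GL texture names (illustrative)
        const std::array<GLuint, 3> samplers{4, 5, 6};  // GL sampler names (illustrative)
        glBindTextures(first_unit, static_cast<GLsizei>(textures.size()), textures.data());
        glBindSamplers(first_unit, static_cast<GLsizei>(samplers.size()), samplers.data());
    }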
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT{
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
};
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = maxwell3d.state.shader_stages;
const auto& shader_stage = stages[stage_index];
@ -1003,12 +1028,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh
GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
};
const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
const auto& entries{shader->GetEntries().global_memory_entries};
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
const bool assembly_shaders = device.UseAssemblyShaders();
u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
@ -1016,11 +1040,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (assembly_shaders) {
UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size());
UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
}
}
@ -1028,106 +1052,85 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
const auto& entries{kernel->GetEntries().global_memory_entries};
std::array<GLuint64EXT, 32> pointers;
ASSERT(entries.size() < pointers.size());
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
u32 binding = 0;
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]);
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (device.UseAssemblyShaders()) {
UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size());
UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size,
GLuint64EXT* pointer) {
const std::size_t alignment{device.GetShaderStorageBufferAlignment()};
GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
const size_t alignment{device.GetShaderStorageBufferAlignment()};
const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
if (device.UseAssemblyShaders()) {
*pointer = info.address + info.offset;
*ssbo = BindlessSSBO{
.address = static_cast<GLuint64EXT>(info.address + info.offset),
.length = static_cast<GLsizei>(size),
.padding = 0,
};
} else {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
}
}
void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
MICROPROFILE_SCOPE(OpenGL_Texture);
u32 binding = device.GetBaseBindings(stage_index).sampler;
void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : shader->GetEntries().samplers) {
const auto shader_type = static_cast<ShaderType>(stage_index);
for (std::size_t i = 0; i < entry.size; ++i) {
const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
SetupTexture(binding++, texture, entry);
for (size_t index = 0; index < entry.size; ++index) {
const auto handle =
GetTextureInfo(maxwell3d, via_header_index, entry, shader_type, index);
const Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler);
sampler_handles.push_back(sampler->Handle());
image_view_indices.push_back(handle.image);
}
}
}
void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_Texture);
u32 binding = 0;
void RasterizerOpenGL::SetupComputeTextures(const Shader* kernel) {
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : kernel->GetEntries().samplers) {
for (std::size_t i = 0; i < entry.size; ++i) {
const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i);
SetupTexture(binding++, texture, entry);
for (size_t i = 0; i < entry.size; ++i) {
const auto handle =
GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute, i);
const Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler);
sampler_handles.push_back(sampler->Handle());
image_view_indices.push_back(handle.image);
}
}
}
void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry) {
const auto view = texture_cache.GetTextureSurface(texture.tic, entry);
if (!view) {
// Can occur when texture addr is null or its memory is unmapped/invalid
glBindSampler(binding, 0);
glBindTextureUnit(binding, 0);
return;
}
const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
texture.tic.z_source, texture.tic.w_source);
glBindTextureUnit(binding, handle);
if (!view->GetSurfaceParams().IsBuffer()) {
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
}
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
u32 binding = device.GetBaseBindings(stage_index).image;
void RasterizerOpenGL::SetupDrawImages(const Shader* shader, size_t stage_index) {
const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : shader->GetEntries().images) {
const auto shader_type = static_cast<ShaderType>(stage_index);
const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
SetupImage(binding++, tic, entry);
const auto handle = GetTextureInfo(maxwell3d, via_header_index, entry, shader_type);
image_view_indices.push_back(handle.image);
}
}
void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
u32 binding = 0;
void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : shader->GetEntries().images) {
const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic;
SetupImage(binding++, tic, entry);
const auto handle =
GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute);
image_view_indices.push_back(handle.image);
}
}
void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
const ImageEntry& entry) {
const auto view = texture_cache.GetImageSurface(tic, entry);
if (!view) {
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
return;
}
if (entry.is_written) {
view->MarkAsModified(texture_cache.Tick());
}
const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
}
void RasterizerOpenGL::SyncViewport() {
auto& flags = maxwell3d.dirty.flags;
const auto& regs = maxwell3d.regs;
@ -1157,7 +1160,7 @@ void RasterizerOpenGL::SyncViewport() {
flags[Dirty::ClipControl] = false;
bool flip_y = false;
if (regs.viewport_transform[0].scale_y < 0.0) {
if (regs.viewport_transform[0].scale_y < 0.0f) {
flip_y = !flip_y;
}
if (regs.screen_y_control.y_negate != 0) {
@ -1527,17 +1530,9 @@ void RasterizerOpenGL::SyncPointState() {
flags[Dirty::PointSize] = false;
oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable);
oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable);
if (maxwell3d.regs.vp_point_size.enable) {
// By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled.
glEnable(GL_PROGRAM_POINT_SIZE);
return;
}
// Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid
// in OpenGL).
glPointSize(std::max(1.0f, maxwell3d.regs.point_size));
glDisable(GL_PROGRAM_POINT_SIZE);
}
void RasterizerOpenGL::SyncLineState() {
@ -1580,10 +1575,6 @@ void RasterizerOpenGL::SyncAlphaTest() {
flags[Dirty::AlphaTest] = false;
const auto& regs = maxwell3d.regs;
if (regs.alpha_test_enabled && regs.rt_control.count > 1) {
LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested");
}
if (regs.alpha_test_enabled) {
glEnable(GL_ALPHA_TEST);
glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref);


@ -7,12 +7,13 @@
#include <array>
#include <atomic>
#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <tuple>
#include <utility>
#include <boost/container/static_vector.hpp>
#include <glad/glad.h>
#include "common/common_types.h"
@ -23,16 +24,14 @@
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_fence_manager.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
#include "video_core/renderer_opengl/gl_query_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/utils.h"
#include "video_core/shader/async_shaders.h"
#include "video_core/textures/texture.h"
@ -51,14 +50,21 @@ class MemoryManager;
namespace OpenGL {
struct ScreenInfo;
struct DrawParameters;
struct ShaderEntries;
struct BindlessSSBO {
GLuint64EXT address;
GLsizei length;
GLsizei padding;
};
static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128);
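// A minimal sketch of how a caller might fill this 128-bit descriptor before
// copying it into a constant buffer (gpu_addr and size are hypothetical names,
// not taken from this diff):
//   const BindlessSSBO ssbo{static_cast<GLuint64EXT>(gpu_addr),
//                           static_cast<GLsizei>(size), 0};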
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu,
Core::Memory::Memory& cpu_memory, const Device& device,
ScreenInfo& screen_info, ProgramManager& program_manager,
StateTracker& state_tracker);
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_);
~RasterizerOpenGL() override;
void Draw(bool is_indexed, bool is_instanced) override;
@ -72,15 +78,18 @@ public:
void InvalidateRegion(VAddr addr, u64 size) override;
void OnCPUWrite(VAddr addr, u64 size) override;
void SyncGuestHost() override;
void UnmapMemory(VAddr addr, u64 size) override;
void SignalSemaphore(GPUVAddr addr, u32 value) override;
void SignalSyncPoint(u32 value) override;
void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
void FragmentBarrier() override;
void TiledCacheBarrier() override;
void FlushCommands() override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
@ -101,11 +110,14 @@ public:
}
private:
/// Configures the color and depth framebuffer states.
void ConfigureFramebuffers();
static constexpr size_t MAX_TEXTURES = 192;
static constexpr size_t MAX_IMAGES = 48;
static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES;
/// Configures the color and depth framebuffer for clearing.
void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
void BindComputeTextures(Shader* kernel);
void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
size_t& image_view_index, size_t& texture_index, size_t& image_index);
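// A sketch of the batched binding this declaration implies, assuming the
// ARB_multi_bind entry points are used (an assumption, not shown in this diff;
// texture_count and image_count are hypothetical names):
//   glBindTextures(base_texture, texture_count, texture_handles.data());
//   glBindSamplers(base_texture, texture_count, sampler_handles.data());
//   glBindImageTextures(base_image, image_count, image_handles.data());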
/// Configures the current constbuffers to use for the draw command.
void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
@ -126,26 +138,19 @@ private:
/// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size, GLuint64EXT* pointer);
size_t size, BindlessSSBO* ssbo);
/// Configures the current textures to use for the draw command.
void SetupDrawTextures(std::size_t stage_index, Shader* shader);
void SetupDrawTextures(const Shader* shader, size_t stage_index);
/// Configures the textures used in a compute shader.
void SetupComputeTextures(Shader* kernel);
/// Configures a texture.
void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry);
void SetupComputeTextures(const Shader* kernel);
/// Configures images in a graphics shader.
void SetupDrawImages(std::size_t stage_index, Shader* shader);
void SetupDrawImages(const Shader* shader, size_t stage_index);
/// Configures images in a compute shader.
void SetupComputeImages(Shader* shader);
/// Configures an image.
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
void SetupComputeImages(const Shader* shader);
/// Syncs the viewport and depth range to match the guest state
void SyncViewport();
@ -220,9 +225,6 @@ private:
/// End a transform feedback
void EndTransformFeedback();
/// Check for extensions that are not strictly required but are needed for correct emulation
void CheckExtensions();
std::size_t CalculateVertexArraysSize() const;
std::size_t CalculateIndexBufferSize() const;
@ -235,7 +237,7 @@ private:
GLintptr SetupIndexBuffer();
void SetupShaders(GLenum primitive_mode);
void SetupShaders();
Tegra::GPU& gpu;
Tegra::Engines::Maxwell3D& maxwell3d;
@ -247,19 +249,21 @@ private:
ProgramManager& program_manager;
StateTracker& state_tracker;
TextureCacheOpenGL texture_cache;
OGLStreamBuffer stream_buffer;
TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache;
ShaderCacheOpenGL shader_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
QueryCache query_cache;
OGLBufferCache buffer_cache;
FenceManagerOpenGL fence_manager;
VideoCommon::Shader::AsyncShaders async_shaders;
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
GLint vertex_binding = 0;
boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
std::array<GLuint, MAX_TEXTURES> texture_handles;
std::array<GLuint, MAX_IMAGES> image_handles;
std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
transform_feedback_buffers;
@ -273,7 +277,7 @@ private:
std::size_t current_cbuf = 0;
OGLBuffer unified_uniform_buffer;
/// Number of commands queued to the OpenGL driver. Reseted on flush.
/// Number of commands queued to the OpenGL driver. Reset on flush.
std::size_t num_queued_commands = 0;
u32 last_clip_distance_mask = 0;


@ -71,7 +71,7 @@ void OGLSampler::Create() {
return;
MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
glGenSamplers(1, &handle);
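// glCreateSamplers initializes the object immediately (direct state access),
// whereas glGenSamplers only reserves a name until its first bind; the DSA
// form lets glSamplerParameter* be used on the handle right away.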
glCreateSamplers(1, &handle);
}
void OGLSampler::Release() {


@ -1,52 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/logging/log.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
namespace OpenGL {
SamplerCacheOpenGL::SamplerCacheOpenGL() = default;
SamplerCacheOpenGL::~SamplerCacheOpenGL() = default;
OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const {
OGLSampler sampler;
sampler.Create();
const GLuint sampler_id{sampler.handle};
glSamplerParameteri(
sampler_id, GL_TEXTURE_MAG_FILTER,
MaxwellToGL::TextureFilterMode(tsc.mag_filter, Tegra::Texture::TextureMipmapFilter::None));
glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER,
MaxwellToGL::TextureFilterMode(tsc.min_filter, tsc.mipmap_filter));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(tsc.wrap_u));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(tsc.wrap_v));
glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(tsc.wrap_p));
glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE,
tsc.depth_compare_enabled == 1 ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE);
glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC,
MaxwellToGL::DepthCompareFunc(tsc.depth_compare_func));
glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, tsc.GetBorderColor().data());
glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, tsc.GetMinLod());
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, tsc.GetMaxLod());
glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, tsc.GetLodBias());
if (GLAD_GL_ARB_texture_filter_anisotropic) {
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy());
} else if (GLAD_GL_EXT_texture_filter_anisotropic) {
glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy());
} else {
LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver");
}
return sampler;
}
GLuint SamplerCacheOpenGL::ToSamplerType(const OGLSampler& sampler) const {
return sampler.handle;
}
} // namespace OpenGL

Some files were not shown because too many files have changed in this diff.