SMMU: Initial adaptation to video_core.

This commit is contained in:
Fernando Sahmkow
2023-12-25 07:32:16 +01:00
committed by Liam
parent c85d7ccd79
commit 0a2536a0df
79 changed files with 1262 additions and 1263 deletions

View File

@ -33,13 +33,12 @@ struct NullBufferParams {};
*
* The buffer size and address are forcefully aligned to CPU page boundaries.
*/
template <class RasterizerInterface>
class BufferBase {
public:
static constexpr u64 BASE_PAGE_BITS = 16;
static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;
explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_)
explicit BufferBase(VAddr cpu_addr_, u64 size_bytes_)
: cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {}
explicit BufferBase(NullBufferParams) {}

File diff suppressed because it is too large Load Diff

View File

@ -32,7 +32,6 @@
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "common/settings.h"
#include "core/memory.h"
#include "video_core/buffer_cache/buffer_base.h"
#include "video_core/control/channel_state_cache.h"
#include "video_core/delayed_destruction_ring.h"
@ -41,7 +40,6 @@
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/slot_vector.h"
#include "video_core/texture_cache/types.h"
@ -94,7 +92,7 @@ static constexpr BufferId NULL_BUFFER_ID{0};
static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB);
/// Describes a bound buffer range: a start address, a size and the id of the associated buffer.
/// NOTE(review): `cpu_addr` and `device_addr` appear to be the removed and added sides of the
/// same diff line (VAddr -> DAddr migration); presumably only one of them exists in the real
/// header at any given revision - confirm against the actual file.
struct Binding {
VAddr cpu_addr{};
DAddr device_addr{};
// Size of the bound range; value-initialized to zero. Units are presumably bytes - TODO confirm.
u32 size{};
// Id of the buffer this binding refers to.
BufferId buffer_id;
};
@ -104,7 +102,7 @@ struct TextureBufferBinding : Binding {
};
/// Compile-time constant binding with a zero address, zero size and NULL_BUFFER_ID as its buffer.
/// NOTE(review): the `.cpu_addr` and `.device_addr` initializers look like the old and new sides
/// of one diff line; presumably only one is present in the real header - confirm.
static constexpr Binding NULL_BINDING{
.cpu_addr = 0,
.device_addr = 0,
.size = 0,
.buffer_id = NULL_BUFFER_ID,
};
@ -204,10 +202,10 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf
using Async_Buffer = typename P::Async_Buffer;
using MemoryTracker = typename P::MemoryTracker;
using IntervalCompare = std::less<VAddr>;
using IntervalInstance = boost::icl::interval_type_default<VAddr, std::less>;
using IntervalAllocator = boost::fast_pool_allocator<VAddr>;
using IntervalSet = boost::icl::interval_set<VAddr>;
using IntervalCompare = std::less<DAddr>;
using IntervalInstance = boost::icl::interval_type_default<DAddr, std::less>;
using IntervalAllocator = boost::fast_pool_allocator<DAddr>;
using IntervalSet = boost::icl::interval_set<DAddr>;
using IntervalType = typename IntervalSet::interval_type;
template <typename Type>
@ -230,32 +228,31 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf
using OverlapCombine = counter_add_functor<int>;
using OverlapSection = boost::icl::inter_section<int>;
using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
using OverlapCounter = boost::icl::split_interval_map<DAddr, int>;
/// Value returned by ResolveOverlaps: a list of buffer ids plus an address range and a flag.
/// NOTE(review): the VAddr and DAddr declarations of `begin`/`end` appear to be the removed and
/// added sides of the same diff lines; both cannot coexist in a compilable struct - confirm
/// against the actual header.
struct OverlapResult {
// Buffer ids; the small_vector keeps up to 16 entries in inline storage.
boost::container::small_vector<BufferId, 16> ids;
VAddr begin;
VAddr end;
DAddr begin;
DAddr end;
// Defaults to false. Exact meaning is not visible here - presumably stream-buffer heuristics.
bool has_stream_leap = false;
};
public:
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
Core::Memory::Memory& cpu_memory_, Runtime& runtime_);
explicit BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, Runtime& runtime_);
void TickFrame();
void WriteMemory(VAddr cpu_addr, u64 size);
void WriteMemory(DAddr device_addr, u64 size);
void CachedWriteMemory(VAddr cpu_addr, u64 size);
void CachedWriteMemory(DAddr device_addr, u64 size);
bool OnCPUWrite(VAddr cpu_addr, u64 size);
bool OnCPUWrite(DAddr device_addr, u64 size);
void DownloadMemory(VAddr cpu_addr, u64 size);
void DownloadMemory(DAddr device_addr, u64 size);
std::optional<VideoCore::RasterizerDownloadArea> GetFlushArea(VAddr cpu_addr, u64 size);
std::optional<VideoCore::RasterizerDownloadArea> GetFlushArea(DAddr device_addr, u64 size);
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
bool InlineMemory(DAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
@ -300,7 +297,7 @@ public:
ObtainBufferSynchronize sync_info,
ObtainBufferOperation post_op);
[[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr gpu_addr, u32 size,
[[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(DAddr gpu_addr, u32 size,
ObtainBufferSynchronize sync_info,
ObtainBufferOperation post_op);
void FlushCachedWrites();
@ -326,13 +323,13 @@ public:
bool DMAClear(GPUVAddr src_address, u64 amount, u32 value);
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
[[nodiscard]] bool IsRegionGpuModified(DAddr addr, size_t size);
/// Return true when a region is registered on the cache
[[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
[[nodiscard]] bool IsRegionRegistered(DAddr addr, size_t size);
/// Return true when a CPU region is modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
[[nodiscard]] bool IsRegionCpuModified(DAddr addr, size_t size);
void SetDrawIndirect(
const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
@ -366,9 +363,9 @@ private:
}
template <typename Func>
void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE);
for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) {
void ForEachBufferInRange(DAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
@ -377,15 +374,15 @@ private:
Buffer& buffer = slot_buffers[buffer_id];
func(buffer_id, buffer);
const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
const DAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
}
template <typename Func>
void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) {
const VAddr start_address = cpu_addr;
const VAddr end_address = start_address + size;
void ForEachInRangeSet(IntervalSet& current_range, DAddr device_addr, u64 size, Func&& func) {
const DAddr start_address = device_addr;
const DAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = current_range.lower_bound(search_interval);
if (it == current_range.end()) {
@ -393,8 +390,8 @@ private:
}
auto end_it = current_range.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->upper();
VAddr inter_addr = it->lower();
DAddr inter_addr_end = it->upper();
DAddr inter_addr = it->lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
@ -406,10 +403,10 @@ private:
}
template <typename Func>
void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size,
void ForEachInOverlapCounter(OverlapCounter& current_range, DAddr device_addr, u64 size,
Func&& func) {
const VAddr start_address = cpu_addr;
const VAddr end_address = start_address + size;
const DAddr start_address = device_addr;
const DAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = current_range.lower_bound(search_interval);
if (it == current_range.end()) {
@ -418,8 +415,8 @@ private:
auto end_it = current_range.upper_bound(search_interval);
for (; it != end_it; it++) {
auto& inter = it->first;
VAddr inter_addr_end = inter.upper();
VAddr inter_addr = inter.lower();
DAddr inter_addr_end = inter.upper();
DAddr inter_addr = inter.lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
@ -451,9 +448,9 @@ private:
} while (any_removals);
}
/// Returns true when `addr` and `addr + size` fall in the same page, i.e. they are equal once
/// the bits covered by Core::Memory::YUZU_PAGEMASK are cleared.
/// Note that the end used in the comparison is `addr + size` (one past the last byte), not
/// `addr + size - 1`, so a range that ends exactly on a page boundary compares as different pages.
/// NOTE(review): the first three lines (VAddr cpu_addr) and the next three (DAddr device_addr)
/// appear to be the removed and added sides of the same diff hunk - confirm which is current.
static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
static bool IsRangeGranular(DAddr device_addr, size_t size) {
return (device_addr & ~Core::Memory::YUZU_PAGEMASK) ==
((device_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
}
void RunGarbageCollector();
@ -508,15 +505,15 @@ private:
void UpdateComputeTextureBuffers();
void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
void MarkWrittenBuffer(BufferId buffer_id, DAddr device_addr, u32 size);
[[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
[[nodiscard]] BufferId FindBuffer(DAddr device_addr, u32 size);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size);
[[nodiscard]] OverlapResult ResolveOverlaps(DAddr device_addr, u32 wanted_size);
void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
[[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
[[nodiscard]] BufferId CreateBuffer(DAddr device_addr, u32 wanted_size);
void Register(BufferId buffer_id);
@ -527,7 +524,7 @@ private:
void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;
bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
bool SynchronizeBuffer(Buffer& buffer, DAddr device_addr, u32 size);
void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
std::span<BufferCopy> copies);
@ -539,7 +536,7 @@ private:
void DownloadBufferMemory(Buffer& buffer_id);
void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
void DownloadBufferMemory(Buffer& buffer_id, DAddr device_addr, u64 size);
void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
@ -549,7 +546,7 @@ private:
[[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
PixelFormat format);
[[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
[[nodiscard]] std::span<const u8> ImmediateBufferWithData(DAddr device_addr, size_t size);
[[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
@ -557,11 +554,10 @@ private:
void ClearDownload(IntervalType subtract_interval);
void InlineMemoryImplementation(VAddr dest_address, size_t copy_size,
void InlineMemoryImplementation(DAddr dest_address, size_t copy_size,
std::span<const u8> inlined_buffer);
VideoCore::RasterizerInterface& rasterizer;
Core::Memory::Memory& cpu_memory;
Tegra::MaxwellDeviceMemoryManager& device_memory;
SlotVector<Buffer> slot_buffers;
DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
@ -598,7 +594,7 @@ private:
u64 critical_memory = 0;
BufferId inline_buffer_id;
std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table;
std::array<BufferId, ((1ULL << 34) >> CACHING_PAGEBITS)> page_table;
Common::ScratchBuffer<u8> tmp_buffer;
};

View File

@ -17,19 +17,19 @@
namespace VideoCommon {
template <class RasterizerInterface>
template <typename DeviceTracker>
class MemoryTrackerBase {
static constexpr size_t MAX_CPU_PAGE_BITS = 39;
static constexpr size_t MAX_CPU_PAGE_BITS = 34;
static constexpr size_t HIGHER_PAGE_BITS = 22;
static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;
static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS);
static constexpr size_t MANAGER_POOL_SIZE = 32;
static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD;
using Manager = WordManager<RasterizerInterface, WORDS_STACK_NEEDED>;
using Manager = WordManager<DeviceTracker, WORDS_STACK_NEEDED>;
public:
MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {}
MemoryTrackerBase(DeviceTracker& device_tracker_) : device_tracker{&device_tracker_} {}
~MemoryTrackerBase() = default;
/// Returns the inclusive CPU modified range in a begin end pair
@ -74,7 +74,7 @@ public:
});
}
/// Mark region as CPU modified, notifying the rasterizer about this change
/// Mark region as CPU modified, notifying the device_tracker about this change
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
@ -83,7 +83,7 @@ public:
});
}
/// Unmark region as CPU modified, notifying the rasterizer about this change
/// Unmark region as CPU modified, notifying the device_tracker about this change
void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
@ -139,7 +139,7 @@ public:
});
}
/// Flushes cached CPU writes, and notifies the rasterizer about the deltas
/// Flushes cached CPU writes, and notifies the device_tracker about the deltas
void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept {
IteratePages<false>(query_cpu_addr, query_size,
[](Manager* manager, [[maybe_unused]] u64 offset,
@ -280,7 +280,7 @@ private:
manager_pool.emplace_back();
auto& last_pool = manager_pool.back();
for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) {
new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE);
new (&last_pool[i]) Manager(0, *device_tracker, HIGHER_PAGE_SIZE);
free_managers.push_back(&last_pool[i]);
}
return on_return();
@ -293,7 +293,7 @@ private:
std::unordered_set<u32> cached_pages;
RasterizerInterface* rasterizer = nullptr;
DeviceTracker* device_tracker = nullptr;
};
} // namespace VideoCommon

View File

@ -163,11 +163,11 @@ struct Words {
WordsArray<stack_words> preflushable;
};
template <class RasterizerInterface, size_t stack_words = 1>
template <class DeviceTracker, size_t stack_words = 1>
class WordManager {
public:
explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes)
: cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {}
explicit WordManager(VAddr cpu_addr_, DeviceTracker& tracker_, u64 size_bytes)
: cpu_addr{cpu_addr_}, tracker{&tracker_}, words{size_bytes} {}
explicit WordManager() = default;
@ -279,7 +279,7 @@ public:
}
/**
* Loop over each page in the given range, turn off those bits and notify the rasterizer if
* Loop over each page in the given range, turn off those bits and notify the tracker if
* needed. Call the given function on each turned off range.
*
* @param query_cpu_range Base CPU address to loop over
@ -459,26 +459,26 @@ private:
}
/**
* Notify rasterizer about changes in the CPU tracking state of a word in the buffer
* Notify tracker about changes in the CPU tracking state of a word in the buffer
*
* @param word_index Index to the word to notify to the rasterizer
* @param word_index Index to the word to notify to the tracker
* @param current_bits Current state of the word
* @param new_bits New state of the word
*
* @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
* @tparam add_to_tracker True when the tracker should start tracking the new pages
*/
// NOTE(review): several lines below come in rasterizer/tracker pairs; they appear to be the
// removed and added sides of a diff hunk (template parameter and member rename), so only one
// line of each pair is presumably live in the real header - confirm.
template <bool add_to_rasterizer>
template <bool add_to_tracker>
void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
// Select the bits to report: when adding, bits set in both current_bits and new_bits;
// when removing, bits set in new_bits that are clear in current_bits.
u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
// Address of the start of the word: base address plus word_index words.
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
// For each (offset, size) pair produced by IteratePages over changed_bits, scale both by
// BYTES_PER_PAGE and report a cached-count delta of +1 (adding) or -1 (removing).
IteratePages(changed_bits, [&](size_t offset, size_t size) {
rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE,
size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1);
tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE,
size * BYTES_PER_PAGE, add_to_tracker ? 1 : -1);
});
}
VAddr cpu_addr = 0;
RasterizerInterface* rasterizer = nullptr;
DeviceTracker* tracker = nullptr;
Words<stack_words> words;
};