[Linux] Implement KSM (Kernel Samepage Merging) with Maps (#4601)

* KSM work

* Windows fixes

* Add KSM logging, cleanup

* Cleanup raycast logging
Chris Miles 2025-01-21 15:50:20 -06:00 committed by GitHub
parent 25826c6686
commit d13c725a74
9 changed files with 357 additions and 33 deletions
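
For reviewers unfamiliar with KSM: the kernel only deduplicates memory that a process has explicitly opted in with madvise(MADV_MERGEABLE), the advice covers whole pages, and merging only happens while ksmd is enabled (/sys/kernel/mm/ksm/run set to 1). The new common/memory/ksm.hpp wraps roughly the following pattern; this is a minimal sketch, and the function name and error handling are illustrative rather than code from this commit:

#include <sys/mman.h>   // madvise, MADV_MERGEABLE (Linux)
#include <unistd.h>     // sysconf(_SC_PAGESIZE)
#include <cstdlib>      // posix_memalign
#include <cstring>      // memset

// Hypothetical helper: page-aligned allocation that is then hinted to KSM.
void* AllocateMergeable(size_t size) {
    const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
    void* ptr = nullptr;
    if (posix_memalign(&ptr, page_size, size) != 0) {
        return nullptr;                     // allocation failed
    }
    std::memset(ptr, 0, size);              // identical page contents are what ksmd can merge
    madvise(ptr, size, MADV_MERGEABLE);     // opt the region into scanning; on failure it is simply never merged
    return ptr;
}

Page alignment matters because madvise is granular to pages, which is why the commit pairs a page-aligned allocator with the MADV_MERGEABLE hint instead of advising arbitrary malloc'd blocks.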

1
.gitignore vendored

@ -68,3 +68,4 @@ compile_flags.txt
# CMake Files
cmake-build-relwithdebinfo/*
skill-caps.diff


@ -98,6 +98,7 @@ SET(common_sources
json/json.hpp
json/jsoncpp.cpp
zone_store.cpp
memory/ksm.hpp
net/console_server.cpp
net/console_server_connection.cpp
net/crc32.cpp


@ -144,6 +144,7 @@ namespace Logs {
XTargets,
EvolveItem,
PositionUpdate,
KSM,
MaxCategoryID /* Don't Remove this */
};
@ -246,7 +247,8 @@ namespace Logs {
"Corpses",
"XTargets",
"EvolveItem",
"PositionUpdate"
"PositionUpdate",
"KSM" // Kernel Samepage Merging
};
}


@ -861,7 +861,17 @@
#define LogPositionUpdateDetail(message, ...) do {\
if (LogSys.IsLogEnabled(Logs::Detail, Logs::PositionUpdate))\
OutF(LogSys, Logs::Detail, Logs::PositionUpdate, __FILE__, __func__, __LINE__, message, ##__VA_ARGS__);\
OutF(LogSys, Logs::Detail, Logs::PositionUpdate, __FILE__, __func__, __LINE__, message, ##__VA_ARGS__); \
} while (0)
#define LogKSM(message, ...) do {\
if (LogSys.IsLogEnabled(Logs::General, Logs::KSM))\
OutF(LogSys, Logs::General, Logs::KSM, __FILE__, __func__, __LINE__, message, ##__VA_ARGS__);\
} while (0)
#define LogKSMDetail(message, ...) do {\
if (LogSys.IsLogEnabled(Logs::Detail, Logs::KSM))\
OutF(LogSys, Logs::Detail, Logs::KSM, __FILE__, __func__, __LINE__, message, ##__VA_ARGS__);\
} while (0)
#define Log(debug_level, log_category, message, ...) do {\

220
common/memory/ksm.hpp Normal file

@ -0,0 +1,220 @@
#ifndef EQEMU_KSM_HPP
#define EQEMU_KSM_HPP
#include "../eqemu_logsys.h"
#include <iostream>
#include <vector>
#include <cstring>
#ifdef _WIN32
#include <malloc.h> // For _aligned_malloc, _aligned_free
#include <windows.h>
#else
#include <sys/mman.h> // For madvise
#include <unistd.h> // For sysconf, sbrk
#endif
// Page-aligned allocator for std::vector
template <typename T>
class PageAlignedAllocator {
public:
using value_type = T;
PageAlignedAllocator() noexcept = default;
template <typename U>
PageAlignedAllocator(const PageAlignedAllocator<U>&) noexcept {}
T* allocate(std::size_t n) {
void* ptr = nullptr;
size_t size = n * sizeof(T);
#ifdef _WIN32
// Simply allocate memory without alignment
ptr = malloc(size);
if (!ptr) throw std::bad_alloc();
#else
size_t alignment = getPageSize(); // Get the system's page size
if (posix_memalign(&ptr, alignment, size) != 0) {
throw std::bad_alloc();
}
#endif
return static_cast<T*>(ptr);
}
void deallocate(T* p, std::size_t) noexcept {
free(p);
}
private:
size_t getPageSize() const
{
#ifdef _WIN32
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
return sysInfo.dwPageSize; // Page size in bytes
#else
return static_cast<size_t>(sysconf(_SC_PAGESIZE));
#endif
};
};
template <typename T, typename U>
bool operator==(const PageAlignedAllocator<T>&, const PageAlignedAllocator<U>&) noexcept {
return true;
}
template <typename T, typename U>
bool operator!=(const PageAlignedAllocator<T>&, const PageAlignedAllocator<U>&) noexcept {
return false;
}
// Kernel Samepage Merging (KSM) functionality
namespace KSM {
#ifdef _WIN32
// Windows-specific placeholder functions (no-op)
inline void CheckPageAlignment(void* ptr) {
}
inline void* AllocatePageAligned(size_t size) {
void* ptr = malloc(size);
return ptr ? memset(ptr, 0, size) : nullptr;
}
inline void MarkMemoryForKSM(void* start, size_t size) {
}
inline void AlignHeapToPageBoundary() {
}
inline void* MarkHeapStart() {
return nullptr;
}
inline size_t MeasureHeapUsage(void* start) {
return 0;
}
#else
// Linux-specific functionality
inline void CheckPageAlignment(void* ptr) {
size_t page_size = sysconf(_SC_PAGESIZE);
if (reinterpret_cast<uintptr_t>(ptr) % page_size == 0) {
LogKSMDetail("Memory is page-aligned [{}]", ptr);
} else {
LogKSMDetail("Memory is NOT page-aligned [{}]", ptr);
}
}
inline void* AllocatePageAligned(size_t size) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* aligned_ptr = nullptr;
if (posix_memalign(&aligned_ptr, page_size, size) != 0) {
LogKSM("Failed to allocate page-aligned memory on Linux. page_size [{}] size [{}] bytes", page_size, size);
return nullptr; // avoid zeroing a null pointer below
}
std::memset(aligned_ptr, 0, size);
return aligned_ptr;
}
inline void MarkMemoryForKSM(void* start, size_t size) {
if (madvise(start, size, MADV_MERGEABLE) == 0) {
LogKSM("Marked memory for KSM | start [{}] size [{}] bytes", start, size);
} else {
perror("madvise failed");
}
}
inline void AlignHeapToPageBoundary() {
size_t page_size = sysconf(_SC_PAGESIZE);
if (page_size == 0) {
LogKSM("Failed to retrieve page size SC_PAGESIZE [{}]", page_size);
return;
}
void* current_break = sbrk(0);
if (current_break == (void*)-1) {
LogKSM("Failed to retrieve the current program break");
return;
}
uintptr_t current_address = reinterpret_cast<uintptr_t>(current_break);
size_t misalignment = current_address % page_size;
if (misalignment != 0) {
size_t adjustment = page_size - misalignment;
if (sbrk(adjustment) == (void*)-1) {
LogKSM("Failed to align heap to page boundary. adjustment [{}] bytes", adjustment);
return;
}
}
LogKSMDetail("Heap aligned to next page boundary. Current break [{}]", sbrk(0));
}
inline void* MarkHeapStart() {
void* current_pos = sbrk(0);
AlignHeapToPageBoundary();
return current_pos;
}
inline size_t MeasureHeapUsage(void* start) {
void* current_break = sbrk(0);
return static_cast<char*>(current_break) - static_cast<char*>(start);
}
#endif
inline size_t getPageSize()
{
#ifdef _WIN32
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
return sysInfo.dwPageSize; // Page size in bytes
#else
return static_cast<size_t>(sysconf(_SC_PAGESIZE)); // POSIX page size
#endif
};
template <typename T>
inline void PageAlignVectorAligned(std::vector<T, PageAlignedAllocator<T>>& vec) {
if (vec.empty()) {
return;
}
size_t page_size = getPageSize();
void* start = vec.data();
size_t size = vec.size() * sizeof(T);
// Check if the memory is page-aligned
if (reinterpret_cast<std::uintptr_t>(start) % page_size != 0) {
// Allocate a new aligned vector
std::vector<T, PageAlignedAllocator<T>> aligned_vec(vec.get_allocator());
aligned_vec.reserve(vec.capacity()); // Match capacity to avoid reallocation during copy
// Copy elements from the original vector
aligned_vec.insert(aligned_vec.end(), vec.begin(), vec.end());
// Swap the aligned vector with the original vector
vec.swap(aligned_vec);
// Clear the temporary aligned vector to free its memory
aligned_vec.clear();
// Verify the new alignment
start = vec.data();
if (reinterpret_cast<std::uintptr_t>(start) % page_size != 0) {
throw std::runtime_error("Failed to align vector memory to page boundaries.");
}
LogKSMDetail("Vector reallocated to ensure page alignment. start [{}] size [{}] bytes", start, size);
} else {
LogKSMDetail("Vector is already page-aligned. start [{}] size [{}] bytes", start, size);
}
#ifndef _WIN32
// Mark memory for KSM (only on non-Windows systems)
MarkMemoryForKSM(start, size);
#endif
}
}
#endif // EQEMU_KSM_HPP
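
A sketch of how these helpers are meant to be combined inside the server (the include path, element type, and element count are illustrative, not taken from the commit): back a long-lived, mostly read-only container with the page-aligned allocator, then let PageAlignVectorAligned verify the alignment and mark the buffer mergeable (a no-op on Windows).

#include "../../common/memory/ksm.hpp"  // relative path depends on the including file
#include <cstdint>
#include <vector>

void BuildMergeableIndexBuffer() {
    // Page-aligned backing store, analogous to the TriVector typedef later in this commit.
    std::vector<uint32_t, PageAlignedAllocator<uint32_t>> indices;
    indices.resize(1000000, 0);

    // Logs the alignment check, reallocates onto a page boundary if needed,
    // and on Linux calls madvise(MADV_MERGEABLE) on the buffer.
    KSM::PageAlignVectorAligned(indices);
}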


@ -1,6 +1,8 @@
#include <cereal/archives/json.hpp>
#include <cereal/types/vector.hpp>
#include <iomanip>
#include "../../common/events/player_events.h"
#include "../../common/memory/ksm.hpp"
void WorldserverCLI::TestCommand(int argc, char **argv, argh::parser &cmd, std::string &description)
{
@ -10,5 +12,21 @@ void WorldserverCLI::TestCommand(int argc, char **argv, argh::parser &cmd, std::
return;
}
void* start_marker = KSM::MarkHeapStart();
std::cout << "Start marker: " << start_marker << "\n";
std::vector<std::string> vec = {};
for (int i = 0; i < 100000; i++) {
vec.push_back("Some random string");
}
// Measure allocated memory size
size_t allocated_size = KSM::MeasureHeapUsage(start_marker);
// Convert to MB as a float and output with precision
double allocated_size_mb = static_cast<double>(allocated_size) / (1024 * 1024);
std::cout << std::fixed << std::setprecision(3)
<< "Allocated size: " << allocated_size_mb << " MB\n";
// Mark memory for KSM
KSM::MarkMemoryForKSM(start_marker, allocated_size);
}
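
Whether the marked pages actually get merged depends on ksmd being enabled on the host (KSM compiled into the kernel and /sys/kernel/mm/ksm/run set to 1); that is standard kernel behavior, not something this commit configures. A small standalone sketch for checking merge activity after running the test command (helper name is hypothetical):

#include <fstream>
#include <iostream>
#include <string>

// Reads one counter from the kernel's KSM sysfs interface; returns -1 if KSM is unavailable.
static long ReadKsmCounter(const std::string& name) {
    std::ifstream f("/sys/kernel/mm/ksm/" + name);
    long value = -1;
    f >> value;
    return value;
}

int main() {
    std::cout << "run:           " << ReadKsmCounter("run") << "\n"            // 1 while ksmd is scanning
              << "pages_shared:  " << ReadKsmCounter("pages_shared") << "\n"   // distinct pages kept after merging
              << "pages_sharing: " << ReadKsmCounter("pages_sharing") << "\n"; // page references deduplicated into them
    return 0;
}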


@ -9,6 +9,19 @@ void command_loc(Client *c, const Seperator *sep)
auto target_position = target->GetPosition();
// check los benchmark
BenchTimer timer;
for (int i = 0; i < 1000; i++) {
zone->zonemap->CheckLoS(c->GetPosition(), target_position);
}
c->Message(
Chat::White,
fmt::format(
"CheckLoS benchmark took [{}]",
timer.elapsed()
).c_str()
);
c->Message(
Chat::White,
fmt::format(


@ -7,6 +7,7 @@
#include "raycast_mesh.h"
#include "zone.h"
#include "../common/file.h"
#include "../common/memory/ksm.hpp"
#include <algorithm>
#include <map>
@ -953,6 +954,7 @@ bool Map::LoadV2(FILE *f) {
return true;
}
void Map::RotateVertex(glm::vec3 &v, float rx, float ry, float rz) {
glm::vec3 nv = v;


@ -1,4 +1,6 @@
#include "raycast_mesh.h"
#include "../common/memory/ksm.hpp"
#include "../common/eqemu_logsys.h"
#include <math.h>
#include <assert.h>
#include <stdlib.h>
@ -9,7 +11,7 @@
// This code snippet allows you to create an axis aligned bounding volume tree for a triangle mesh so that you can do
// high-speed raycasting.
//
// There are much better implementations of this available on the internet. In particular I recommend that you use
// OPCODE written by Pierre Terdiman.
// @see: http://www.codercorner.com/Opcode.htm
//
@ -17,7 +19,7 @@
//
// I am providing this code snippet for the use case where you *only* want to do quick and dirty optimized raycasting.
// I have not done performance testing between this version and OPCODE; so I don't know how much slower it is. However,
// anytime you switch to using a spatial data structure for raycasting, you increase your performance by orders and orders
// of magnitude; so this implementation should work fine for simple tools and utilities.
//
// It also serves as a nice sample for people who are trying to learn the algorithm of how to implement AABB trees.
@ -32,14 +34,14 @@
//
// The official source can be found at: http://code.google.com/p/raycastmesh/
//
//
//
#pragma warning(disable:4100)
namespace RAYCAST_MESH
{
typedef std::vector< RmUint32 > TriVector;
typedef std::vector<RmUint32, PageAlignedAllocator<RmUint32>> TriVector;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
@ -365,7 +367,7 @@ public:
{
RmUint32 ret = 0;
if ( p[0] < mMin[0] )
{
ret|=CC_MINX;
}
@ -374,7 +376,7 @@ public:
ret|=CC_MAXX;
}
if ( p[1] < mMin[1] )
{
ret|=CC_MINY;
}
@ -383,7 +385,7 @@ public:
ret|=CC_MAXY;
}
if ( p[2] < mMin[2] )
{
ret|=CC_MINZ;
}
@ -514,7 +516,7 @@ public:
// the width of the longest axis is less than the minimum axis size then...
// we create the leaf node and copy the triangles into the leaf node triangle array.
if ( count < minLeafSize || depth >= maxDepth || laxis < minAxisSize )
{
// Copy the triangle indices into the leaf triangles array
mLeafTriangleIndex = leafTriangles.size(); // assign the array start location for these leaf triangles.
leafTriangles.push_back(count);
@ -542,7 +544,7 @@ public:
// and another array that includes all triangles which intersect the 'right' half of the bounding volume node.
for (auto i = triangles.begin(); i != triangles.end(); ++i) {
RmUint32 tri = (*i);
{
RmUint32 i1 = indices[tri*3+0];
@ -590,7 +592,7 @@ public:
{
leftBounds.clamp(b1); // we have to clamp the bounding volume so it stays inside the parent volume.
mLeft = callback->getNode(); // get a new AABB node
new ( mLeft ) NodeAABB(leftBounds); // initialize it to default constructor values.
// Then recursively split this node.
mLeft->split(leftTriangles,vcount,vertices,tcount,indices,depth+1,maxDepth,minLeafSize,minAxisSize,callback,leafTriangles);
}
@ -662,7 +664,7 @@ public:
RmReal nd = nearestDistance;
if ( !intersectLineSegmentAABB(mBounds.mMin,mBounds.mMax,from,dir,nd,sect) )
{
return;
}
if ( mLeafTriangleIndex != TRI_EOF )
{
@ -754,28 +756,60 @@ public:
{
mMaxNodeCount+=pow2Table[i];
}
mNodes = new NodeAABB[mMaxNodeCount];
// Allocate page-aligned memory
mNodes = static_cast<NodeAABB*>(KSM::AllocatePageAligned(sizeof(NodeAABB) * mMaxNodeCount));
if (!mNodes) {
throw std::bad_alloc();
}
mNodeCount = 0;
KSM::CheckPageAlignment(mNodes);
mVertices = static_cast<RmReal*>(KSM::AllocatePageAligned(sizeof(RmReal) * 3 * vcount));
if (!mVertices) {
throw std::bad_alloc();
}
std::memcpy(mVertices, vertices, sizeof(RmReal) * 3 * vcount);
mVcount = vcount;
mVertices = (RmReal *)::malloc(sizeof(RmReal)*3*vcount);
memcpy(mVertices,vertices,sizeof(RmReal)*3*vcount);
mIndices = static_cast<RmUint32*>(KSM::AllocatePageAligned(sizeof(RmUint32) * 3 * tcount));
if (!mIndices) {
throw std::bad_alloc();
}
std::memcpy(mIndices, indices, sizeof(RmUint32) * 3 * tcount);
mTcount = tcount;
mIndices = (RmUint32 *)::malloc(sizeof(RmUint32)*tcount*3);
memcpy(mIndices,indices,sizeof(RmUint32)*tcount*3);
mRaycastTriangles = (RmUint32 *)::malloc(tcount*sizeof(RmUint32));
memset(mRaycastTriangles,0,tcount*sizeof(RmUint32));
mRaycastTriangles = static_cast<RmUint32*>(KSM::AllocatePageAligned(sizeof(RmUint32) * tcount));
if (!mRaycastTriangles) {
throw std::bad_alloc();
}
std::memset(mRaycastTriangles, 0, sizeof(RmUint32) * tcount);
mFaceNormals = static_cast<RmReal*>(KSM::AllocatePageAligned(sizeof(RmReal) * 3 * tcount));
if (!mFaceNormals) {
throw std::bad_alloc();
}
std::memset(mFaceNormals, 0, sizeof(RmReal) * 3 * tcount);
// Mark memory as mergeable for KSM
KSM::MarkMemoryForKSM(mVertices, sizeof(RmReal) * 3 * vcount);
KSM::MarkMemoryForKSM(mIndices, sizeof(RmUint32) * 3 * tcount);
KSM::MarkMemoryForKSM(mRaycastTriangles, sizeof(RmUint32) * tcount);
KSM::MarkMemoryForKSM(mFaceNormals, sizeof(RmReal) * 3 * tcount);
mRoot = getNode();
mFaceNormals = NULL;
new ( mRoot ) NodeAABB(mVcount,mVertices,mTcount,mIndices,maxDepth,minLeafSize,minAxisSize,this,mLeafTriangles);
KSM::MarkMemoryForKSM(mLeafTriangles.data(), mLeafTriangles.size() * sizeof(RmUint32));
}
~MyRaycastMesh(void)
{
delete []mNodes;
::free(mVertices);
::free(mIndices);
::free(mFaceNormals);
::free(mRaycastTriangles);
if (mNodes) { free(mNodes); }
if (mVertices) { free(mVertices); }
if (mIndices) { free(mIndices); }
if (mRaycastTriangles) { free(mRaycastTriangles); }
if (mFaceNormals) { free(mFaceNormals); }
}
virtual bool raycast(const RmReal *from,const RmReal *to,RmReal *hitLocation,RmReal *hitNormal,RmReal *hitDistance)
@ -812,7 +846,7 @@ public:
return mRoot->mBounds.mMax;
}
virtual NodeAABB * getNode(void)
{
assert( mNodeCount < mMaxNodeCount );
NodeAABB *ret = &mNodes[mNodeCount];
@ -820,7 +854,7 @@ public:
return ret;
}
virtual void getFaceNormal(RmUint32 tri,RmReal *faceNormal)
{
if ( mFaceNormals == NULL )
{
@ -938,6 +972,29 @@ RaycastMesh * createRaycastMesh(RmUint32 vcount, // The number of vertices in t
)
{
auto m = new MyRaycastMesh(vcount, vertices, tcount, indices, maxDepth, minLeafSize, minAxisSize);
// Calculate memory usage
size_t vertex_size = vcount * sizeof(RmReal) * 3; // Each vertex has 3 floats
size_t index_size = tcount * 3 * sizeof(RmUint32); // Each triangle has 3 indices
size_t bvh_node_size = m->mNodeCount * sizeof(NodeAABB); // BVH Node memory usage
size_t bvh_leaf_size = m->mLeafTriangles.size() * sizeof(RmUint32); // BVH leaf triangles
size_t bvh_size = bvh_node_size + bvh_leaf_size; // Total BVH size
size_t total_size = vertex_size + index_size + bvh_size;
KSM::CheckPageAlignment(m->mNodes);
KSM::CheckPageAlignment(m->mVertices);
LogInfo(
"Map Raycast Memory Usage | Vertices [{:.2f}] MB Indices [{:.2f}] MB BVH Nodes [{:.2f}] MB BVH Leaves [{:.2f}] MB BVH Total [{:.2f}] MB",
vertex_size / (1024.0 * 1024.0),
index_size / (1024.0 * 1024.0),
bvh_node_size / (1024.0 * 1024.0),
bvh_leaf_size / (1024.0 * 1024.0),
bvh_size / (1024.0 * 1024.0)
);
LogInfo("Total Raycast Memory [{:.2f}] MB", total_size / (1024.0 * 1024.0));
return static_cast< RaycastMesh * >(m);
}
@ -984,12 +1041,12 @@ MyRaycastMesh::MyRaycastMesh(std::vector<char>& rm_buffer)
return;
char* buf = rm_buffer.data();
chunk_size = sizeof(RmUint32);
memcpy(&mVcount, buf, chunk_size);
buf += chunk_size;
bytes_read += chunk_size;
chunk_size = (sizeof(RmReal) * (3 * mVcount));
mVertices = (RmReal *)::malloc(chunk_size);
memcpy(mVertices, buf, chunk_size);
@ -1037,7 +1094,7 @@ MyRaycastMesh::MyRaycastMesh(std::vector<char>& rm_buffer)
buf += chunk_size;
bytes_read += chunk_size;
}
chunk_size = sizeof(RmUint32);
memcpy(&mNodeCount, buf, chunk_size);
buf += chunk_size;
@ -1071,7 +1128,7 @@ MyRaycastMesh::MyRaycastMesh(std::vector<char>& rm_buffer)
mNodes[index].mLeft = &mNodes[lNodeIndex];
buf += chunk_size;
bytes_read += chunk_size;
RmUint32 rNodeIndex;
chunk_size = sizeof(RmUint32);
memcpy(&rNodeIndex, buf, chunk_size);
@ -1106,7 +1163,7 @@ MyRaycastMesh::MyRaycastMesh(std::vector<char>& rm_buffer)
void MyRaycastMesh::serialize(std::vector<char>& rm_buffer)
{
rm_buffer.clear();
size_t rm_buffer_size_ = 0;
rm_buffer_size_ += sizeof(RmUint32); // mVcount