From 6fcd349f374710a3f4e0e0585bb6d7af86ebb66d Mon Sep 17 00:00:00 2001 From: Rye Date: Sun, 2 Feb 2025 02:43:46 -0500 Subject: Fix Tracy memory profiling overloads for aligned allocations Fix disabling renderdoc support Improve ll_aligned_alloc functions on darwin for 32 and 64byte aligned by utilizing posix_memalign --- indra/llcommon/llmemory.h | 49 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'indra/llcommon/llmemory.h') diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h index b616edfde7..72aec57080 100644 --- a/indra/llcommon/llmemory.h +++ b/indra/llcommon/llmemory.h @@ -231,8 +231,6 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; #if defined(LL_WINDOWS) void* ret = _aligned_malloc(size, 32); -#elif defined(LL_DARWIN) - void* ret = ll_aligned_malloc_fallback( size, 32 ); #else void *ret; if (0 != posix_memalign(&ret, 32, size)) @@ -248,8 +246,31 @@ inline void ll_aligned_free_32(void *p) LL_PROFILE_FREE(p); #if defined(LL_WINDOWS) _aligned_free(p); -#elif defined(LL_DARWIN) - ll_aligned_free_fallback( p ); +#else + free(p); // posix_memalign() is compatible with heap deallocator +#endif +} + +inline void* ll_aligned_malloc_64(size_t size) // returned hunk MUST be freed with ll_aligned_free_32(). +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; +#if defined(LL_WINDOWS) + void* ret = _aligned_malloc(size, 64); +#else + void *ret; + if (0 != posix_memalign(&ret, 64, size)) + return nullptr; +#endif + LL_PROFILE_ALLOC(ret, size); + return ret; +} + +inline void ll_aligned_free_64(void *p) +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; + LL_PROFILE_FREE(p); +#if defined(LL_WINDOWS) + _aligned_free(p); #else free(p); // posix_memalign() is compatible with heap deallocator #endif @@ -261,19 +282,23 @@ LL_FORCE_INLINE void* ll_aligned_malloc(size_t size) { LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; void* ret; - if (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0) + if constexpr (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0) { ret = malloc(size); LL_PROFILE_ALLOC(ret, size); } - else if (ALIGNMENT == 16) + else if constexpr (ALIGNMENT == 16) { ret = ll_aligned_malloc_16(size); } - else if (ALIGNMENT == 32) + else if constexpr (ALIGNMENT == 32) { ret = ll_aligned_malloc_32(size); } + else if constexpr (ALIGNMENT == 64) + { + ret = ll_aligned_malloc_64(size); + } else { ret = ll_aligned_malloc_fallback(size, ALIGNMENT); @@ -285,16 +310,20 @@ template LL_FORCE_INLINE void ll_aligned_free(void* ptr) { LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; - if (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN) + if constexpr (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN) { LL_PROFILE_FREE(ptr); free(ptr); } - else if (ALIGNMENT == 16) + else if constexpr (ALIGNMENT == 16) { ll_aligned_free_16(ptr); } - else if (ALIGNMENT == 32) + else if constexpr (ALIGNMENT == 32) + { + return ll_aligned_free_32(ptr); + } + else if constexpr (ALIGNMENT == 64) { return ll_aligned_free_32(ptr); } -- cgit v1.3 From 4ab2a80e6c8ab6c183143fbbca2c3386088caeb6 Mon Sep 17 00:00:00 2001 From: Rye Date: Mon, 10 Feb 2025 14:08:56 -0500 Subject: Use SSE2NEON to emulate SSE intrinsics when building against an ARM target --- indra/llcommon/llmemory.h | 8 ++++++++ indra/llmath/llsimdmath.h | 14 ++++++++++++-- indra/llmath/llvector4a.inl | 18 +++++++++++++++++- indra/llmath/tests/llquaternion_test.cpp | 6 +++--- indra/newview/llappviewer.cpp | 11 ++++++++++- 5 files changed, 50 insertions(+), 7 deletions(-) (limited to 'indra/llcommon/llmemory.h') diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h index 72aec57080..adc556d180 100644 --- a/indra/llcommon/llmemory.h +++ b/indra/llcommon/llmemory.h @@ -71,7 +71,11 @@ LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment); #define ll_assert_aligned(ptr,alignment) #endif +#if LL_ARM64 +#include "sse2neon.h" +#else #include +#endif template T* LL_NEXT_ALIGNED_ADDRESS(T* address) { @@ -339,6 +343,9 @@ LL_FORCE_INLINE void ll_aligned_free(void* ptr) inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes) { LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY; +#if defined(LL_ARM64) + memcpy(dst, src, bytes); +#else assert(src != NULL); assert(dst != NULL); assert(bytes > 0); @@ -404,6 +411,7 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __ dst += 16; src += 16; } +#endif } #ifndef __DEBUG_PRIVATE_MEM__ diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h index 40953dc2e8..b27b034cf3 100644 --- a/indra/llmath/llsimdmath.h +++ b/indra/llmath/llsimdmath.h @@ -31,16 +31,26 @@ #error "Please include llmath.h before this file." #endif -#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) ) -#error SSE2 not enabled. LLVector4a and related class will not compile. +// the check for this error case must be split into multiple parts +// because some versions of VS complain about '__SSE2__' +#if ( ( LL_DARWIN || LL_LINUX ) ) + #if !(__SSE2__) && !(__arm64__) && !(__aarch64__) + #error SSE2 not enabled. LLVector4a and related class will not compile. + #endif +#elif ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) ) + #error SSE2 not enabled. LLVector4a and related class will not compile. #endif #if !LL_WINDOWS #include #endif +#if defined(__arm64__) || defined(__aarch64__) +#include "sse2neon.h" +#else #include #include +#endif #include "llmemory.h" #include "llsimdtypes.h" diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl index 77a0257fbf..443a46c317 100644 --- a/indra/llmath/llvector4a.inl +++ b/indra/llmath/llvector4a.inl @@ -115,7 +115,7 @@ inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w) // Set to all zeros inline void LLVector4a::clear() { - mQ = LLVector4a::getZero().mQ; + mQ = _mm_setzero_ps(); } inline void LLVector4a::splat(const F32 x) @@ -272,6 +272,9 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) // Set all elements to the dot product of the x, y, and z elements in a and b inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) { +#if (defined(__arm64__) || defined(__aarch64__)) + mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f); +#else // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } @@ -284,11 +287,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 )); // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat); +#endif } // Set all elements to the dot product of the x, y, z, and w elements in a and b inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) { +#if (defined(__arm64__) || defined(__aarch64__)) + mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff); +#else // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } @@ -301,21 +308,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat); +#endif } // Return the 3D dot product of this vector and b inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const { +#if (defined(__arm64__) || defined(__aarch64__)) + return _mm_dp_ps(mQ, b.mQ, 0x7f); +#else const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) ); const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) ); const LLQuad xPlusY = _mm_add_ps( ab, splatY ); return _mm_add_ps( xPlusY, splatZ ); +#endif } // Return the 4D dot product of this vector and b inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const { +#if (defined(__arm64__) || defined(__aarch64__)) + return _mm_dp_ps(mQ, b.mQ, 0xff); +#else // ab = { w, z, y, x } const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); // upperProdsInLowerElems = { y, x, y, x } @@ -325,6 +340,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const // shuffled = { z+x, z+x, z+x, z+x } const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) ); return _mm_add_ss( sumOfPairs, shuffled ); +#endif } // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed diff --git a/indra/llmath/tests/llquaternion_test.cpp b/indra/llmath/tests/llquaternion_test.cpp index aa3c0ad843..ba18d54d55 100644 --- a/indra/llmath/tests/llquaternion_test.cpp +++ b/indra/llmath/tests/llquaternion_test.cpp @@ -349,9 +349,9 @@ namespace tut ensure( "2. LLVector4 operator*(const LLVector4 &a, const LLQuaternion &rot) failed", is_approx_equal(-58153.5390f, result.mV[0]) && - (183787.8125f == result.mV[1]) && - (116864.164063f == result.mV[2]) && - (78.099998f == result.mV[3])); + is_approx_equal(183787.8125f, result.mV[1]) && + is_approx_equal(116864.164063f, result.mV[2]) && + is_approx_equal(78.099998f, result.mV[3])); } //test case for LLVector3 operator*(const LLVector3 &a, const LLQuaternion &rot) fn. diff --git a/indra/newview/llappviewer.cpp b/indra/newview/llappviewer.cpp index 84cce2348a..9392d111f2 100644 --- a/indra/newview/llappviewer.cpp +++ b/indra/newview/llappviewer.cpp @@ -972,6 +972,7 @@ bool LLAppViewer::init() return false; } +#if defined(LL_X86) || defined(LL_X86_64) // Without SSE2 support we will crash almost immediately, warn here. if (!gSysCPU.hasSSE2()) { @@ -983,6 +984,7 @@ bool LLAppViewer::init() // quit immediately return false; } +#endif // alert the user if they are using unsupported hardware if (!gSavedSettings.getBOOL("AlertedUnsupportedHardware")) @@ -1268,7 +1270,7 @@ void LLAppViewer::initMaxHeapSize() //------------------------------------------------------------------------------------------ //currently SL is built under 32-bit setting, we set its max heap size no more than 1.6 GB. - #ifndef LL_X86_64 + #if !defined(LL_X86_64) && !defined(LL_ARM64) F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize") ; #else F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize64"); @@ -3246,6 +3248,11 @@ LLSD LLAppViewer::getViewerInfo() const info["VIEWER_VERSION_STR"] = versionInfo.getVersion(); info["CHANNEL"] = versionInfo.getChannel(); info["ADDRESS_SIZE"] = ADDRESS_SIZE; +#if LL_ARM64 + info["ARCHITECTURE"] = "ARM"; +#else + info["ARCHITECTURE"] = "x86"; +#endif std::string build_config = versionInfo.getBuildConfig(); if (build_config != "Release") { @@ -5538,7 +5545,9 @@ void LLAppViewer::forceErrorBreakpoint() #ifdef LL_WINDOWS DebugBreak(); #else +#if defined(LL_X86) || defined(LL_X86_64) asm ("int $3"); +#endif #endif return; } -- cgit v1.3