From 6fcd349f374710a3f4e0e0585bb6d7af86ebb66d Mon Sep 17 00:00:00 2001
From: Rye <rye@lindenlab.com>
Date: Sun, 2 Feb 2025 02:43:46 -0500
Subject: Fix Tracy memory profiling overloads for aligned allocations; fix
 disabling RenderDoc support; improve ll_aligned_alloc functions on Darwin for
 32- and 64-byte alignment by utilizing posix_memalign

---
 indra/llcommon/llmemory.h | 49 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 10 deletions(-)

(limited to 'indra/llcommon/llmemory.h')

diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index b616edfde7..72aec57080 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -231,8 +231,6 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
 #if defined(LL_WINDOWS)
     void* ret = _aligned_malloc(size, 32);
-#elif defined(LL_DARWIN)
-    void* ret = ll_aligned_malloc_fallback( size, 32 );
 #else
     void *ret;
     if (0 != posix_memalign(&ret, 32, size))
@@ -248,8 +246,31 @@ inline void ll_aligned_free_32(void *p)
     LL_PROFILE_FREE(p);
 #if defined(LL_WINDOWS)
     _aligned_free(p);
-#elif defined(LL_DARWIN)
-    ll_aligned_free_fallback( p );
+#else
+    free(p); // posix_memalign() is compatible with heap deallocator
+#endif
+}
+
+inline void* ll_aligned_malloc_64(size_t size) // returned hunk MUST be freed with ll_aligned_free_64().
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
+#if defined(LL_WINDOWS)
+    void* ret = _aligned_malloc(size, 64);
+#else
+    void *ret;
+    if (0 != posix_memalign(&ret, 64, size))
+        return nullptr; // posix_memalign() returned non-zero (failure); bail out before the profiler allocation hook
+#endif
+    LL_PROFILE_ALLOC(ret, size);
+    return ret;
+}
+
+inline void ll_aligned_free_64(void *p)
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
+    LL_PROFILE_FREE(p);
+#if defined(LL_WINDOWS)
+    _aligned_free(p);
 #else
     free(p); // posix_memalign() is compatible with heap deallocator
 #endif
@@ -261,19 +282,23 @@ LL_FORCE_INLINE void* ll_aligned_malloc(size_t size)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
     void* ret;
-    if (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0)
+    if constexpr (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0)
     {
         ret = malloc(size);
         LL_PROFILE_ALLOC(ret, size);
     }
-    else if (ALIGNMENT == 16)
+    else if constexpr (ALIGNMENT == 16)
     {
         ret = ll_aligned_malloc_16(size);
     }
-    else if (ALIGNMENT == 32)
+    else if constexpr (ALIGNMENT == 32)
     {
         ret = ll_aligned_malloc_32(size);
     }
+    else if constexpr (ALIGNMENT == 64)
+    {
+        ret = ll_aligned_malloc_64(size);
+    }
     else
     {
         ret = ll_aligned_malloc_fallback(size, ALIGNMENT);
@@ -285,16 +310,20 @@ template<size_t ALIGNMENT>
 LL_FORCE_INLINE void ll_aligned_free(void* ptr)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
-    if (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN)
+    if constexpr (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN)
     {
         LL_PROFILE_FREE(ptr);
         free(ptr);
     }
-    else if (ALIGNMENT == 16)
+    else if constexpr (ALIGNMENT == 16)
     {
         ll_aligned_free_16(ptr);
     }
-    else if (ALIGNMENT == 32)
+    else if constexpr (ALIGNMENT == 32)
+    {
+        return ll_aligned_free_32(ptr);
+    }
+    else if constexpr (ALIGNMENT == 64)
     {
-        return ll_aligned_free_32(ptr);
+        return ll_aligned_free_64(ptr); // 64-byte blocks come from ll_aligned_malloc_64(); release them with the matching deallocator
     }
-- 
cgit v1.3


From 4ab2a80e6c8ab6c183143fbbca2c3386088caeb6 Mon Sep 17 00:00:00 2001
From: Rye <rye@lindenlab.com>
Date: Mon, 10 Feb 2025 14:08:56 -0500
Subject: Use SSE2NEON to emulate SSE intrinsics when building against an ARM
 target

---
 indra/llcommon/llmemory.h                |  8 ++++++++
 indra/llmath/llsimdmath.h                | 14 ++++++++++++--
 indra/llmath/llvector4a.inl              | 18 +++++++++++++++++-
 indra/llmath/tests/llquaternion_test.cpp |  6 +++---
 indra/newview/llappviewer.cpp            | 11 ++++++++++-
 5 files changed, 50 insertions(+), 7 deletions(-)

(limited to 'indra/llcommon/llmemory.h')

diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index 72aec57080..adc556d180 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -71,7 +71,11 @@ LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
 #define ll_assert_aligned(ptr,alignment)
 #endif
 
+#if LL_ARM64
+#include "sse2neon.h"
+#else
 #include <xmmintrin.h>
+#endif
 
 template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
 {
@@ -339,6 +343,9 @@ LL_FORCE_INLINE void ll_aligned_free(void* ptr)
 inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
+#if defined(LL_ARM64)
+    memcpy(dst, src, bytes);
+#else
     assert(src != NULL);
     assert(dst != NULL);
     assert(bytes > 0);
@@ -404,6 +411,7 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __
         dst += 16;
         src += 16;
     }
+#endif
 }
 
 #ifndef __DEBUG_PRIVATE_MEM__
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
index 40953dc2e8..b27b034cf3 100644
--- a/indra/llmath/llsimdmath.h
+++ b/indra/llmath/llsimdmath.h
@@ -31,16 +31,26 @@
 #error "Please include llmath.h before this file."
 #endif
 
-#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) )
-#error SSE2 not enabled. LLVector4a and related class will not compile.
+// the check for this error case must be split into multiple parts
+// because some versions of VS complain about '__SSE2__'
+#if ( ( LL_DARWIN || LL_LINUX ) )
+    #if !(__SSE2__) && !(__arm64__) && !(__aarch64__)
+        #error SSE2 not enabled. LLVector4a and related class will not compile.
+    #endif
+#elif ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) )
+    #error SSE2 not enabled. LLVector4a and related class will not compile.
 #endif
 
 #if !LL_WINDOWS
 #include <stdint.h>
 #endif
 
+#if defined(__arm64__) || defined(__aarch64__)
+#include "sse2neon.h"
+#else
 #include <xmmintrin.h>
 #include <emmintrin.h>
+#endif
 
 #include "llmemory.h"
 #include "llsimdtypes.h"
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 77a0257fbf..443a46c317 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -115,7 +115,7 @@ inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
 // Set to all zeros
 inline void LLVector4a::clear()
 {
-    mQ = LLVector4a::getZero().mQ;
+    mQ = _mm_setzero_ps(); // zero the register directly rather than copying mQ out of LLVector4a::getZero()
 }
 
 inline void LLVector4a::splat(const F32 x)
@@ -272,6 +272,9 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
 // Set all elements to the dot product of the x, y, and z elements in a and b
 inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f);
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -284,11 +287,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
     const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
     // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+#endif
 }
 
 // Set all elements to the dot product of the x, y, z, and w elements in a and b
 inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff);
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -301,21 +308,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
 
     // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+#endif
 }
 
 // Return the 3D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0x7f); // imm8 0x7f: multiply the x, y, z lanes (bits 4-6) and store the sum in all four lanes (bits 0-3)
+#else
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
     const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
     const LLQuad xPlusY = _mm_add_ps( ab, splatY );
     return _mm_add_ps( xPlusY, splatZ );
+#endif
 }
 
 // Return the 4D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0xff);
+#else
     // ab = { w, z, y, x }
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     // upperProdsInLowerElems = { y, x, y, x }
@@ -325,6 +340,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
     // shuffled = { z+x, z+x, z+x, z+x }
     const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
     return _mm_add_ss( sumOfPairs, shuffled );
+#endif
 }
 
 // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
diff --git a/indra/llmath/tests/llquaternion_test.cpp b/indra/llmath/tests/llquaternion_test.cpp
index aa3c0ad843..ba18d54d55 100644
--- a/indra/llmath/tests/llquaternion_test.cpp
+++ b/indra/llmath/tests/llquaternion_test.cpp
@@ -349,9 +349,9 @@ namespace tut
         ensure(
             "2. LLVector4 operator*(const LLVector4 &a, const LLQuaternion &rot) failed",
             is_approx_equal(-58153.5390f, result.mV[0]) &&
-            (183787.8125f == result.mV[1]) &&
-            (116864.164063f == result.mV[2]) &&
-            (78.099998f == result.mV[3]));
+            is_approx_equal(183787.8125f, result.mV[1]) &&
+            is_approx_equal(116864.164063f, result.mV[2]) &&
+            is_approx_equal(78.099998f, result.mV[3]));
     }
 
     //test case for LLVector3 operator*(const LLVector3 &a, const LLQuaternion &rot) fn.
diff --git a/indra/newview/llappviewer.cpp b/indra/newview/llappviewer.cpp
index 84cce2348a..9392d111f2 100644
--- a/indra/newview/llappviewer.cpp
+++ b/indra/newview/llappviewer.cpp
@@ -972,6 +972,7 @@ bool LLAppViewer::init()
         return false;
     }
 
+#if defined(LL_X86) || defined(LL_X86_64)
     // Without SSE2 support we will crash almost immediately, warn here.
     if (!gSysCPU.hasSSE2())
     {
@@ -983,6 +984,7 @@ bool LLAppViewer::init()
         // quit immediately
         return false;
     }
+#endif
 
     // alert the user if they are using unsupported hardware
     if (!gSavedSettings.getBOOL("AlertedUnsupportedHardware"))
@@ -1268,7 +1270,7 @@ void LLAppViewer::initMaxHeapSize()
     //------------------------------------------------------------------------------------------
     //currently SL is built under 32-bit setting, we set its max heap size no more than 1.6 GB.
 
- #ifndef LL_X86_64
+ #if !defined(LL_X86_64) && !defined(LL_ARM64)
     F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize") ;
 #else
     F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize64");
@@ -3246,6 +3248,11 @@ LLSD LLAppViewer::getViewerInfo() const
     info["VIEWER_VERSION_STR"] = versionInfo.getVersion();
     info["CHANNEL"] = versionInfo.getChannel();
     info["ADDRESS_SIZE"] = ADDRESS_SIZE;
+#if LL_ARM64
+    info["ARCHITECTURE"] = "ARM";
+#else
+    info["ARCHITECTURE"] = "x86";
+#endif
     std::string build_config = versionInfo.getBuildConfig();
     if (build_config != "Release")
     {
@@ -5538,7 +5545,9 @@ void LLAppViewer::forceErrorBreakpoint()
 #ifdef LL_WINDOWS
     DebugBreak();
 #else
+#if defined(LL_X86) || defined(LL_X86_64)
     asm ("int $3");
+#endif
 #endif
     return;
 }
-- 
cgit v1.3