build: limit ThinLTO link parallelism to prevent OOM in release builds

When building Scylla with ThinLTO enabled (default with Clang), the linker spawns threads equal to the number of CPU cores during linking. This high parallelism can cause out-of-memory (OOM) issues in CI environments, potentially freezing the build host or triggering the OOM killer. In this change: 1. Rename `LINK_MEM_PER_JOB` to `Scylla_RAM_PER_LINK_JOB` and make it user-configurable 2. Add `Scylla_PARALLEL_LINK_JOBS` option to directly control concurrent link jobs (useful for hosts with large RAM) 3. Increase the default value of `Scylla_RAM_PER_LINK_JOB` to 16 GiB when LTO is enabled 4. Default to 2 parallel link jobs when LTO is enabled if the calculated number is less than 2, for faster builds. Notes: - Host memory is shared across job pools, so pool separation alone doesn't help - Ninja lacks per-job memory quota support - Only affects link parallelism in LTO-enabled builds See https://clang.llvm.org/docs/ThinLTO.html#controlling-backend-parallelism Fixes scylladb/scylladb#22275 Signed-off-by: Kefu Chai <kefu.chai@scylladb.com> Closes scylladb/scylladb#22383
2025-01-16 17:58:03 +08:00
parent 3ac533251a
commit 6e1fb2c74e
2 changed files with 35 additions and 8 deletions
--- a/cmake/limit_jobs.cmake
+++ b/cmake/limit_jobs.cmake
@@ -1,16 +1,36 @@
-set(LINK_MEM_PER_JOB 4096 CACHE INTERNAL "Maximum memory used by each link job in (in MiB)")
+if(NOT DEFINED Scylla_PARALLEL_LINK_JOBS)
+  if(NOT DEFINED Scylla_RAM_PER_LINK_JOB)
+    # no user-provided value, so pick a default depending on whether LTO is enabled
+    set(_default_ram_value 4096)
+    if(Scylla_ENABLE_LTO)
+      # When ThinLTO optimization is enabled, the linker uses all available CPU threads.
+      # To prevent excessive memory usage, we limit parallel link jobs based on available RAM,
+      # as each link job requires significant memory during optimization.
+      set(_default_ram_value 16384)
+    endif()
+    set(Scylla_RAM_PER_LINK_JOB ${_default_ram_value} CACHE STRING
+      "Maximum amount of memory used by each link job (in MiB)")
+  endif()
+  cmake_host_system_information(
+    RESULT _total_mem_mb
+    QUERY AVAILABLE_PHYSICAL_MEMORY)
+  math(EXPR _link_pool_depth "${_total_mem_mb} / ${Scylla_RAM_PER_LINK_JOB}")
+  # Use at least 2 parallel link jobs to optimize build throughput. The main executable
+  # requires LTO (slower link phase) while tests are linked without LTO (faster link
+  # phase). Two slots allow simultaneous linking of LTO and non-LTO targets, enabling
+  # better CPU utilization by overlapping the slower LTO link with faster test links.
+  if(_link_pool_depth LESS 2)
+    set(_link_pool_depth 2)
+  endif()

-cmake_host_system_information(
-  RESULT _total_mem
-  QUERY AVAILABLE_PHYSICAL_MEMORY)
-math(EXPR _link_pool_depth "${_total_mem} / ${LINK_MEM_PER_JOB}")
-if(_link_pool_depth EQUAL 0)
-  set(_link_pool_depth 1)
+  set(Scylla_PARALLEL_LINK_JOBS "${_link_pool_depth}" CACHE STRING
+    "Maximum number of concurrent link jobs")
 endif()
+
 set_property(
  GLOBAL
  APPEND
  PROPERTY JOB_POOLS
-    link_pool=${_link_pool_depth}
+    link_pool=${Scylla_PARALLEL_LINK_JOBS}
    submodule_pool=1)
 set(CMAKE_JOB_POOL_LINK link_pool)
--- a/configure.py
+++ b/configure.py
@@ -1664,7 +1664,14 @@ defines = ' '.join(['-D' + d for d in defines])
 globals().update(vars(args))

 total_memory = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
+# assuming each link job takes around 7GB (7e9 bytes) of memory without LTO
 link_pool_depth = max(int(total_memory / 7e9), 1)
+if args.lto:
+    # ThinLTO parallelizes each link job internally, so budget 16GB (16e9 bytes) of RAM
+    # per link job, allow at least 2 jobs, and never exceed the non-LTO depth
+    depth_with_lto = max(int(total_memory / 16e9), 2)
+    if depth_with_lto < link_pool_depth:
+        link_pool_depth = depth_with_lto

 selected_modes = args.selected_modes or modes.keys()
 default_modes = args.selected_modes or [mode for mode, mode_cfg in modes.items() if mode_cfg["default"]]