dist: introduce scylla-tune-sched.service to tune kernel scheduler

In /usr/lib/sysctl.d/99-scylla-sched.conf, we have some sysctl settings that
tune the scheduler for lower latency.
This is mostly to prevent softirq threads processing TCP and reactor threads
from injecting latency into each other.
However, these parameters were moved to debugfs in linux-5.13+, so we lost
scheduler tuning on recent kernels.

To support tuning on recent kernels, let's add a new service that can
configure the scheduler through either sysctl or debugfs.
The service is named scylla-tune-sched.service.
It is unconditionally enabled when installed; on older kernels
it tunes via sysctl, and on recent kernels it tunes via debugfs.

Fixes #16077

Closes scylladb/scylladb#16122
This commit is contained in:
Takuya ASADA
2023-11-21 00:49:20 +09:00
committed by Avi Kivity
parent 3ffd8737e4
commit 6eb9344cb3
11 changed files with 141 additions and 19 deletions

15
dist/common/kernel_conf/post_install.sh vendored Normal file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
#
# Copyright (C) 2023-present ScyllaDB
#
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Do nothing unless /run/systemd/system exists; the systemctl calls below
# are only attempted when that directory is present.
[ -d /run/systemd/system ] || exit 0

# Both steps are best effort: `|| true` keeps a systemctl failure from
# becoming this script's exit status.
systemctl --system daemon-reload >/dev/null || true
systemctl --system enable --now scylla-tune-sched.service || true

73
dist/common/kernel_conf/scylla_tune_sched vendored Executable file
View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
#
# Copyright 2023-present ScyllaDB
#
#
# SPDX-License-Identifier: AGPL-3.0-or-later
import os
import sys
import errno
import logging
# Root directories of the two kernel interfaces the tunables may live under.
PROCFS = '/proc/sys/kernel'
DEBUGFS = '/sys/kernel/debug'

# Tunable name -> value to write. Names are dotted; write_to() turns the dots
# into '_' under PROCFS and into '/' under DEBUGFS.
TUNE_PARAMS = {
    # Keep the kernel's auto-scaling away from the values set below.
    'sched.tunable_scaling': 0,
    # CFS only (kernels older than linux-6.6): preempt sooner.
    'sched.min_granularity_ns': 500_000,
    # EEVDF only (linux-6.6 and newer): preempt sooner.
    'sched.base_slice_ns': 500_000,
    # CFS only (kernels older than linux-6.6): don't delay unrelated workloads.
    'sched.wakeup_granularity_ns': 450_000,
    # CFS only (kernels older than linux-6.6): schedule all tasks in this period.
    'sched.latency_ns': 1_000_000,
    # autogroup seems to prevent sched_latency_ns from being respected.
    'sched.autogroup_enabled': 0,
    # Disable NUMA balancing.
    'numa_balancing': 0,
}
def write_to(prefix, key, value):
    """Write a single scheduler tunable under PROCFS or DEBUGFS.

    Args:
        prefix: PROCFS or DEBUGFS; selects how the dotted key maps to a path.
        key: dotted tunable name, e.g. 'sched.latency_ns'.
        value: value to write; converted with str().

    Returns:
        True if the value was written. False if the prefix is not supported,
        the file does not exist under this prefix, or the write failed
        (failures are logged). No exception escapes for a failed write.
    """
    if prefix == PROCFS:
        # sysctl-style name: sched.latency_ns -> sched_latency_ns
        path = os.path.join(prefix, key.replace('.', '_'))
    elif prefix == DEBUGFS:
        # debugfs-style path: sched.latency_ns -> sched/latency_ns
        path = os.path.join(prefix, key.replace('.', '/'))
    else:
        # Without this branch `path` would be unbound below.
        logging.error(f'Unsupported prefix: {prefix}')
        return False
    # A missing file just means this tunable is not exposed here; the caller
    # may try the other prefix, so this is not logged as an error.
    if not os.path.isfile(path):
        return False
    try:
        with open(path, 'w') as f:
            f.write(str(value))
    except OSError as e:
        if e.errno != errno.EINVAL or key != 'sched.tunable_scaling':
            logging.error(str(e))
            return False
        # On Ubuntu 22.04, the 5.13.0 kernel has the following debugfs bug:
        # https://lists.openwall.net/linux-kernel/2021/10/01/455
        # It causes "Invalid argument" when writing to tunable_scaling,
        # because the kernel forgets to null-terminate the string it
        # received from userspace.
        # As a workaround, append '\0' to the string ourselves.
        try:
            with open(path, 'w') as f:
                f.write(f'{value}\0')
        except Exception as retry_error:
            # The retry can fail too; report it like any other write failure
            # instead of letting the exception escape.
            logging.error(str(retry_error))
            return False
    except Exception as e:
        logging.error(str(e))
        return False
    return True
if __name__ == '__main__':
    # Try each tunable under PROCFS first and fall back to DEBUGFS; log an
    # error only when neither location accepted the value.
    for name, setting in TUNE_PARAMS.items():
        applied = write_to(PROCFS, name, setting) or write_to(DEBUGFS, name, setting)
        if not applied:
            logging.error(f'Failed to set {name} = {setting}')

View File

@@ -1,17 +0,0 @@
# Prevent auto-scaling from doing anything to our tunables
kernel.sched_tunable_scaling = 0
# Preempt sooner
kernel.sched_min_granularity_ns = 500000
# Don't delay unrelated workloads
kernel.sched_wakeup_granularity_ns = 450000
# Schedule all tasks in this period
kernel.sched_latency_ns = 1000000
# autogroup seems to prevent sched_latency_ns from being respected
kernel.sched_autogroup_enabled = 0
# Disable numa balancing
kernel.numa_balancing = 0

View File

@@ -0,0 +1,11 @@
[Unit]
Description=Kernel scheduler tuning script for Scylla
# The tuning script writes under /proc/sys/kernel and /sys/kernel/debug, so
# both filesystems must be mounted before it runs.
RequiresMountsFor=/proc
RequiresMountsFor=/sys/kernel/debug

[Service]
# The script applies the settings and exits, so it is a oneshot job rather
# than a long-running daemon: systemd waits for it to finish before treating
# the unit as started, and a non-zero exit is reported as a failed start.
Type=oneshot
ExecStart=/opt/scylladb/kernel_conf/scylla_tune_sched

[Install]
WantedBy=multi-user.target

View File

@@ -33,6 +33,7 @@ endif
dh_installinit --no-start --name scylla-housekeeping-daily
dh_installinit --no-start --name scylla-housekeeping-restart
dh_installinit --no-start --name scylla-fstrim
dh_installinit --no-start --name scylla-tune-sched
override_dh_strip:
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are

View File

@@ -1 +1,2 @@
usr/lib/sysctl.d/*.conf
opt/scylladb/kernel_conf/*

View File

@@ -8,11 +8,12 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
echo "kernel $KVER detected, skip running sysctl..."
else
# expect failures in virtualized environments
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-filemax.conf || :
fi
/opt/scylladb/kernel_conf/post_install.sh
#DEBHELPER#

View File

@@ -0,0 +1,9 @@
#!/bin/sh
set -e

# Reload systemd's unit files, but only when /run/systemd/system exists.
if test -d /run/systemd/system; then
    # Best effort: `|| true` keeps a failed reload from aborting the script
    # under `set -e`.
    systemctl --system daemon-reload >/dev/null || true
fi

#DEBHELPER#

View File

@@ -0,0 +1 @@
../../common/systemd/scylla-tune-sched.service

View File

@@ -208,15 +208,26 @@ if Scylla is the main application on your server and you wish to optimize its la
%post kernel-conf
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-filemax.conf >/dev/null 2>&1 || :
/opt/scylladb/kernel_conf/post_install.sh
%preun kernel-conf
if [ $1 -eq 0 ] ; then
/usr/bin/systemctl --no-reload disable scylla-tune-sched.service ||:
/usr/bin/systemctl stop scylla-tune-sched.service ||:
fi
%postun kernel-conf
/usr/bin/systemctl daemon-reload ||:
%files kernel-conf
%defattr(-,root,root)
%{_sysctldir}/*.conf
%{_unitdir}/scylla-tune-sched.service
/opt/scylladb/kernel_conf/*
%package node-exporter

View File

@@ -333,6 +333,21 @@ if ! $nonroot; then
installconfig 644 "$file" "$rusr"/lib/sysctl.d
done
fi
# Kernel scheduler tuning: install the tuning script and its post-install
# hook, plus (unless systemd support is disabled) the unit that runs it.
install -d -m755 "$rprefix"/kernel_conf
install -m755 dist/common/kernel_conf/scylla_tune_sched -Dt "$rprefix"/kernel_conf
install -m755 dist/common/kernel_conf/post_install.sh "$rprefix"/kernel_conf
if ! $without_systemd; then
    install -m644 dist/common/systemd/scylla-tune-sched.service -Dt "$rsystemd"
    # The shipped unit hard-codes ExecStart under /opt/scylladb. For a root
    # install into any other prefix, write a drop-in that clears ExecStart
    # (the empty assignment) and points it at the actual install location.
    if ! $nonroot && [ "$prefix" != "/opt/scylladb" ]; then
        install -d -m755 "$retc"/systemd/system/scylla-tune-sched.service.d
        cat << EOS > "$retc"/systemd/system/scylla-tune-sched.service.d/execpath.conf
[Service]
ExecStart=
ExecStart=$prefix/kernel_conf/scylla_tune_sched
EOS
    fi
fi
relocate_python3 "$rprefix"/kernel_conf dist/common/kernel_conf/scylla_tune_sched
# scylla-node-exporter
if ! $without_systemd; then
install -d -m755 "$rsystemd"
@@ -618,6 +633,7 @@ elif ! $packaging; then
done
if ! $supervisor; then
$rprefix/scripts/scylla_post_install.sh
$rprefix/kernel_conf/post_install.sh
fi
echo "Scylla offline install completed."
fi