dist: introduce scylla-tune-sched.service to tune kernel scheduler
In /usr/lib/sysctl.d/99-scylla-sched.conf, we have some sysctl settings that tune the scheduler for lower latency. This is mostly to prevent softirq threads processing TCP and reactor threads from injecting latency into each other. However, these parameters were moved to debugfs in linux-5.13+, so we lost scheduler tuning on recent kernels. To support tuning on recent kernels, let's add a new service that can configure the scheduler through both sysctl and debugfs. The service is named scylla-tune-sched.service. It is unconditionally enabled when installed; on older kernels it tunes via sysctl, and on recent kernels it tunes via debugfs. Fixes #16077 Closes scylladb/scylladb#16122
This commit is contained in:
15
dist/common/kernel_conf/post_install.sh
vendored
Normal file
15
dist/common/kernel_conf/post_install.sh
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (C) 2023-present ScyllaDB
|
||||
#
|
||||
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
#
|
||||
|
||||
# Without the /run/systemd/system directory there is nothing to configure;
# leave successfully so package installation is not affected.
[ -d /run/systemd/system ] || exit 0

# Both steps are best-effort: a failure here must not fail the caller.
systemctl --system daemon-reload >/dev/null || true
systemctl --system enable --now scylla-tune-sched.service || true
|
||||
73
dist/common/kernel_conf/scylla_tune_sched
vendored
Executable file
73
dist/common/kernel_conf/scylla_tune_sched
vendored
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2023-present ScyllaDB
|
||||
#
|
||||
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import os
|
||||
import sys
|
||||
import errno
|
||||
import logging
|
||||
|
||||
# Root directories under which a tunable may be exposed.
PROCFS = '/proc/sys/kernel'
DEBUGFS = '/sys/kernel/debug'

# Scheduler settings to apply, keyed by dotted tunable name.  Insertion order
# is the order in which the settings are applied.
TUNE_PARAMS = {
    # Keep auto-scaling from touching the values set below.
    'sched.tunable_scaling': 0,

    # Preempt sooner (CFS; only available on <linux-6.6).
    'sched.min_granularity_ns': 500_000,

    # Preempt sooner (EEVDF; only available on >=linux-6.6).
    'sched.base_slice_ns': 500_000,

    # Don't delay unrelated workloads (CFS; only available on <linux-6.6).
    'sched.wakeup_granularity_ns': 450_000,

    # Schedule all tasks within this period (CFS; only available on <linux-6.6).
    'sched.latency_ns': 1_000_000,

    # autogroup seems to prevent sched_latency_ns from being respected.
    'sched.autogroup_enabled': 0,

    # Disable NUMA balancing.
    'numa_balancing': 0,
}
|
||||
|
||||
def write_to(prefix, key, value):
    """Write one tunable below ``prefix``.

    ``key`` is a dotted name such as ``sched.latency_ns``.  Under PROCFS the
    dots become underscores (``sched_latency_ns``); under DEBUGFS they become
    path separators (``sched/latency_ns``).

    Returns True if the value was written.  Returns False if ``prefix`` is
    neither PROCFS nor DEBUGFS, if the target is not an existing regular
    file, or if the write fails.  An unsupported prefix and write failures
    are logged; a missing file is not, so callers can probe several prefixes.
    """
    if prefix == PROCFS:
        path = os.path.join(prefix, key.replace('.', '_'))
    elif prefix == DEBUGFS:
        path = os.path.join(prefix, key.replace('.', '/'))
    else:
        # Without this branch ``path`` would be unbound below.
        logging.error(f'Unsupported prefix: {prefix}')
        return False

    if not os.path.isfile(path):
        return False

    try:
        with open(path, 'w') as f:
            f.write(str(value))
    except OSError as e:
        if e.errno != errno.EINVAL or key != 'sched.tunable_scaling':
            logging.error(str(e))
            return False
        # On Ubuntu 22.04, 5.13.0 kernel has following bug on debugfs:
        # https://lists.openwall.net/linux-kernel/2021/10/01/455
        # It causes "Invalid argument" while writing to tunable_scaling,
        # because the kernel code forgets to null-terminate the string it
        # received from userspace.
        # As a workaround, retry with an explicit '\0' at the tail of the
        # string.
        try:
            with open(path, 'w') as f:
                f.write(f'{value}\0')
        except OSError as retry_error:
            # The retry is best-effort too: report it instead of letting it
            # escape from the handler and abort the remaining tunables.
            logging.error(str(retry_error))
            return False
    except Exception as e:
        # Last-resort guard kept so one bad value never aborts the caller.
        logging.error(str(e))
        return False
    return True
|
||||
|
||||
if __name__ == '__main__':
    for k, v in TUNE_PARAMS.items():
        # Try the /proc/sys/kernel location first; debugfs is only attempted
        # when that write did not succeed.
        applied = write_to(PROCFS, k, v) or write_to(DEBUGFS, k, v)
        if not applied:
            logging.error(f'Failed to set {k} = {v}')
|
||||
17
dist/common/sysctl.d/99-scylla-sched.conf
vendored
17
dist/common/sysctl.d/99-scylla-sched.conf
vendored
@@ -1,17 +0,0 @@
|
||||
# Prevent auto-scaling from doing anything to our tunables
|
||||
kernel.sched_tunable_scaling = 0
|
||||
|
||||
# Preempt sooner
|
||||
kernel.sched_min_granularity_ns = 500000
|
||||
|
||||
# Don't delay unrelated workloads
|
||||
kernel.sched_wakeup_granularity_ns = 450000
|
||||
|
||||
# Schedule all tasks in this period
|
||||
kernel.sched_latency_ns = 1000000
|
||||
|
||||
# autogroup seems to prevent sched_latency_ns from being respected
|
||||
kernel.sched_autogroup_enabled = 0
|
||||
|
||||
# Disable numa balancing
|
||||
kernel.numa_balancing = 0
|
||||
11
dist/common/systemd/scylla-tune-sched.service
vendored
Normal file
11
dist/common/systemd/scylla-tune-sched.service
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Kernel scheduler tuning script for Scylla
|
||||
RequiresMountsFor=/proc
|
||||
RequiresMountsFor=/sys/kernel/debug
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/opt/scylladb/kernel_conf/scylla_tune_sched
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
1
dist/debian/debian/rules
vendored
1
dist/debian/debian/rules
vendored
@@ -33,6 +33,7 @@ endif
|
||||
dh_installinit --no-start --name scylla-housekeeping-daily
|
||||
dh_installinit --no-start --name scylla-housekeeping-restart
|
||||
dh_installinit --no-start --name scylla-fstrim
|
||||
dh_installinit --no-start --name scylla-tune-sched
|
||||
|
||||
override_dh_strip:
|
||||
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
usr/lib/sysctl.d/*.conf
|
||||
opt/scylladb/kernel_conf/*
|
||||
|
||||
@@ -8,11 +8,12 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
|
||||
echo "kernel $KVER detected, skip running sysctl..."
|
||||
else
|
||||
# expect failures in virtualized environments
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-filemax.conf || :
|
||||
fi
|
||||
|
||||
/opt/scylladb/kernel_conf/post_install.sh
|
||||
|
||||
#DEBHELPER#
|
||||
|
||||
9
dist/debian/debian/scylla-kernel-conf.postrm
vendored
Normal file
9
dist/debian/debian/scylla-kernel-conf.postrm
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -e

# Reload unit definitions only when /run/systemd/system exists; a failed
# reload is ignored so that package removal still succeeds.
[ ! -d /run/systemd/system ] || systemctl --system daemon-reload >/dev/null || true
|
||||
|
||||
#DEBHELPER#
|
||||
1
dist/debian/debian/scylla-kernel-conf.scylla-tune-sched.service
vendored
Symbolic link
1
dist/debian/debian/scylla-kernel-conf.scylla-tune-sched.service
vendored
Symbolic link
@@ -0,0 +1 @@
|
||||
../../common/systemd/scylla-tune-sched.service
|
||||
13
dist/redhat/scylla.spec
vendored
13
dist/redhat/scylla.spec
vendored
@@ -208,15 +208,26 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
%post kernel-conf
|
||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||
# following is a "manual" expansion
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-filemax.conf >/dev/null 2>&1 || :
|
||||
/opt/scylladb/kernel_conf/post_install.sh
|
||||
|
||||
%preun kernel-conf
|
||||
if [ $1 -eq 0 ] ; then
|
||||
/usr/bin/systemctl --no-reload disable scylla-tune-sched.service ||:
|
||||
/usr/bin/systemctl stop scylla-tune-sched.service ||:
|
||||
fi
|
||||
|
||||
%postun kernel-conf
|
||||
/usr/bin/systemctl daemon-reload ||:
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
%{_sysctldir}/*.conf
|
||||
%{_unitdir}/scylla-tune-sched.service
|
||||
/opt/scylladb/kernel_conf/*
|
||||
|
||||
|
||||
%package node-exporter
|
||||
|
||||
16
install.sh
16
install.sh
@@ -333,6 +333,21 @@ if ! $nonroot; then
|
||||
installconfig 644 "$file" "$rusr"/lib/sysctl.d
|
||||
done
|
||||
fi
|
||||
install -d -m755 -d "$rprefix"/kernel_conf
|
||||
install -m755 dist/common/kernel_conf/scylla_tune_sched -Dt "$rprefix"/kernel_conf
|
||||
install -m755 dist/common/kernel_conf/post_install.sh "$rprefix"/kernel_conf
|
||||
if ! $without_systemd; then
|
||||
install -m644 dist/common/systemd/scylla-tune-sched.service -Dt "$rsystemd"
|
||||
if ! $nonroot && [ "$prefix" != "/opt/scylladb" ]; then
|
||||
install -d -m755 "$retc"/systemd/system/scylla-tune-sched.service.d
|
||||
cat << EOS > "$retc"/systemd/system/scylla-tune-sched.service.d/execpath.conf
|
||||
[Service]
|
||||
ExecStart=
|
||||
ExecStart=$prefix/kernel_conf/scylla_tune_sched
|
||||
EOS
|
||||
fi
|
||||
fi
|
||||
relocate_python3 "$rprefix"/kernel_conf dist/common/kernel_conf/scylla_tune_sched
|
||||
# scylla-node-exporter
|
||||
if ! $without_systemd; then
|
||||
install -d -m755 "$rsystemd"
|
||||
@@ -618,6 +633,7 @@ elif ! $packaging; then
|
||||
done
|
||||
if ! $supervisor; then
|
||||
$rprefix/scripts/scylla_post_install.sh
|
||||
$rprefix/kernel_conf/post_install.sh
|
||||
fi
|
||||
echo "Scylla offline install completed."
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user