summaryrefslogtreecommitdiffstats
path: root/bin/reproducible_node_health_check.sh
blob: c4b42a24c5e3528db5ee7200f8697b7ab7e0de67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/bin/bash

# Copyright 2014-2017 Holger Levsen <holger@layer-acht.org>
#         © 2015 Mattia Rizzolo <mattia@mapreri.org>
# released under the GPLv=2

# DEBUG is assigned before the shared helpers are sourced — presumably
# common-functions.sh / common_init consult it; TODO confirm there.
DEBUG=false
. /srv/jenkins/bin/common-functions.sh
# common_init is not defined in this file (presumably it comes from the file
# sourced just above); it receives this script's own arguments unchanged.
common_init "$@"

# common code defining db access
. /srv/jenkins/bin/reproducible_common.sh

# some defaults
# DIRTY becomes "true" whenever a check in this script emits a warning; the
# closing "Everything seems to be fine." line is printed only if it is still
# "false" at the end.
DIRTY=false
# NOTE(review): REP_RESULTS is not referenced again in this script —
# presumably the sourced code reads it; verify before removing.
REP_RESULTS=/srv/reproducible-results

#
# Dump diagnostics about the shared-memory mounts of this host and mark the
# run as dirty.
#
# Prints, each under a banner naming $HOSTNAME: the directory entries of
# /run/shm and /dev/shm, the content of /etc/fstab and the output of 'mount'.
#
# Globals:   HOSTNAME (read), DIRTY (set to "true")
# Arguments: none
#
# This is called when something about these paths is already known to be
# wrong, and the script runs with 'set -e'. Every diagnostic command is
# therefore guarded with '|| true': a failing 'ls' (e.g. /run/shm does not
# exist) must not abort the script before the remaining diagnostics are
# printed and DIRTY is recorded.
#
show_fstab_and_mounts() {
	echo "################################"
	echo "/dev/shm and /run/shm on $HOSTNAME"
	echo "################################"
	ls -lartd /run/shm /dev/shm/ || true
	echo "################################"
	echo "/etc/fstab on $HOSTNAME"
	echo "################################"
	cat /etc/fstab || true
	echo "################################"
	echo "mount output on $HOSTNAME"
	echo "################################"
	mount || true
	echo "################################"
	DIRTY=true
}

#
# we fail hard
#
# (with errexit enabled, any unguarded command that returns non-zero ends
# the script immediately)
#
set -e

#
# is the filesystem writable?
#
echo "$(date -u) - testing whether /tmp is writable..."
# mktemp runs inside the 'if' condition on purpose: as a plain assignment its
# failure would trip 'set -e' and end the script before the explanation below
# could be printed.
if ! TEST=$(mktemp --tmpdir=/tmp rwtest-XXXXXX) || [ -z "$TEST" ] ; then
	echo "Failure to write a file in /tmp, assuming read-only filesystem."
	exit 1
fi
# the probe file has served its purpose
rm -- "$TEST" > /dev/null

#
# check for /dev/shm being mounted properly
#
# Both probes are used directly as 'if' conditions. Run as standalone
# commands followed by a '$?' test they would, with 'set -e' active, end the
# script on failure before the warning and the diagnostics are printed.
#
echo "$(date -u) - testing whether /dev/shm is mounted correctly..."
# /dev/shm has to show up in the mount table as a tmpfs
if ! mount | grep -Eq "^tmpfs on /dev/shm" ; then
	echo "Warning: /dev/shm is not mounted correctly on $HOSTNAME, it should be a tmpfs, please tell the jenkins admins to fix this."
	show_fstab_and_mounts
fi
# and its mode (symlinks followed) has to be 1777
if [ "$(stat -c %a -L /dev/shm)" != 1777 ] ; then
	echo "Warning: /dev/shm is not mounted correctly on $HOSTNAME, it should be mounted with 1777 permissions, please tell the jenkins admins to fix this."
	show_fstab_and_mounts
fi
#
# /run/shm has to be a symbolic link whose target is /dev/shm
#
echo "$(date -u) - testing whether /run/shm is a link..."
if [ -L /run/shm ] ; then
	# it is a link, now make sure it leads to the right place
	if [ "$(readlink /run/shm)" != "/dev/shm" ] ; then
		echo "Warning: /run/shm is a link, but not pointing to /dev/shm on $HOSTNAME, please tell the jenkins admins to fix this."
		show_fstab_and_mounts
	fi
else
	echo "Warning: /run/shm is not a link on $HOSTNAME, please tell the jenkins admins to fix this."
	show_fstab_and_mounts
fi

#
# check for hanging mounts
#
# 'timeout' sends SIGKILL (-s 9) to 'mount' once 15 seconds have passed.
# The exit status is collected via '|| TIMEOUT=$?': with 'set -e' active a
# bare failing command would end the script before the message below is
# printed.
#
echo "$(date -u) - testing whether running 'mount' takes forever..."
TIMEOUT=0
timeout -s 9 15 mount > /dev/null || TIMEOUT=$?
if [ "$TIMEOUT" -ne 0 ] ; then
	echo "$(date -u) - running 'mount' takes forever, giving up."
	exit 1
fi

#
# check for correct MTU
#
# The sed expression ignores every 'ip link' line mentioning LOOPBACK or
# NOARP and prints the number following " mtu " on the remaining lines;
# 'sort -u' folds duplicates, so the result is exactly "1500" only if all of
# those interfaces agree on that value.
#
echo "$(date -u) - testing whether the network interfaces MTU is 1500..."
case "$(ip link | sed -n '/LOOPBACK\|NOARP/!s/.* mtu \([0-9]*\) .*/\1/p' | sort -u)" in
	1500)
		# as expected, nothing to report
		;;
	*)
		ip link
		echo "$(date -u) - network interfaces MTU != 1500 - this is wrong.  => please \`sudo ifconfig eth0 mtu 1500\`"
		# should probably turn this into a warning if this becomes too annoying
		irc_message debian-reproducible "$HOSTNAME has wrong MTU, please tell the jenkins admins to fix this.  (sudo ifconfig eth0 mtu 1500)"
		exit 1
		;;
esac

#
# check for correct future
#
# (yes this is hardcoded but meh…)
# Only the year of the local clock is looked at: 2019 is reported as a
# warning and marks the run dirty, 2018 is accepted, anything else just gets
# a neutral remark.
echo "$(date -u) - testing whether the time is right..."
case "$(date +%Y)" in
	2019)
		echo "Warning, today is the wrong future: $(date -u)."
		DIRTY=true
		;;
	2018)
		echo "Good, today is the right future: $(date -u)."
		;;
	*)
		echo "Cherrish today, $(date -u)."
		;;
esac

#
# check for cleaned up kernels
# (Ubuntu only, since /boot runs out of free space there frequently)
#
case "$(lsb_release -si)" in
	Ubuntu)
		echo "$(date -u) - testing whether only one kernel is installed..."
		# anything but exactly one /boot/vmlinuz-* entry is reported
		if ! [ "$(ls /boot/vmlinuz-*|wc -l)" = "1" ] ; then
			echo "Warning, more than one kernel in /boot:"
			ls -lart /boot/vmlinuz-*
			df -h /boot
			DIRTY=true
		fi
		;;
esac

#
# check for haveged running
#
echo "$(date -u) - testing 'haveged' is running..."
# The second grep removes the first grep's own entry from the process list;
# '|| true' keeps an empty result from being treated as a command failure.
HAVEGED="$(ps fax \
	| grep '/usr/sbin/haveged' \
	| grep -v grep \
	|| true)"
if ! [ -n "$HAVEGED" ] ; then
	echo "$(date -u) - haveged ain't running, giving up."
	systemctl status haveged
	exit 1
fi

#
# checks only for the main node
#
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
	#
	# sometimes deleted jobs come back as zombies
	# and we dont know why and when that happens,
	# so just report those zombies here.
	#
	# ('|| true': grep matching nothing is the good case and must not be
	# treated as a command failure)
	ZOMBIES=$(ls -1d /var/lib/jenkins/jobs/* | grep -E 'reproducible_(builder_(amd64|i386|armhf|arm64)|setup_(pbuilder|schroot)_testing)|chroot-installation_wheezy|ff64a|jtk1a' || true)
	if [ -n "$ZOMBIES" ] ; then
		echo "Warning, rise of the jenkins job zombies has started again, these jobs should not exist:"
		for z in $ZOMBIES ; do
			basename "$z"
		done
		DIRTY=true
		echo
	fi
	#
	# /var/log/jenkins/jenkins.log sometimes grows very fast
	# and we don't yet know why, so let's monitor this for now.
	#
	# find prints the path only if the file exceeds the given size, so a
	# NON-empty result means "too large". (The variable has to be expanded
	# here: testing the bare word JENKINSLOG can never be empty.)
	#
	JENKINSLOG="$(find /var/log/jenkins -name jenkins.log -size +42G)"
	if [ -n "$JENKINSLOG" ] ; then
		echo "Warning, jenkins.log is larger than 42G, please fix, erroring out now."
		exit 1
	else
		JENKINSLOG="$(find /var/log/jenkins -name jenkins.log -size +23G)"
		if [ -n "$JENKINSLOG" ] ; then
			echo "Warning, jenkins.log is larger than 23G, please do something…"
			DIRTY=true
		fi
	fi
fi


#
# finally
#
# DIRTY holds the word "true" or "false" and is run as a command here, so
# the all-clear message is printed only when it is "false".
$DIRTY || {
	echo "$(date -u ) - Everything seems to be fine."
	echo
}

echo "$(date -u) - the end."