From 5307bd19abf0f36a66fa48a51dea88a27e40d100 Mon Sep 17 00:00:00 2001 From: Mattia Rizzolo Date: Thu, 26 Feb 2015 03:02:27 +0100 Subject: reproducible: scheduler: rewrite in python. + use the new database schema supporting multi-release + add myself to the notified people in case of failure --- bin/reproducible_scheduler.py | 286 ++++++++++++++++++++++++++++++++++++++++++ bin/reproducible_scheduler.sh | 217 -------------------------------- job-cfg/reproducible.yaml | 4 +- 3 files changed, 288 insertions(+), 219 deletions(-) create mode 100755 bin/reproducible_scheduler.py delete mode 100755 bin/reproducible_scheduler.sh diff --git a/bin/reproducible_scheduler.py b/bin/reproducible_scheduler.py new file mode 100755 index 00000000..c4071b76 --- /dev/null +++ b/bin/reproducible_scheduler.py @@ -0,0 +1,286 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Copyright © 2015 Mattia Rizzolo +# Based on reproducible_scheduler.sh © 2014-2015 Holger Levsen +# Licensed under GPL-2 +# +# Depends: python3 python3-debian +# +# Schedule packages to be build. + +import sys +import lzma +import deb822 +import aptsources.sourceslist +from time import sleep +from random import randint +from subprocess import call +from apt_pkg import version_compare +from urllib.request import urlopen + +from reproducible_common import * +from reproducible_html_indexes import build_page + + +def call_apt_update(): + # try three times, before failing the job + for i in [1, 2, 3]: + if not call(['sudo', 'apt-get', 'update']): + return + else: + log.warning('apt failed. retring another ' + 3-i + ' times') + sleep(randint(1, 70) + 30) + print_critical_message('`apt-get update` failed for three times in a row') + sys.exit(1) + + +def check_suite_avail(suite): + log.debug('Checking wheter the suite ' + suite + ' is listed in your ' + + 'sources.list file') + listall = aptsources.sourceslist.SourcesList() + splittedlist = [x.str() for x in listall] + for line in splittedlist: + if line[0][0] == '#': + continue + if 'deb-src' not in line: + continue + if suite in line: + log.debug('\tyes, it is') + return True + return False + + +def update_sources_tables(suite): + # download the sources file for this suite + mirror = 'http://ftp.de.debian.org/debian' + remotefile = mirror + '/dists/' + suite + '/main/source/Sources.xz' + log.info('Downloading sources file for ' + suite + ': ' + remotefile) + sources = lzma.decompress(urlopen(remotefile).read()).decode() + log.debug('\tdownloaded') + # extract relevant info (package name and version) from the sources file + new_pkgs = [] + for src in deb822.Sources.iter_paragraphs(sources): + pkg = (src['Package'], src['Version'], suite) + new_pkgs.append(pkg) + # get the current packages in the database + query = 'SELECT name, version, suite FROM sources ' + \ + 'WHERE suite="{}"'.format(suite) + cur_pkgs = query_db(query) + pkgs_to_add = [] + updated_pkgs = [] + different_pkgs = [x for x in new_pkgs if x not in cur_pkgs] + log.debug('Packages different in the archive and in the db: ' + + str(different_pkgs)) + for pkg in different_pkgs: + query = 'SELECT id, version FROM sources ' + \ + 'WHERE name="{name}" AND suite="{suite}"' + query = query.format(name=pkg[0], suite=pkg[2]) + try: + result = query_db(query)[0] + except IndexError: # new package + pkgs_to_add.append((pkg[0], pkg[1], pkg[2], 'amd64')) + pkg_id = result[0] + old_version = result[1] + if version_compare(pkg[1], old_version) > 0: + log.debug('New version: ' + str(pkg) + ' (we had ' + + old_version + ')') + updated_pkgs.append((pkg_id, pkg[0], pkg[1], pkg[2])) + # Now actually update the database: + cursor = conn_db.cursor() + # updated packages + log.debug('Pusing updated packages to the database...') + cursor.executemany('REPLACE INTO sources ' + + '(id, name, version, suite, architecture) ' + + 'VALUES (?, ?, ?, ?, "{arch}")'.format(arch='amd64'), + updated_pkgs) + conn_db.commit() + # new packages + log.info('Now inserting the new sources in the database: ' + + str(pkgs_to_add)) + cursor.executemany('INSERT INTO sources ' + + '(name, version, suite, architecture) ' + + 'VALUES (?, ?, ?, ?)', pkgs_to_add) + conn_db.commit() + # RM'ed packages + cur_pkgs_name = [x[0] for x in cur_pkgs] + new_pkgs_name = [x[0] for x in new_pkgs] + rmed_pkgs = [x for x in cur_pkgs_name if x not in new_pkgs_name] + log.info('Now deleting removed packages: ' + str(rmed_pkgs)) + rmed_pkgs_id = [] + for pkg in rmed_pkgs: + result = query_db(('SELECT id FROM sources ' + + 'WHERE name="{name}" ' + + 'AND suite="{suite}"').format(name=pkg, suite=suite)) + rmed_pkgs_id.extend(result) + log.debug('removed packages ID: ' + str([str(x[0]) for x in rmed_pkgs_id])) + cursor.executemany('DELETE FROM sources ' + + 'WHERE id=?', rmed_pkgs_id) + cursor.executemany('DELETE FROM results ' + + 'WHERE package_id=?', rmed_pkgs_id) + cursor.executemany('DELETE FROM schedule ' + + 'WHERE package_id=?', rmed_pkgs_id) + conn_db.commit() + # finally check whether the db has the correct number of packages + pkgs_end = query_db('SELECT count(*) FROM sources WHERE suite="%s"' % suite) + count_new_pkgs = len(set([x[0] for x in new_pkgs])) + if int(pkgs_end[0][0]) != count_new_pkgs: + print_critical_message('AH! The number of source in the Sources file' + + ' is different than the one in the DB!') + log.critical('source in the debian archive for the ' + suite + + ' suite:' + str(count_new_pkgs)) + log.critical('source in the reproducible db for the ' + suite + + ' suite:' + str(pkgs_end[0][0])) + sys.exit(1) + + +def print_schedule_result(suite, criteria, packages): + ''' + `packages` is the usual list-of-tuples returned by SQL queries, + where the first item is the id and the second one the package name + ''' + log.info('Criteria: ' + criteria) + log.info('Suite: ' + suite) + log.info('Amount: ' + str(len(packages))) + log.info('Packages: ' + ' '.join([x[1] for x in packages])) + log.info('==============================================================') + + +def schedule_packages(packages): + date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + pkgs = [(x[0], date) for x in packages] + log.debug('IDs about to be scheduled: ' + str([x[0] for x in packages])) + query = 'INSERT INTO schedule ' + \ + '(package_id, date_scheduled, date_build_started) ' + \ + 'VALUES (?, ?, "")' + cursor = conn_db.cursor() + cursor.executemany(query, pkgs) + conn_db.commit() + log.info('==============================================================') + log.info('The following ' + str(len(pkgs)) + ' source packages have ' + + 'been scheduled: ' + ' '.join([str(x[1]) for x in packages])) + log.info('==============================================================') + + +def scheduler_unknown_packages(suite, limit): + criteria = 'not tested before, randomly sorted' + query = """SELECT DISTINCT sources.id, sources.name FROM sources + WHERE sources.suite='{suite}' + AND sources.id NOT IN + (SELECT schedule.package_id FROM schedule) + AND sources.id NOT IN + (SELECT results.package_id FROM results) + ORDER BY random() + LIMIT {limit}""".format(suite=suite, limit=limit) + packages = query_db(query) + print_schedule_result(suite, criteria, packages) + return packages + + +def scheduler_new_versions(suite, limit): + criteria = 'tested before, new version available, sorted by last build date' + query = """SELECT DISTINCT s.id, s.name + FROM sources AS s JOIN results AS r ON s.id = r.package_id + WHERE s.suite='{suite}' + AND s.version != r.version + AND r.status != 'blacklisted' + AND s.id IN (SELECT package_id FROM results) + AND s.id NOT IN (SELECT schedule.package_id FROM schedule) + ORDER BY r.build_date + LIMIT {limit}""".format(suite=suite, limit=limit) + packages = query_db(query) + print_schedule_result(suite, criteria, packages) + return packages + + +def scheduler_old_versions(suite, limit): + criteria = 'tested at least two weeks ago, no new version available, ' + \ + 'sorted by last build date' + query = """SELECT DISTINCT s.id, s.name + FROM sources AS s JOIN results AS r ON s.id = r.package_id + WHERE s.suite='{suite}' + AND r.version = s.version + AND r.status != 'blacklisted' + AND r.build_date < datetime('now', '-14 day') + AND s.id NOT IN (SELECT schedule.package_id FROM schedule) + ORDER BY r.build_date + LIMIT {limit}""".format(suite=suite, limit=limit) + packages = query_db(query) + print_schedule_result(suite, criteria, packages) + return packages + + +def scheduler(suite): + total = int(query_db('SELECT count(*) FROM schedule')[0][0]) + log.debug('current scheduled packages: ' + str(total)) + if total > 250: + build_page('scheduled') # from reproducible_html_indexes + log.info(str(total) + ' packages already scheduled, nothing to do.') + return + else: + log.info(str(total) + ' packages already scheduled, scheduling some more...') + # unknown packages + log.info('Requesting 200 unknown packages...') + unknown = scheduler_unknown_packages(suite, 200) + total += len(unknown) + log.info('So, in total now ' + str(total) + ' packages about to be ' + + 'scheduled for ' + suite + '.') + + # packages with new versions + if total <= 250: + many_new = 50 + elif total <= 450: + many_new = 25 + else: + many_new = 0 + log.info('Requesting ' + str(many_new) + ' new versions...') + new = scheduler_new_versions(suite, many_new) + total += len(new) + log.info('So, in total now ' + str(total) + ' packages about to be ' + + 'scheduled for ' + suite + '.') + + # old packages + if total <= 250: + many_old = 200 + elif total <= 350: + many_old = 250 + else: + many_old = 1 + log.info('Requesting ' + str(many_old) + ' old packages...') + old = scheduler_old_versions(suite, many_old) + total += len(old) + log.info('So, in total now ' + str(total) + ' packages about to be ' + + 'scheduled for ' + suite + '.') + + # build the final message text + message = 'Scheduled ' + str(len(unknown)) + ' unknown package, ' + \ + str(len(new)) + ' packages with new versions and ' + \ + str(len(old)) + ' with the same version (total: ' + \ + str(total) + ')' + kgb = ['kgb-client', '--conf', '/srv/jenkins/kgb/debian-reproducible.conf', + '--relay-msg', '"'] + kgb.extend(message.split()) + kgb.append('"') + + # finally + all_scheduled_pkgs = [] + all_scheduled_pkgs.extend(unknown) + all_scheduled_pkgs.extend(new) + all_scheduled_pkgs.extend(old) + schedule_packages(all_scheduled_pkgs) + build_page('scheduled') # from reproducible_html_indexes + log.info('\n\n\n') + log.info(message) + call(kgb) + + +if __name__ == '__main__': + call_apt_update() + for suite in SUITES: +# for now we need entries for whatever suite we want to test in sources.list + if not check_suite_avail(suite): + print_critical_message('Please add a deb-src entry for ' + suite + + ' in your sources.list file') + raise ValueError + update_sources_tables(suite) + scheduler(suite) diff --git a/bin/reproducible_scheduler.sh b/bin/reproducible_scheduler.sh deleted file mode 100755 index b69c6339..00000000 --- a/bin/reproducible_scheduler.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# Copyright 2014-2015 Holger Levsen -# released under the GPLv=2 - -DEBUG=false -. /srv/jenkins/bin/common-functions.sh -common_init "$@" - -# common code defining db access -. /srv/jenkins/bin/reproducible_common.sh - -# -# functions, see below for main -# -update_apt() { - # this needs sid entries in sources.list: - grep deb-src /etc/apt/sources.list | grep sid - # try apt-get update three times, else fail - sudo apt-get update || ( sleep $(( $RANDOM % 70 + 30 )) ; sudo apt-get update ) || ( sleep $(( $RANDOM % 70 + 30 )) ; sudo apt-get update || exit 1 ) -} - -cleanup_lock() { - rm -f ${PACKAGES_DB}.lock -} - -# update sources table in db -update_sources_table() { - trap cleanup_lock INT TERM EXIT - touch ${PACKAGES_DB}.lock - TMPFILE=$(mktemp) - curl $MIRROR/dists/sid/main/source/Sources.xz > $TMPFILE - CSVFILE=$(mktemp) - (xzcat $TMPFILE | egrep "(^Package:|^Version:)" | sed -s "s#^Version: ##g; s#Package: ##g; s#\n# #g"| while read PKG ; do read VERSION ; echo "$PKG,$VERSION" ; done) > $CSVFILE - sqlite3 -csv -init $INIT ${PACKAGES_DB} "DELETE from sources" - echo ".import $CSVFILE sources" | sqlite3 -csv -init $INIT ${PACKAGES_DB} - # count unique packages for later comparison - P_IN_TMPFILE=$(xzcat $TMPFILE | grep "^Package:" | cut -d " " -f2 | sort -u | wc -l) - # cleanup files already - rm $CSVFILE $TMPFILE - # cleanup db - echo "=============================================================================" - echo "$(date) Removing duplicate versions from sources db..." - for PKG in $(sqlite3 ${PACKAGES_DB} 'SELECT name FROM sources GROUP BY name HAVING count(name) > 1') ; do - BET="" - for VERSION in $(sqlite3 ${PACKAGES_DB} "SELECT version FROM sources where name = \"$PKG\"") ; do - if [ "$BET" = "" ] ; then - BET=$VERSION - continue - elif dpkg --compare-versions "$BET" lt "$VERSION" ; then - BET=$VERSION - fi - done - sqlite3 -init $INIT ${PACKAGES_DB} "DELETE FROM sources WHERE name = '$PKG' AND version != '$BET'" - done - echo "$(date) Done removing duplicate versions from sources db..." - echo "=============================================================================" - cleanup_lock - trap - INT TERM EXIT - # verify duplicate entries have been removed correctly from the db - P_IN_SOURCES=$(sqlite3 ${PACKAGES_DB} 'SELECT count(name) FROM sources') - if [ $P_IN_TMPFILE -ne $P_IN_SOURCES ] ; then - echo "DEBUG: P_IN_SOURCES = $P_IN_SOURCES" - echo "DEBUG: P_IN_TMPFILE = $P_IN_TMPFILE" - RESULT=1 - else - RESULT=0 - fi -} - -do_sql_query() { - PACKAGES=$(sqlite3 -init $INIT ${PACKAGES_DB} "$QUERY") - if [ ! -z "$PACKAGES" ] ; then - AMOUNT=$(echo "$PACKAGES" | wc -l) - PACKAGES="$(echo $PACKAGES)" - else - AMOUNT=0 - fi - echo "Criteria: $1" - echo "Amount: $AMOUNT" - echo "Packages: $PACKAGES" - echo "=============================================================================" -} - -select_unknown_packages() { - QUERY=" - SELECT DISTINCT sources.name FROM sources - WHERE sources.name NOT IN - (SELECT sources.name FROM sources,sources_scheduled - WHERE sources.name=sources_scheduled.name) - AND sources.name NOT IN - (SELECT sources.name FROM sources,source_packages - WHERE sources.name=source_packages.name) - ORDER BY random() - LIMIT $1" - do_sql_query "not tested before, randomly sorted" -} - -select_new_versions() { - QUERY=" - SELECT DISTINCT sources.name FROM sources,source_packages - WHERE sources.name NOT IN - (SELECT sources.name FROM sources,sources_scheduled - WHERE sources.name=sources_scheduled.name) - AND sources.name IN - (SELECT sources.name FROM sources,source_packages - WHERE sources.name=source_packages.name - AND sources.version!=source_packages.version - AND source_packages.status!='blacklisted') - AND sources.name=source_packages.name - ORDER BY source_packages.build_date - LIMIT $1" - do_sql_query "tested before, new version available, sorted by last test date" -} - -select_old_versions() { - # old versions older than two weeks only - QUERY=" - SELECT DISTINCT sources.name FROM sources,source_packages - WHERE sources.name NOT IN - (SELECT sources.name FROM sources,sources_scheduled - WHERE sources.name=sources_scheduled.name) - AND sources.name IN - (SELECT sources.name FROM sources,source_packages - WHERE sources.name=source_packages.name - AND sources.version=source_packages.version - AND source_packages.status!='blacklisted') - AND sources.name=source_packages.name - AND source_packages.build_date < datetime('now', '-14 day') - ORDER BY source_packages.build_date - LIMIT $1" - do_sql_query "tested at least two weeks ago, no new version available, sorted by last test date" -} - -schedule_packages() { - DATE=$(date +'%Y-%m-%d %H:%M') - TMPFILE=$(mktemp) - for PKG in $ALL_PACKAGES ; do - echo "INSERT INTO sources_scheduled VALUES ('$PKG','$DATE','');" >> $TMPFILE - done - cat $TMPFILE | sqlite3 -init $INIT ${PACKAGES_DB} - rm $TMPFILE - echo "=============================================================================" - echo "The following $TOTAL source packages have been scheduled: $ALL_PACKAGES" - echo "=============================================================================" - echo -} - -# -# main -# -update_apt -COUNT_SCHEDULED=$(sqlite3 ${PACKAGES_DB} 'SELECT count(name) FROM sources_scheduled') -if [ $COUNT_SCHEDULED -gt 250 ] ; then - /srv/jenkins/bin/reproducible_html_indexes.py - echo "$COUNT_SCHEDULED packages scheduled, nothing to do." - exit 0 -else - echo "$COUNT_SCHEDULED packages currently scheduled, scheduling some more..." -fi - -RESULT=0 -for i in 1 2 3 4 5 ; do - # try fives times, before failing the job - update_sources_table - if [ $RESULT -eq 0 ] ; then - break - fi - sleep 2m -done -if [ $RESULT -ne 0 ] ; then - echo "failure to update sources table" - exit 1 -fi - -echo "Requesting 200 unknown packages..." -select_unknown_packages 200 -let "TOTAL=$COUNT_SCHEDULED+$AMOUNT" -echo "So in total now $TOTAL packages about to be scheduled." -ALL_PACKAGES="$PACKAGES" -MESSAGE="Scheduled $AMOUNT unknown packages" - -if [ $TOTAL -le 250 ] ; then - NEW=50 -elif [ $TOTAL -le 450 ] ; then - NEW=25 -fi -echo "Requesting $NEW new versions..." -select_new_versions $NEW -let "TOTAL=$TOTAL+$AMOUNT" -echo "So in total now $TOTAL packages about to be scheduled." -ALL_PACKAGES="$ALL_PACKAGES $PACKAGES" -MESSAGE="$MESSAGE, $AMOUNT packages with new versions" - -if [ $TOTAL -lt 250 ] ; then - OLD=200 -elif [ $TOTAL -le 350 ] ; then - OLD=100 -else - OLD=1 -fi -echo "Requesting $OLD old packages..." -select_old_versions $OLD -echo -n "Found $AMOUNT old packages, " -let "TOTAL=$TOTAL+$AMOUNT" -ALL_PACKAGES="$ALL_PACKAGES $PACKAGES" - -echo "So in total now $TOTAL packages about to be scheduled." -MESSAGE="$MESSAGE and $AMOUNT packages with the same version (total: $TOTAL)" - -# finally -schedule_packages -/srv/jenkins/bin/reproducible_html_indexes.py -echo -echo "$MESSAGE" -kgb-client --conf /srv/jenkins/kgb/debian-reproducible.conf --relay-msg "$MESSAGE" -echo diff --git a/job-cfg/reproducible.yaml b/job-cfg/reproducible.yaml index bcdf71f8..38965606 100644 --- a/job-cfg/reproducible.yaml +++ b/job-cfg/reproducible.yaml @@ -207,8 +207,8 @@ - '{name}_scheduler': my_description: 'Schedule packages to be tested for reproducibility.' my_timed: '42 * * * *' - my_shell: '/srv/jenkins/bin/reproducible_scheduler.sh' - my_recipients: 'holger@layer-acht.org' + my_shell: '/srv/jenkins/bin/reproducible_scheduler.py' + my_recipients: 'holger@layer-acht.org mattia@mapreri.org' - '{name}_html_graphs': my_description: 'Generate HTML results (stats with graphs) for reproducible builds.' my_timed: '0 * * * *' -- cgit v1.2.3-70-g09d2