summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerie R Young <spectranaut@riseup.net>2016-08-23 20:32:48 -0400
committerHolger Levsen <holger@layer-acht.org>2016-12-19 12:05:17 +0100
commitfc17c87a536bb52eaa3bf49dbcaf6824fce7b0f4 (patch)
treea8d9b3983853edb5303db3e3975fafa7b7b0e02f
parenta55f5a0c1b2843bd3ff96f6b8a532c224b71d42d (diff)
downloadjenkins.debian.net-fc17c87a536bb52eaa3bf49dbcaf6824fce7b0f4.tar.xz
temporary: script to migrate from sqlite to postgres db
Signed-off-by: Holger Levsen <holger@layer-acht.org>
-rwxr-xr-xbin/reproducible_migrate_db.py154
1 files changed, 154 insertions, 0 deletions
diff --git a/bin/reproducible_migrate_db.py b/bin/reproducible_migrate_db.py
new file mode 100755
index 00000000..575b04ba
--- /dev/null
+++ b/bin/reproducible_migrate_db.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2016 Valerie Young <spectranaut@riseup.net>
+# Licensed under GPL-2
+#
+# Depends: sqlalchemy, reproducible_common
+
+from sqlalchemy import *
+from sqlalchemy.schema import *
+import sqlalchemy.util
+from pprint import pprint
+import subprocess
+import sys
+import os
+import csv
+
+# We skip the database connection because the database
+# may not exist yet, but we would like to use the constants
+# available in reproducible_common.py
+sys.argv.append('--skip-database-connection')
+from reproducible_common import *
+
+# Database defined in reproducible_common.py
+DEST_DB_NAME = PGDATABASE
+DEST_DB_USER = os.environ['USER']
+
+# Old SQLite database
+SOURCE_DB_NAME = '/var/lib/jenkins/reproducible.db'
+
+try:
+ # check is the user exists, if not created it
+ log.info("Checking if postgres role exists...")
+ query = "SELECT 1 FROM pg_roles WHERE rolname='%s';" % DEST_DB_USER
+ command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+ result = subprocess.check_output(command).decode("utf-8").strip()
+
+ if result == '1':
+ log.info("user exists.")
+ else:
+ log.info("Postgres role %s does not exist. Creating role."
+ % DEST_DB_USER)
+ check_call(['sudo', '-u', 'postgres', 'createuser', '-w', DEST_DB_USER])
+
+ # check is the database exists, if not created it
+ log.info("Checking if postgres database exists...")
+ query = "SELECT 1 FROM pg_database WHERE datname='%s'" % DEST_DB_NAME
+ command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+ result = subprocess.check_output(command).decode("utf-8").strip()
+
+ if result == '1':
+ log.info("database exists.")
+ else:
+ log.info("Postgres database %s does not exist. Creating database."
+ % DEST_DB_NAME)
+ check_call(['sudo', '-u', 'postgres', 'createdb', '-O', DEST_DB_USER,
+ '-w', DEST_DB_NAME])
+
+except FileNotFoundError:
+ print_critical_message("Postgres is not installed. Install postgres before continuing.")
+ sys.exit(1)
+
+# Run reproducible_db_maintenance. This will create the appropriate schema.
+db_maintenance = os.path.join(BIN_PATH, "reproducible_db_maintenance.py")
+check_call([db_maintenance])
+
+# Connect to both databases
+dest_engine = create_engine("postgresql:///%s" % DEST_DB_NAME)
+dest_conn = dest_engine.connect()
+source_engine = create_engine("sqlite:///%s" % SOURCE_DB_NAME)
+source_conn = source_engine.connect()
+
+# Load all table definitions for both databases. They should be identical
+# (both up to date according to reproducible_db_maintenance.py)
+source_metadata = MetaData(source_engine)
+source_metadata.reflect()
+dest_metadata = MetaData(dest_engine)
+dest_metadata.reflect()
+
+# The order in which we will copy the table
+all_tables = ['sources', 'issues', 'notes', 'removed_packages', 'results',
+ 'stats_bugs', 'stats_build', 'stats_builds_age', 'stats_notes',
+ 'stats_issues', 'stats_meta_pkg_state', 'stats_builds_per_day',
+ 'stats_pkg_state']
+
+# Get all table definitions in source and destination. If the table doesn't
+# exist in one of these two places, an error will occur.
+dest_tables = {t: Table(t, dest_metadata, autoload=True) for t in all_tables}
+source_tables = {t: Table(t, source_metadata, autoload=True) for t in all_tables}
+
+for table in all_tables:
+ log.info("Copying table: " + table)
+ dest_table = dest_tables[table]
+ dest_columns = dest_table.columns.keys()
+ source_table = source_tables[table]
+ source_columns = source_table.columns.keys()
+
+ if table in ['notes', 'results']:
+ sources = Table('sources', source_metadata, autoload=True)
+ # only select rows with correct foreign references to the SOURCES table
+ query = sql.select(source_table.c).select_from(
+ source_table.join(sources))
+
+ # save rows with incorrect foreign references to the SOURCES table
+ ignored_query = select(source_table.c).select_from(source_table).where(
+ source_table.c.package_id.notin_(select([sources.c.id])))
+ ignored_results = source_conn.execute(ignored_query).fetchall()
+ if len(ignored_results):
+ log.info('Ignoring bad foreign keys in %s. Dumping rows to '
+ 'ignored_rows_%s.csv' % (table, table))
+ with open('ignored_rows_' + table + '.csv', 'w') as f:
+ writer = csv.DictWriter(f, fieldnames=ignored_results[0].keys())
+ writer.writeheader()
+ for row in ignored_results:
+ writer.writerow(dict(row))
+ else:
+ query = "select * from %s" % table
+
+ # Perform each table copy in a single transaction
+ transaction = dest_conn.begin()
+ try:
+ for record in source_conn.execute(query):
+ data = {}
+ for c in source_columns:
+ col = c.lower()
+ value = getattr(record, c)
+ if str(dest_table.c[col].type) == 'INTEGER' and value == "":
+ # there exist empty string values in the sqlite database
+ # for integers. This will result in an error if we try to
+ # write these values to the postgres db.
+ log.info("column %s has empty string/null value" % col)
+ else:
+ data[col] = value
+
+ try:
+ dest_conn.execute(dest_table.insert(), [data])
+ except:
+ log.critical("Could not insert: %s" % data)
+ raise
+
+ # Commit the whole table at once
+ transaction.commit()
+ except:
+ transaction.rollback()
+ log.error("Transaction rolled back")
+ raise
+
+# For the autoincrementing columns to work correctly, we must set the next
+# value of the sequence to be the next highest available ID.
+table_sequences = {t : t + "_id_seq" for t in ['sources', 'results']}
+
+for table in table_sequences:
+ query = "select setval('%s', (select max(id)+1 from %s))"
+ dest_conn.execute(query % (table_sequences[table], table))