-rwxr-xr-x | bin/reproducible_migrate_db.py | 154
1 file changed, 154 insertions, 0 deletions
diff --git a/bin/reproducible_migrate_db.py b/bin/reproducible_migrate_db.py
new file mode 100755
index 00000000..575b04ba
--- /dev/null
+++ b/bin/reproducible_migrate_db.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2016 Valerie Young <spectranaut@riseup.net>
+# Licensed under GPL-2
+#
+# Depends: sqlalchemy, reproducible_common
+
+from sqlalchemy import *
+from sqlalchemy.schema import *
+import sqlalchemy.util
+from pprint import pprint
+import subprocess
+import sys
+import os
+import csv
+
+# We skip the database connection because the database
+# may not exist yet, but we would like to use the constants
+# available in reproducible_common.py
+sys.argv.append('--skip-database-connection')
+from reproducible_common import *
+
+# Database defined in reproducible_common.py
+DEST_DB_NAME = PGDATABASE
+DEST_DB_USER = os.environ['USER']
+
+# Old SQLite database
+SOURCE_DB_NAME = '/var/lib/jenkins/reproducible.db'
+
+try:
+    # check if the postgres role exists; if not, create it
+    log.info("Checking if postgres role exists...")
+    query = "SELECT 1 FROM pg_roles WHERE rolname='%s';" % DEST_DB_USER
+    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+    result = subprocess.check_output(command).decode("utf-8").strip()
+
+    if result == '1':
+        log.info("user exists.")
+    else:
+        log.info("Postgres role %s does not exist. Creating role."
+                 % DEST_DB_USER)
+        subprocess.check_call(['sudo', '-u', 'postgres', 'createuser',
+                               '-w', DEST_DB_USER])
+
+    # check if the database exists; if not, create it
+    log.info("Checking if postgres database exists...")
+    query = "SELECT 1 FROM pg_database WHERE datname='%s'" % DEST_DB_NAME
+    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+    result = subprocess.check_output(command).decode("utf-8").strip()
+
+    if result == '1':
+        log.info("database exists.")
+    else:
+        log.info("Postgres database %s does not exist. Creating database."
+                 % DEST_DB_NAME)
+        subprocess.check_call(['sudo', '-u', 'postgres', 'createdb', '-O',
+                               DEST_DB_USER, '-w', DEST_DB_NAME])
+
+except FileNotFoundError:
+    print_critical_message("Postgres is not installed. "
+                           "Install postgres before continuing.")
+    sys.exit(1)
+
+# Run reproducible_db_maintenance.py. This will create the appropriate schema.
+db_maintenance = os.path.join(BIN_PATH, "reproducible_db_maintenance.py")
+subprocess.check_call([db_maintenance])
+
+# Connect to both databases
+dest_engine = create_engine("postgresql:///%s" % DEST_DB_NAME)
+dest_conn = dest_engine.connect()
+source_engine = create_engine("sqlite:///%s" % SOURCE_DB_NAME)
+source_conn = source_engine.connect()
+
+# Load all table definitions for both databases. They should be identical
+# (both up to date according to reproducible_db_maintenance.py).
+source_metadata = MetaData(source_engine)
+source_metadata.reflect()
+dest_metadata = MetaData(dest_engine)
+dest_metadata.reflect()
+
+# The order in which we will copy the tables
+all_tables = ['sources', 'issues', 'notes', 'removed_packages', 'results',
+              'stats_bugs', 'stats_build', 'stats_builds_age', 'stats_notes',
+              'stats_issues', 'stats_meta_pkg_state', 'stats_builds_per_day',
+              'stats_pkg_state']
+
+# Get all table definitions in source and destination. If a table doesn't
+# exist in one of these two places, an error will occur.
+dest_tables = {t: Table(t, dest_metadata, autoload=True) for t in all_tables}
+source_tables = {t: Table(t, source_metadata, autoload=True)
+                 for t in all_tables}
+
+for table in all_tables:
+    log.info("Copying table: " + table)
+    dest_table = dest_tables[table]
+    dest_columns = dest_table.columns.keys()
+    source_table = source_tables[table]
+    source_columns = source_table.columns.keys()
+
+    if table in ['notes', 'results']:
+        sources = Table('sources', source_metadata, autoload=True)
+        # only select rows with correct foreign references to the sources table
+        query = select(source_table.c).select_from(
+            source_table.join(sources))
+
+        # save rows with incorrect foreign references to the sources table
+        ignored_query = select(source_table.c).select_from(source_table).where(
+            source_table.c.package_id.notin_(select([sources.c.id])))
+        ignored_results = source_conn.execute(ignored_query).fetchall()
+        if len(ignored_results):
+            log.info('Ignoring bad foreign keys in %s. Dumping rows to '
+                     'ignored_rows_%s.csv' % (table, table))
+            with open('ignored_rows_' + table + '.csv', 'w') as f:
+                writer = csv.DictWriter(f, fieldnames=ignored_results[0].keys())
+                writer.writeheader()
+                for row in ignored_results:
+                    writer.writerow(dict(row))
+    else:
+        query = "select * from %s" % table
+
+    # Perform each table copy in a single transaction
+    transaction = dest_conn.begin()
+    try:
+        for record in source_conn.execute(query):
+            data = {}
+            for c in source_columns:
+                col = c.lower()
+                value = getattr(record, c)
+                if str(dest_table.c[col].type) == 'INTEGER' and value == "":
+                    # the sqlite database contains empty strings in some
+                    # integer columns; inserting them into postgres would
+                    # fail, so skip the column and let it default to NULL
+                    log.info("column %s has empty string/null value" % col)
+                else:
+                    data[col] = value
+
+            try:
+                dest_conn.execute(dest_table.insert(), [data])
+            except:
+                log.critical("Could not insert: %s" % data)
+                raise
+
+        # Commit the whole table at once
+        transaction.commit()
+    except:
+        transaction.rollback()
+        log.error("Transaction rolled back")
+        raise
+
+# For the autoincrementing columns to work correctly, we must set the next
+# value of the sequence to be the next highest available ID.
+table_sequences = {t: t + "_id_seq" for t in ['sources', 'results']}
+
+for table in table_sequences:
+    query = "select setval('%s', (select max(id)+1 from %s))"
+    dest_conn.execute(query % (table_sequences[table], table))
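
After running a migration like this, a quick sanity check is to compare per-table row counts between the SQLite source and the postgres destination. The sketch below is illustrative only and not part of the commit: it assumes the same database locations and table list as the script above (the postgres database name 'reproducible' is an assumption standing in for PGDATABASE), and it uses the same pre-2.0 SQLAlchemy string-execution style as the migration script. Counts for notes and results may legitimately differ by the number of rows dumped to the ignored_rows_*.csv files.

#!/usr/bin/python3
# Post-migration sanity check -- an illustrative sketch, not part of this
# commit. Compares per-table row counts between the old SQLite database and
# the new postgres database; 'notes' and 'results' may differ by exactly the
# number of rows skipped for bad foreign keys.
from sqlalchemy import create_engine

ALL_TABLES = ['sources', 'issues', 'notes', 'removed_packages', 'results',
              'stats_bugs', 'stats_build', 'stats_builds_age', 'stats_notes',
              'stats_issues', 'stats_meta_pkg_state', 'stats_builds_per_day',
              'stats_pkg_state']

# assumed locations, mirroring the migration script above
source_conn = create_engine('sqlite:////var/lib/jenkins/reproducible.db').connect()
dest_conn = create_engine('postgresql:///reproducible').connect()  # assumed name

for table in ALL_TABLES:
    src = source_conn.execute('SELECT count(*) FROM %s' % table).scalar()
    dst = dest_conn.execute('SELECT count(*) FROM %s' % table).scalar()
    marker = '' if src == dst else '  <-- differs'
    print('%-25s sqlite=%-8d postgres=%-8d%s' % (table, src, dst, marker))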