path: root/bin/reproducible_migrate_db.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Copyright © 2016 Valerie Young <spectranaut@riseup.net>
# Licensed under GPL-2
#
# Depends: sqlalchemy, reproducible_common

from sqlalchemy import *
from sqlalchemy.schema import *
import sqlalchemy.util
import subprocess
import sys
import os
import csv

# We skip the database connection because the database
# may not exist yet, but we would like to use the constants
# available in reproducible_common.py
sys.argv.append('--skip-database-connection')
from reproducible_common import *
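# (log, PGDATABASE, BIN_PATH and print_critical_message used below all
# come in through this wildcard import.)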

# Database defined in reproducible_common.py
DEST_DB_NAME = PGDATABASE
DEST_DB_USER = os.environ['USER']

# Old SQLite database
SOURCE_DB_NAME = '/var/lib/jenkins/reproducible.db'

try:
    # check if the user exists; if not, create it
    log.info("Checking if postgres role exists...")
    query = "SELECT 1 FROM pg_roles WHERE rolname='%s';" % DEST_DB_USER
    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
    result = subprocess.check_output(command).decode("utf-8").strip()

    if result == '1':
        log.info("user exists.")
    else:
        log.info("Postgres role %s does not exist. Creating role."
                 % DEST_DB_USER)
        subprocess.check_call(['sudo', '-u', 'postgres', 'createuser',
                               '-w', DEST_DB_USER])

    # check if the database exists; if not, create it
    log.info("Checking if postgres database exists...")
    query = "SELECT 1 FROM pg_database WHERE datname='%s'" % DEST_DB_NAME
    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
    result = subprocess.check_output(command).decode("utf-8").strip()

    if result == '1':
        log.info("database exists.")
    else:
        log.info("Postgres database %s does not exist. Creating database."
                 % DEST_DB_NAME)
        subprocess.check_call(['sudo', '-u', 'postgres', 'createdb', '-O',
                               DEST_DB_USER, '-w', DEST_DB_NAME])

except FileNotFoundError:
    print_critical_message("Postgres is not installed. Install postgres before continuing.")
    sys.exit(1)

# Run reproducible_db_maintenance. This will create the appropriate schema.
db_maintenance = os.path.join(BIN_PATH, "reproducible_db_maintenance.py")
subprocess.check_call([db_maintenance])

# Connect to both databases
dest_engine = create_engine("postgresql:///%s" % DEST_DB_NAME)
dest_conn = dest_engine.connect()
source_engine = create_engine("sqlite:///%s" % SOURCE_DB_NAME)
source_conn = source_engine.connect()
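# A bare "postgresql:///dbname" URL connects through the local Unix
# socket as the current OS user, which is why the postgres role created
# above has to match $USER.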

# Load all table definitions for both databases. They should be identical
# (both up to date according to reproducible_db_maintenance.py)
source_metadata = MetaData(source_engine)
source_metadata.reflect()
dest_metadata = MetaData(dest_engine)
dest_metadata.reflect()

# The order in which we will copy the tables
all_tables = ['sources', 'issues', 'notes', 'removed_packages', 'results',
              'stats_bugs', 'stats_build', 'stats_builds_age', 'stats_notes',
              'stats_issues', 'stats_meta_pkg_state', 'stats_builds_per_day',
              'stats_pkg_state']
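# sources has to come first: the notes and results rows copied below
# reference it via foreign keys (see the filtering further down), so the
# rows they point to must already be present in the destination.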

# Get all table definitions in source and destination. If the table doesn't
# exist in one of these two places, an error will occur.
dest_tables = {t: Table(t, dest_metadata, autoload=True) for t in all_tables}
source_tables = {t: Table(t, source_metadata, autoload=True) for t in all_tables}

for table in all_tables:
    log.info("Copying table: "  + table)
    dest_table = dest_tables[table]
    dest_columns = dest_table.columns.keys()
    source_table = source_tables[table]
    source_columns = source_table.columns.keys()

    if table in ['notes', 'results']:
        sources = Table('sources', source_metadata, autoload=True)
        # only select rows with correct foreign references to the SOURCES table
        query = select(source_table.c).select_from(
            source_table.join(sources))

        # save rows with incorrect foreign references to the SOURCES table
        ignored_query = select(source_table.c).select_from(source_table).where(
            source_table.c.package_id.notin_(select([sources.c.id])))
        ignored_results = source_conn.execute(ignored_query).fetchall()
        if ignored_results:
            log.info('Ignoring bad foreign keys in %s. Dumping rows to '
                     'ignored_rows_%s.csv' % (table, table))
            with open('ignored_rows_' + table + '.csv', 'w') as f:
                writer = csv.DictWriter(f, fieldnames=ignored_results[0].keys())
                writer.writeheader()
                for row in ignored_results:
                    writer.writerow(dict(row))
    else:
        query = "select * from %s" % table

    # Perform each table copy in a single transaction
    transaction = dest_conn.begin()
    try:
        for record in source_conn.execute(query):
            data = {}
            for c in source_columns:
                col = c.lower()
                value = getattr(record, c)
                if str(dest_table.c[col].type) == 'INTEGER' and value == "":
                    # The sqlite database contains empty strings in some
                    # integer columns. Writing them to postgres would fail,
                    # so omit the column from the insert instead.
                    log.info("column %s has an empty string as an integer "
                             "value, skipping it" % col)
                else:
                    data[col] = value

            try:
                dest_conn.execute(dest_table.insert(), [data])
            except Exception:
                log.critical("Could not insert: %s" % data)
                raise

        # Commit the whole table at once
        transaction.commit()
    except Exception:
        transaction.rollback()
        log.error("Transaction rolled back")
        raise

# For the autoincrementing columns to work correctly, we must set the next
# value of the sequence to be the next highest available ID.
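# Note that setval(seq, n) marks n as already used, so the following
# nextval() returns n+1; with max(id)+1 this skips one id, which is
# harmless.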
table_sequences = {t: t + "_id_seq" for t in ['sources', 'results']}

for table in table_sequences:
    query = "select setval('%s', (select max(id)+1 from %s))"
    dest_conn.execute(query % (table_sequences[table], table))
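
# Typical invocation (an assumption, not documented here): run as the
# jenkins user on the host that holds /var/lib/jenkins/reproducible.db:
#   ./reproducible_migrate_db.py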