diff --git a/montage/labs.py b/montage/labs.py index 85c0597b..8daafe86 100644 --- a/montage/labs.py +++ b/montage/labs.py @@ -10,18 +10,25 @@ DB_CONFIG = os.path.expanduser('~/replica.my.cnf') -IMAGE_COLS = ['img_width', - 'img_height', - 'img_name', - 'img_major_mime', - 'img_minor_mime', - 'IFNULL(oi.actor_user, ci.actor_user) AS img_user', - 'IFNULL(oi.actor_name, ci.actor_name) AS img_user_text', - 'IFNULL(oi_timestamp, img_timestamp) AS img_timestamp', - 'img_timestamp AS rec_img_timestamp', - 'ci.actor_user AS rec_img_user', - 'ci.actor_name AS rec_img_text', - 'oi.oi_archive_name AS oi_archive_name'] +# Column mapping for the new file/filerevision schema on wikireplica. +# The old image/oldimage tables are being removed (deadline: 28 May 2026, +# see https://phabricator.wikimedia.org/T123582 and hatnote/montage#504). +# +# Mapping summary: +# image -> file (img_* cols -> file_* cols, img_actor -> file_actor) +# oldimage -> filerevision (oi_* cols -> fr_* cols, oi_actor -> fr_actor) +FILE_COLS = ['fi.file_width AS img_width', + 'fi.file_height AS img_height', + 'fi.file_name AS img_name', + 'fi.file_media_type AS img_major_mime', + 'fi.file_minor_mime AS img_minor_mime', + 'IFNULL(fr.actor_user, ci.actor_user) AS img_user', + 'IFNULL(fr.actor_name, ci.actor_name) AS img_user_text', + 'IFNULL(fr.fr_timestamp, fi.file_timestamp) AS img_timestamp', + 'fi.file_timestamp AS rec_img_timestamp', + 'ci.actor_user AS rec_img_user', + 'ci.actor_name AS rec_img_text', + 'fr.fr_archive_name AS oi_archive_name'] class MissingMySQLClient(RuntimeError): @@ -42,8 +49,7 @@ def fetchall_from_commonswiki(query, params): cursor.execute(query, params) res = cursor.fetchall() - # looking at the schema on labs, it's all varbinary, not varchar, - # so this block converts values + # schema on labs is varbinary, not varchar, so decode each value ret = [] for rec in res: new_rec = {} @@ -56,28 +62,29 @@ def fetchall_from_commonswiki(query, params): def get_files(category_name): + # Migrated from image/oldimage to file/filerevision (see issue #504) query = ''' SELECT {cols} - FROM commonswiki_p.image AS i - LEFT JOIN actor AS ci ON img_actor=ci.actor_id - LEFT JOIN (SELECT oi_name, - oi_actor, + FROM commonswiki_p.`file` AS fi + LEFT JOIN actor AS ci ON fi.file_actor=ci.actor_id + LEFT JOIN (SELECT fr_name, + fr_actor, actor_user, actor_name, - oi_timestamp, - oi_archive_name - FROM oldimage - LEFT JOIN actor ON oi_actor=actor.actor_id) AS oi ON img_name=oi.oi_name + fr_timestamp, + fr_archive_name + FROM filerevision + LEFT JOIN actor ON fr_actor=actor.actor_id) AS fr ON fi.file_name=fr.fr_name JOIN page ON page_namespace = 6 - AND page_title = img_name + AND page_title = fi.file_name JOIN categorylinks ON cl_from = page_id AND cl_type = 'file' JOIN linktarget ON cl_target_id = lt_id AND lt_namespace = 14 AND lt_title = %s - GROUP BY img_name - ORDER BY oi_timestamp ASC; - '''.format(cols=', '.join(IMAGE_COLS)) + GROUP BY fi.file_name + ORDER BY fr.fr_timestamp ASC; + '''.format(cols=', '.join(FILE_COLS)) params = (category_name.replace(' ', '_'),) results = fetchall_from_commonswiki(query, params) @@ -86,22 +93,23 @@ def get_files(category_name): def get_file_info(filename): + # Migrated from image/oldimage to file/filerevision (see issue #504) query = ''' SELECT {cols} - FROM commonswiki_p.image AS i - LEFT JOIN actor AS ci ON img_actor=ci.actor_id - LEFT JOIN (SELECT oi_name, - oi_actor, + FROM commonswiki_p.`file` AS fi + LEFT JOIN actor AS ci ON fi.file_actor=ci.actor_id + LEFT JOIN (SELECT fr_name, + fr_actor, actor_user, actor_name, - oi_timestamp, - oi_archive_name - FROM oldimage - LEFT JOIN actor ON oi_actor=actor.actor_id) AS oi ON img_name=oi.oi_name - WHERE img_name = %s - GROUP BY img_name - ORDER BY oi_timestamp ASC; - '''.format(cols=', '.join(IMAGE_COLS)) + fr_timestamp, + fr_archive_name + FROM filerevision + LEFT JOIN actor ON fr_actor=actor.actor_id) AS fr ON fi.file_name=fr.fr_name + WHERE fi.file_name = %s + GROUP BY fi.file_name + ORDER BY fr.fr_timestamp ASC; + '''.format(cols=', '.join(FILE_COLS)) params = (filename.replace(' ', '_'),) results = fetchall_from_commonswiki(query, params) if results: @@ -112,4 +120,4 @@ def get_file_info(filename): if __name__ == '__main__': imgs = get_files('Images_from_Wiki_Loves_Monuments_2015_in_France') - import pdb; pdb.set_trace() + print('got %d images' % len(imgs))