From 47d178e782cf45ae5b0ff90b3ae7d6662a152ec0 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Tue, 10 Mar 2026 11:49:18 -0400 Subject: [PATCH 01/12] Skeleton of what a job to update a user from GitHub will do Steps broken down roughly into functions that the background job calls that have `todo!()` implementations for now --- src/worker/jobs/mod.rs | 2 + src/worker/jobs/update_user_from_github.rs | 50 ++++++++++++++++++++++ src/worker/mod.rs | 1 + 3 files changed, 53 insertions(+) create mode 100644 src/worker/jobs/update_user_from_github.rs diff --git a/src/worker/jobs/mod.rs b/src/worker/jobs/mod.rs index b3cb1e32915..c098e5fd068 100644 --- a/src/worker/jobs/mod.rs +++ b/src/worker/jobs/mod.rs @@ -18,6 +18,7 @@ mod sync_admins; pub mod trustpub; mod typosquat; mod update_default_version; +mod update_user_from_github; pub use self::analyze_crate_file::AnalyzeCrateFile; pub use self::archive_version_downloads::ArchiveVersionDownloads; @@ -42,3 +43,4 @@ pub use self::send_publish_notifications::SendPublishNotificationsJob; pub use self::sync_admins::SyncAdmins; pub use self::typosquat::CheckTyposquat; pub use self::update_default_version::UpdateDefaultVersion; +pub use self::update_user_from_github::UpdateUserFromGithub; diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs new file mode 100644 index 00000000000..a0a16b40b8d --- /dev/null +++ b/src/worker/jobs/update_user_from_github.rs @@ -0,0 +1,50 @@ +use crate::worker::Environment; +use crates_io_github::GitHubUser; +use crates_io_worker::BackgroundJob; +use diesel_async::AsyncPgConnection; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; + +#[derive(Serialize, Deserialize)] +pub struct UpdateUserFromGithub { + /// Crates.io user ID + user_id: i32, + /// GitHub ID + account_id: i32, + /// Encrypted GitHub token + encrypted_token: Vec, +} + +impl BackgroundJob for UpdateUserFromGithub { + const JOB_NAME: &'static str = "update_user_from_github"; + const DEDUPLICATED: bool = true; + + type Context = Arc; + + /// For the specified user, query the GitHub API for the user's current information to see if + /// their account has been deleted or renamed. Update the `users` and `oauth_github` tables, + /// saving the current time in `last_sync` even if the user information hasn't changed. + async fn run(&self, ctx: Self::Context) -> anyhow::Result<()> { + let mut conn = ctx.deadpool.get().await?; + + let github_user = self.refresh_user(&ctx).await?; + + self.apply_update(&github_user, &mut conn).await; + + Ok(()) + } +} + +impl UpdateUserFromGithub { + /// Given the current environment's context, request information from GitHub using the user's + /// API token. + async fn refresh_user(&self, _ctx: &Arc) -> anyhow::Result { + todo!(); + } + + /// Given the information from GitHub about the current user, make the appropriate changes to + /// the `users` and `oauth_github` tables. + async fn apply_update(&self, _github_user: &GitHubUser, _conn: &mut AsyncPgConnection) { + todo!(); + } +} diff --git a/src/worker/mod.rs b/src/worker/mod.rs index 02caf2703ac..9f863afa8f0 100644 --- a/src/worker/mod.rs +++ b/src/worker/mod.rs @@ -43,6 +43,7 @@ impl RunnerExt for Runner> { .register_job_type::() .register_job_type::() .register_job_type::() + .register_job_type::() .register_job_type::() .register_job_type::() .register_job_type::() From 2486b4a9cc7daaa74dbe3dc63d7ccef70fc50601 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Tue, 5 May 2026 14:25:23 -0400 Subject: [PATCH 02/12] Add a last_sync column to the oauth_github table Set all existing rows to 1970 so that we'll refresh them soon. Set new rows to now, because creating a new user/oauth_github record means we just got the user's github information (which counts as a sync). --- crates/crates_io_database/src/models/user.rs | 4 ++++ crates/crates_io_database/src/schema.rs | 2 ++ crates/crates_io_database_dump/src/dump-db.toml | 1 + .../down.sql | 2 ++ .../up.sql | 8 ++++++++ 5 files changed, 17 insertions(+) create mode 100644 migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/down.sql create mode 100644 migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql diff --git a/crates/crates_io_database/src/models/user.rs b/crates/crates_io_database/src/models/user.rs index fa86bf8bd96..7222ba67625 100644 --- a/crates/crates_io_database/src/models/user.rs +++ b/crates/crates_io_database/src/models/user.rs @@ -142,6 +142,9 @@ pub struct OauthGithub { pub avatar: Option, /// In the process of being migrated from `users.gh_encrypted_token`. pub encrypted_token: Vec, + /// The last time we verified with GitHub what the GitHub username for this user was, and + /// whether the account was valid + pub last_sync: DateTime, /// In the process of being migrated from `users.gh_login`. pub login: String, /// Foreign key to the `users` table. @@ -193,6 +196,7 @@ impl NewOauthGithub<'_> { oauth_github::encrypted_token.eq(excluded(oauth_github::encrypted_token)), oauth_github::login.eq(excluded(oauth_github::login)), oauth_github::avatar.eq(excluded(oauth_github::avatar)), + oauth_github::last_sync.eq(Utc::now()), )) .get_result(&mut conn) .await diff --git a/crates/crates_io_database/src/schema.rs b/crates/crates_io_database/src/schema.rs index d62115bc904..fbd19784fa3 100644 --- a/crates/crates_io_database/src/schema.rs +++ b/crates/crates_io_database/src/schema.rs @@ -676,6 +676,8 @@ diesel::table! { avatar -> Nullable, /// Encrypted GitHub access token encrypted_token -> Bytea, + /// The last time we verified with GitHub what the GitHub username for this user was, and whether the account was valid + last_sync -> Timestamptz, /// GitHub username login -> Varchar, /// Crates.io user ID foreign key diff --git a/crates/crates_io_database_dump/src/dump-db.toml b/crates/crates_io_database_dump/src/dump-db.toml index af0a2fc0dcf..67f57077c84 100644 --- a/crates/crates_io_database_dump/src/dump-db.toml +++ b/crates/crates_io_database_dump/src/dump-db.toml @@ -170,6 +170,7 @@ account_id = "private" encrypted_token = "private" login = "private" avatar = "private" +last_sync = "private" [oauth_github.column_defaults] encrypted_token = "''" diff --git a/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/down.sql b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/down.sql new file mode 100644 index 00000000000..9fe9fd6385d --- /dev/null +++ b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE IF EXISTS oauth_github +DROP COLUMN last_sync; diff --git a/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql new file mode 100644 index 00000000000..3408a19799c --- /dev/null +++ b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql @@ -0,0 +1,8 @@ +ALTER TABLE IF EXISTS oauth_github +-- Set existing rows' last sync value to 1970 +ADD COLUMN last_sync timestamptz NOT NULL DEFAULT to_timestamp(0), +-- Set new rows' last sync value to the current time going forward, because creating a new +-- crates.io account fetches the user info from GitHub +ALTER COLUMN last_sync SET DEFAULT now(); + +comment on column oauth_github.last_sync is 'The last time we verified with GitHub what the GitHub username for this user was, and whether the account was valid'; From 9dff33297efc2f00b81ec0c9ecdf0ff44def54df Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Thu, 7 May 2026 15:23:47 -0400 Subject: [PATCH 03/12] Implement refreshing user from GitHub --- src/tests/worker/mod.rs | 1 + src/tests/worker/update_user_from_github.rs | 235 ++++++++++++++++++++ src/worker/jobs/update_user_from_github.rs | 112 +++++++++- 3 files changed, 340 insertions(+), 8 deletions(-) create mode 100644 src/tests/worker/update_user_from_github.rs diff --git a/src/tests/worker/mod.rs b/src/tests/worker/mod.rs index d2d7f98e57b..3780a2e51f9 100644 --- a/src/tests/worker/mod.rs +++ b/src/tests/worker/mod.rs @@ -9,3 +9,4 @@ mod squash_index; mod sync_admins; mod trustpub; mod update_default_version; +mod update_user_from_github; diff --git a/src/tests/worker/update_user_from_github.rs b/src/tests/worker/update_user_from_github.rs new file mode 100644 index 00000000000..f45b730deef --- /dev/null +++ b/src/tests/worker/update_user_from_github.rs @@ -0,0 +1,235 @@ +use crate::util::TestApp; +use crates_io::{ + controllers::session, + models::{OauthGithub, User}, + schema::{background_jobs, oauth_github}, + util::gh_token_encryption::GitHubTokenEncryption, + worker::jobs, +}; +use crates_io_github::{GitHubError, GitHubUser, MockGitHubClient}; +use crates_io_worker::BackgroundJob; +use diesel::prelude::*; +use diesel_async::RunQueryDsl; +use std::sync::LazyLock; + +const GITHUB_ID: i64 = 456789; +const EXISTING_LOGIN: &str = "my-login"; +static ENCRYPTED_TOKEN: LazyLock> = LazyLock::new(|| { + GitHubTokenEncryption::for_testing() + .encrypt("some random token") + .unwrap() +}); + +struct UpdateTest { + existing_github_id: i64, + existing_username: &'static str, + github_response: Result, + expected_username: &'static str, + expected_last_sync_updated: bool, +} + +impl UpdateTest { + async fn run(self) { + let Self { + existing_github_id, + existing_username, + github_response, + expected_username, + expected_last_sync_updated, + } = self; + + let mut github_mock = MockGitHubClient::new(); + github_mock + .expect_current_user() + .return_once(|_| github_response); + + let (app, _) = TestApp::full().with_github(github_mock).empty().await; + let emails = &app.as_inner().emails; + let mut conn = app.db_conn().await; + + let original_gh_user = github_user(existing_github_id, existing_username); + let u = + session::save_user_to_database(&original_gh_user, &ENCRYPTED_TOKEN, emails, &mut conn) + .await + .unwrap(); + + let oauth_github_before_update = oauth_github::table + .filter(oauth_github::user_id.eq(u.id)) + .first::(&mut conn) + .await + .unwrap(); + let last_sync_before_update = oauth_github_before_update.last_sync; + + let job = jobs::UpdateUserFromGithub::new(oauth_github_before_update); + job.enqueue(&conn).await.unwrap(); + let _ = app.try_run_pending_background_jobs().await; + + let oauth_github_after_update = oauth_github::table + .filter(oauth_github::user_id.eq(u.id)) + .first::(&mut conn) + .await + .unwrap(); + assert_eq!(expected_username, oauth_github_after_update.login); + if expected_last_sync_updated { + assert_ne!(last_sync_before_update, oauth_github_after_update.last_sync); + } else { + assert_eq!(last_sync_before_update, oauth_github_after_update.last_sync); + } + + // For now, we want to update the `User` record too + let user_after_update = User::find(&conn, u.id).await.unwrap(); + assert_eq!(expected_username, user_after_update.gh_login); + + // Drain the failed job so the `TestAppInner::drop` empty-queue + // post-condition is satisfied. + diesel::delete(background_jobs::table) + .execute(&mut conn) + .await + .unwrap(); + } +} + +fn github_user(id: i64, username: &str) -> GitHubUser { + GitHubUser { + login: username.into(), + id: id as i32, + avatar_url: None, + email: None, + name: None, + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn no_updates_needed() { + UpdateTest { + existing_github_id: GITHUB_ID, + // What we have and what github has agree + existing_username: EXISTING_LOGIN, + github_response: Ok(github_user(GITHUB_ID, EXISTING_LOGIN)), + expected_username: EXISTING_LOGIN, + // but still update last sync + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn yes_updates_needed() { + UpdateTest { + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + expected_username: "my-new-username", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn negative_github_id() { + UpdateTest { + // The GitHub ID in our database is -1 because at some point we learned the GitHub user + // was no longer valid + existing_github_id: -1, + existing_username: EXISTING_LOGIN, + // We shouldn't even contact GitHub in this case, so error in case we do + github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn zero_github_id() { + UpdateTest { + // Check that we're also treating 0 as invalid + existing_github_id: 0, + existing_username: EXISTING_LOGIN, + // We shouldn't even contact GitHub in this case, so error in case we do + github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn negative_github_id_with_ghost_username() { + UpdateTest { + // The GitHub ID in our database is negative because at some point we learned the GitHub + // user was no longer valid + existing_github_id: -9000, + // We've already set this username to ghost, but set it and update `last_sync` anyway + existing_username: "ghost_1", + // We shouldn't even contact GitHub in this case, so error in case we do + github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_deleted() { + UpdateTest { + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // If GitHub returns 404, this user has deleted their account. + github_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn still_deleted() { + UpdateTest { + existing_github_id: GITHUB_ID, + // We marked this user as deleted previously + existing_username: "ghost_1", + // GitHub still returns 404, they're still deleted + github_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_unavailable() { + UpdateTest { + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // If GitHub returns some error we haven't accounted for, we can't know anything about + // this user. Try again later. + github_response: Err(GitHubError::Other(anyhow::anyhow!("9% uptime is one nine"))), + expected_username: EXISTING_LOGIN, + expected_last_sync_updated: false, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn undeleted() { + UpdateTest { + existing_github_id: GITHUB_ID, + existing_username: "ghost_1", + // Not sure how often this happens, but if we marked an account as ghost but we get a + // valid user again, we should update their username + github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + expected_username: "my-new-username", + expected_last_sync_updated: true, + } + .run() + .await; +} diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index a0a16b40b8d..94b5f41e53e 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -1,18 +1,27 @@ -use crate::worker::Environment; -use crates_io_github::GitHubUser; +use crate::{ + models::OauthGithub, + schema::{oauth_github, users}, + worker::Environment, +}; +use chrono::Utc; +use crates_io_github::{GitHubError, GitHubUser}; use crates_io_worker::BackgroundJob; -use diesel_async::AsyncPgConnection; +use diesel::prelude::*; +use diesel_async::{AsyncConnection, AsyncPgConnection, RunQueryDsl}; use serde::{Deserialize, Serialize}; use std::sync::Arc; +use tracing::error; #[derive(Serialize, Deserialize)] pub struct UpdateUserFromGithub { /// Crates.io user ID user_id: i32, /// GitHub ID - account_id: i32, + account_id: i64, /// Encrypted GitHub token encrypted_token: Vec, + /// Username currently in the database + old_username: String, } impl BackgroundJob for UpdateUserFromGithub { @@ -36,15 +45,102 @@ impl BackgroundJob for UpdateUserFromGithub { } impl UpdateUserFromGithub { + pub fn new(oauth_github: OauthGithub) -> Self { + let OauthGithub { + user_id, + account_id, + encrypted_token, + login, + .. + } = oauth_github; + + Self { + user_id, + account_id, + encrypted_token, + old_username: login, + } + } + /// Given the current environment's context, request information from GitHub using the user's /// API token. - async fn refresh_user(&self, _ctx: &Arc) -> anyhow::Result { - todo!(); + async fn refresh_user(&self, ctx: &Arc) -> anyhow::Result { + // if the user's gh_id isn't positive, we don't even need to ask github about this, + // we know this user is invalid. Just make sure their username is the ghost username. + if self.account_id < 1 { + Ok(self.ghost_user()) + } else { + let github = ctx.github.as_ref(); + let token = ctx + .config + .gh_token_encryption + .decrypt(&self.encrypted_token)?; + + match github.current_user(&token).await { + Ok(github_user) => Ok(github_user), + // If the user is not found, the account has been deleted. Update to the ghost + // username. + Err(GitHubError::NotFound(_)) => Ok(self.ghost_user()), + // Does unauthorized/forbidden mean the user has been deleted? + // Or does it mean the user removed crates.io's authorization? + // Or that the token we have for the user is bad? + Err(GitHubError::Unauthorized(_)) | Err(GitHubError::Forbidden(_)) => { + Ok(self.ghost_user()) + } + // If we get another sort of error, it may be transient; stop and try this user + // again later. + Err(e @ GitHubError::Other(_)) => Err(e.into()), + } + } } /// Given the information from GitHub about the current user, make the appropriate changes to /// the `users` and `oauth_github` tables. - async fn apply_update(&self, _github_user: &GitHubUser, _conn: &mut AsyncPgConnection) { - todo!(); + async fn apply_update(&self, github_user: &GitHubUser, conn: &mut AsyncPgConnection) { + // Use a transaction so that we either update both or neither the `users` record and the + // corresponding `oauth_github` record. If neither are updated, log and continue to the + // next user rather than stopping-- hopefully we'll get that user updated next time. + if let Err(e) = conn + .transaction(async |conn| { + // This will be removed when we no longer sync crates.io usernames with GitHub. + // (The transaction can be removed when this is removed as well) + diesel::update(users::table) + .filter(users::id.eq(self.user_id)) + .set(users::gh_login.eq(&github_user.login)) + .execute(conn) + .await?; + + diesel::update(oauth_github::table) + .filter(oauth_github::user_id.eq(self.user_id)) + .set(( + oauth_github::login.eq(&github_user.login), + oauth_github::last_sync.eq(Utc::now()), + )) + .execute(conn) + .await?; + + Ok::<(), diesel::result::Error>(()) + }) + .await + { + // Database update failed; it's ok to not update this user this round. + // Better luck next time. + error!( + "Could not update user ID {} from username {} to username {}: {e}", + self.user_id, self.old_username, github_user.login, + ); + } + } + + /// If this user has been deleted, ensure their username has been changed to + /// `ghost_{crates.io id}` to ensure uniqueness by creating a `GitHubUser` by hand. + fn ghost_user(&self) -> GitHubUser { + GitHubUser { + avatar_url: None, + email: None, + id: self.account_id as i32, + login: format!("ghost_{}", self.user_id), + name: None, + } } } From a7bec2087203dcc6a3bba8c4330f15d4a0b81ca1 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Fri, 8 May 2026 16:53:15 -0400 Subject: [PATCH 04/12] Add a dry run mode to only log updates from GitHub --- src/tests/worker/update_user_from_github.rs | 27 ++++++++++++++++++++- src/worker/jobs/update_user_from_github.rs | 27 ++++++++++++++++++--- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/tests/worker/update_user_from_github.rs b/src/tests/worker/update_user_from_github.rs index f45b730deef..298944844cd 100644 --- a/src/tests/worker/update_user_from_github.rs +++ b/src/tests/worker/update_user_from_github.rs @@ -21,6 +21,7 @@ static ENCRYPTED_TOKEN: LazyLock> = LazyLock::new(|| { }); struct UpdateTest { + dry_run: bool, existing_github_id: i64, existing_username: &'static str, github_response: Result, @@ -31,6 +32,7 @@ struct UpdateTest { impl UpdateTest { async fn run(self) { let Self { + dry_run, existing_github_id, existing_username, github_response, @@ -60,7 +62,7 @@ impl UpdateTest { .unwrap(); let last_sync_before_update = oauth_github_before_update.last_sync; - let job = jobs::UpdateUserFromGithub::new(oauth_github_before_update); + let job = jobs::UpdateUserFromGithub::new(dry_run, oauth_github_before_update); job.enqueue(&conn).await.unwrap(); let _ = app.try_run_pending_background_jobs().await; @@ -102,6 +104,7 @@ fn github_user(id: i64, username: &str) -> GitHubUser { #[tokio::test(flavor = "multi_thread")] async fn no_updates_needed() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, // What we have and what github has agree existing_username: EXISTING_LOGIN, @@ -117,6 +120,7 @@ async fn no_updates_needed() { #[tokio::test(flavor = "multi_thread")] async fn yes_updates_needed() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, github_response: Ok(github_user(GITHUB_ID, "my-new-username")), @@ -130,6 +134,7 @@ async fn yes_updates_needed() { #[tokio::test(flavor = "multi_thread")] async fn negative_github_id() { UpdateTest { + dry_run: false, // The GitHub ID in our database is -1 because at some point we learned the GitHub user // was no longer valid existing_github_id: -1, @@ -146,6 +151,7 @@ async fn negative_github_id() { #[tokio::test(flavor = "multi_thread")] async fn zero_github_id() { UpdateTest { + dry_run: false, // Check that we're also treating 0 as invalid existing_github_id: 0, existing_username: EXISTING_LOGIN, @@ -161,6 +167,7 @@ async fn zero_github_id() { #[tokio::test(flavor = "multi_thread")] async fn negative_github_id_with_ghost_username() { UpdateTest { + dry_run: false, // The GitHub ID in our database is negative because at some point we learned the GitHub // user was no longer valid existing_github_id: -9000, @@ -178,6 +185,7 @@ async fn negative_github_id_with_ghost_username() { #[tokio::test(flavor = "multi_thread")] async fn github_deleted() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, // If GitHub returns 404, this user has deleted their account. @@ -192,6 +200,7 @@ async fn github_deleted() { #[tokio::test(flavor = "multi_thread")] async fn still_deleted() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, // We marked this user as deleted previously existing_username: "ghost_1", @@ -207,6 +216,7 @@ async fn still_deleted() { #[tokio::test(flavor = "multi_thread")] async fn github_unavailable() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, // If GitHub returns some error we haven't accounted for, we can't know anything about @@ -222,6 +232,7 @@ async fn github_unavailable() { #[tokio::test(flavor = "multi_thread")] async fn undeleted() { UpdateTest { + dry_run: false, existing_github_id: GITHUB_ID, existing_username: "ghost_1", // Not sure how often this happens, but if we marked an account as ghost but we get a @@ -233,3 +244,17 @@ async fn undeleted() { .run() .await; } + +#[tokio::test(flavor = "multi_thread")] +async fn dry_run_mode_doesnt_update() { + UpdateTest { + dry_run: true, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + expected_username: EXISTING_LOGIN, + expected_last_sync_updated: false, + } + .run() + .await; +} diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index 94b5f41e53e..d2b25291c9f 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -10,10 +10,13 @@ use diesel::prelude::*; use diesel_async::{AsyncConnection, AsyncPgConnection, RunQueryDsl}; use serde::{Deserialize, Serialize}; use std::sync::Arc; -use tracing::error; +use tracing::{error, info}; #[derive(Serialize, Deserialize)] pub struct UpdateUserFromGithub { + /// Dry run will fetch updates from GitHub and log what it would change, but does not actually + /// update the database. + dry_run: bool, /// Crates.io user ID user_id: i32, /// GitHub ID @@ -34,18 +37,35 @@ impl BackgroundJob for UpdateUserFromGithub { /// their account has been deleted or renamed. Update the `users` and `oauth_github` tables, /// saving the current time in `last_sync` even if the user information hasn't changed. async fn run(&self, ctx: Self::Context) -> anyhow::Result<()> { + info!( + dry_run = self.dry_run, + user_id = self.user_id, + github_id = self.account_id, + old_username = self.old_username, + "Starting UpdateUserFromGithub" + ); + let mut conn = ctx.deadpool.get().await?; let github_user = self.refresh_user(&ctx).await?; - self.apply_update(&github_user, &mut conn).await; + if self.dry_run { + info!( + user_id = self.user_id, + old_username = self.old_username, + new_username = github_user.login, + "Dry run UpdateUserFromGithub proposed update" + ); + } else { + self.apply_update(&github_user, &mut conn).await; + } Ok(()) } } impl UpdateUserFromGithub { - pub fn new(oauth_github: OauthGithub) -> Self { + pub fn new(dry_run: bool, oauth_github: OauthGithub) -> Self { let OauthGithub { user_id, account_id, @@ -55,6 +75,7 @@ impl UpdateUserFromGithub { } = oauth_github; Self { + dry_run, user_id, account_id, encrypted_token, From 773b789f58790d3188c95aa68056076ee0caf690 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Fri, 8 May 2026 17:06:01 -0400 Subject: [PATCH 05/12] Add an admin command to enqueue a batch of user update jobs --- src/bin/crates-admin/enqueue_job.rs | 41 ++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/bin/crates-admin/enqueue_job.rs b/src/bin/crates-admin/enqueue_job.rs index 01b2fe5be0f..edc711640d1 100644 --- a/src/bin/crates-admin/enqueue_job.rs +++ b/src/bin/crates-admin/enqueue_job.rs @@ -1,7 +1,8 @@ use anyhow::Result; use chrono::NaiveDate; use crates_io::db; -use crates_io::schema::{background_jobs, crates}; +use crates_io::models::OauthGithub; +use crates_io::schema::{background_jobs, crates, oauth_github}; use crates_io::worker::jobs; use crates_io_worker::BackgroundJob; use diesel::dsl::exists; @@ -56,6 +57,14 @@ pub enum Command { SyncUpdatesFeed, TrustpubCleanup, UpdateDownloads, + /// Sync the oldest batch of users with GitHub + UpdateUserBatch { + #[arg(long = "dry-run")] + dry_run: bool, + + #[arg(long = "batch-size", default_value = "100")] + batch_size: usize, + }, } pub async fn run(command: Command) -> Result<()> { @@ -169,6 +178,36 @@ pub async fn run(command: Command) -> Result<()> { jobs::UpdateDownloads.enqueue(&conn).await?; } } + + Command::UpdateUserBatch { + dry_run, + batch_size, + } => { + let oldest_oauth_github_records = oauth_github::table + .order(oauth_github::last_sync.asc()) + .limit(batch_size as i64) + .load::(&mut conn) + .await?; + + for oauth_github in oldest_oauth_github_records { + let user_id = oauth_github.user_id; + let github_id = oauth_github.account_id; + let old_username = oauth_github.login.clone(); + + let job = jobs::UpdateUserFromGithub::new(dry_run, oauth_github); + + // Don't stop the whole batch if one user update errors, but do log the error + if let Err(e) = job.enqueue(&conn).await { + error!( + error = %e, + user_id, + github_id, + old_username, + "Error running UpdateUserFromGithub" + ); + } + } + } }; Ok(()) From 748df9c59874ecf8115928952e0189bd97906c02 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 14:12:50 -0400 Subject: [PATCH 06/12] Mark add column with constant default safe The diesel-guard message says: > Note: For Postgres 11+, this is safe if the default is a constant value. which it is. We don't actually need the part where I changed the default after adding the column because we set the default explicitly for new rows in the code. --- .../up.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql index 3408a19799c..54f99339ab8 100644 --- a/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql +++ b/migrations/2026-05-05-180007-0000_add_last_sync_to_oauth_github/up.sql @@ -1,8 +1,8 @@ +-- safety-assured:start +-- Adding a column with a constant value default is safe because we're using Postgres > 11. ALTER TABLE IF EXISTS oauth_github --- Set existing rows' last sync value to 1970 -ADD COLUMN last_sync timestamptz NOT NULL DEFAULT to_timestamp(0), --- Set new rows' last sync value to the current time going forward, because creating a new --- crates.io account fetches the user info from GitHub -ALTER COLUMN last_sync SET DEFAULT now(); +-- Set existing rows' last sync value to 1970. +ADD COLUMN last_sync timestamptz NOT NULL DEFAULT to_timestamp(0); +-- safety-assured:end comment on column oauth_github.last_sync is 'The last time we verified with GitHub what the GitHub username for this user was, and whether the account was valid'; From 69b60f27cb487eb3838d6942c89a79aa1a9f5196 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 15:59:26 -0400 Subject: [PATCH 07/12] If current_user is unauthorized/forbidden, request public user info Anonymously via the regular GitHub API. Except for enterprise managed users that have an underscore in their username; anonymous API requests will always return 404 for them (so don't try, just keep their username). If the fallback request to get public user info by the user ID fails, probably because we've hit the rate limit, fail the job and try again later. --- .../examples/test_github_client.rs | 7 + crates/crates_io_github/src/lib.rs | 6 + src/tests/util/github.rs | 18 ++ src/tests/worker/update_user_from_github.rs | 173 ++++++++++++++++-- src/worker/jobs/update_user_from_github.rs | 34 +++- 5 files changed, 221 insertions(+), 17 deletions(-) diff --git a/crates/crates_io_github/examples/test_github_client.rs b/crates/crates_io_github/examples/test_github_client.rs index f8c259d495d..94d0a0bfdc2 100644 --- a/crates/crates_io_github/examples/test_github_client.rs +++ b/crates/crates_io_github/examples/test_github_client.rs @@ -17,6 +17,9 @@ enum Request { #[clap(long, env = "GITHUB_ACCESS_TOKEN", hide_env_values = true)] access_token: SecretString, }, + GetUserById { + account_id: i64, + }, OrgByName { org_name: String, #[clap(long, env = "GITHUB_ACCESS_TOKEN", hide_env_values = true)] @@ -68,6 +71,10 @@ async fn main() -> Result<()> { let response = github_client.get_user(&name, &access_token).await?; println!("{response:#?}"); } + Request::GetUserById { account_id } => { + let response = github_client.get_user_by_id(account_id).await?; + println!("{response:#?}"); + } Request::OrgByName { org_name, access_token, diff --git a/crates/crates_io_github/src/lib.rs b/crates/crates_io_github/src/lib.rs index 2179e773b28..ccba54204e9 100644 --- a/crates/crates_io_github/src/lib.rs +++ b/crates/crates_io_github/src/lib.rs @@ -26,6 +26,7 @@ type Result = std::result::Result; pub trait GitHubClient: Send + Sync { async fn current_user(&self, auth: &AccessToken) -> Result; async fn get_user(&self, name: &str, auth: &AccessToken) -> Result; + async fn get_user_by_id(&self, account_id: i64) -> Result; async fn org_by_name(&self, org_name: &str, auth: &AccessToken) -> Result; async fn team_by_name( &self, @@ -213,6 +214,11 @@ impl GitHubClient for RealGitHubClient { self.request(&url, auth).await } + async fn get_user_by_id(&self, account_id: i64) -> Result { + let url = format!("/user/{account_id}"); + self._request(&url, std::convert::identity).await + } + async fn org_by_name(&self, org_name: &str, auth: &AccessToken) -> Result { let url = format!("/orgs/{org_name}"); self.request(&url, auth).await diff --git a/src/tests/util/github.rs b/src/tests/util/github.rs index 8bb9b9e92fd..f605dd6e777 100644 --- a/src/tests/util/github.rs +++ b/src/tests/util/github.rs @@ -53,6 +53,9 @@ impl MockData { mock.expect_current_user() .returning(|_auth| self.current_user()); + mock.expect_get_user_by_id() + .returning(|account_id| self.get_user_by_id(account_id)); + mock.expect_org_by_name() .returning(|org_name, _auth| self.org_by_name(org_name)); @@ -81,6 +84,21 @@ impl MockData { }) } + fn get_user_by_id(&self, account_id: i64) -> Result { + let user = self + .users + .iter() + .find(|user| user.id as i64 == account_id) + .ok_or_else(not_found)?; + Ok(GitHubUser { + id: user.id, + login: user.login.into(), + name: Some(user.name.into()), + email: Some(user.email.into()), + avatar_url: Some(format!("https://avatars.example.com/{}", user.id)), + }) + } + fn org_by_name(&self, org_name: &str) -> Result { let org = self .orgs diff --git a/src/tests/worker/update_user_from_github.rs b/src/tests/worker/update_user_from_github.rs index 298944844cd..50a194c0fef 100644 --- a/src/tests/worker/update_user_from_github.rs +++ b/src/tests/worker/update_user_from_github.rs @@ -24,7 +24,8 @@ struct UpdateTest { dry_run: bool, existing_github_id: i64, existing_username: &'static str, - github_response: Result, + github_current_user_response: Result, + github_user_by_id_response: Result, expected_username: &'static str, expected_last_sync_updated: bool, } @@ -35,7 +36,8 @@ impl UpdateTest { dry_run, existing_github_id, existing_username, - github_response, + github_current_user_response, + github_user_by_id_response, expected_username, expected_last_sync_updated, } = self; @@ -43,7 +45,10 @@ impl UpdateTest { let mut github_mock = MockGitHubClient::new(); github_mock .expect_current_user() - .return_once(|_| github_response); + .return_once(|_| github_current_user_response); + github_mock + .expect_get_user_by_id() + .return_once(|_| github_user_by_id_response); let (app, _) = TestApp::full().with_github(github_mock).empty().await; let emails = &app.as_inner().emails; @@ -108,7 +113,9 @@ async fn no_updates_needed() { existing_github_id: GITHUB_ID, // What we have and what github has agree existing_username: EXISTING_LOGIN, - github_response: Ok(github_user(GITHUB_ID, EXISTING_LOGIN)), + github_current_user_response: Ok(github_user(GITHUB_ID, EXISTING_LOGIN)), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: EXISTING_LOGIN, // but still update last sync expected_last_sync_updated: true, @@ -123,7 +130,9 @@ async fn yes_updates_needed() { dry_run: false, existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, - github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + github_current_user_response: Ok(github_user(GITHUB_ID, "my-new-username")), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "my-new-username", expected_last_sync_updated: true, } @@ -140,7 +149,11 @@ async fn negative_github_id() { existing_github_id: -1, existing_username: EXISTING_LOGIN, // We shouldn't even contact GitHub in this case, so error in case we do - github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( + "current_user shouldn't be called" + ))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "ghost_1", expected_last_sync_updated: true, } @@ -156,7 +169,11 @@ async fn zero_github_id() { existing_github_id: 0, existing_username: EXISTING_LOGIN, // We shouldn't even contact GitHub in this case, so error in case we do - github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( + "current_user shouldn't be called" + ))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "ghost_1", expected_last_sync_updated: true, } @@ -174,7 +191,11 @@ async fn negative_github_id_with_ghost_username() { // We've already set this username to ghost, but set it and update `last_sync` anyway existing_username: "ghost_1", // We shouldn't even contact GitHub in this case, so error in case we do - github_response: Err(GitHubError::Other(anyhow::anyhow!("Shouldn't be called"))), + github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( + "Shouldn't be called" + ))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "ghost_1", expected_last_sync_updated: true, } @@ -189,7 +210,9 @@ async fn github_deleted() { existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, // If GitHub returns 404, this user has deleted their account. - github_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + github_current_user_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "ghost_1", expected_last_sync_updated: true, } @@ -205,7 +228,9 @@ async fn still_deleted() { // We marked this user as deleted previously existing_username: "ghost_1", // GitHub still returns 404, they're still deleted - github_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + github_current_user_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "ghost_1", expected_last_sync_updated: true, } @@ -213,6 +238,120 @@ async fn still_deleted() { .await; } +#[tokio::test(flavor = "multi_thread")] +async fn github_unauthorized_fallback_success_no_update() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // This could mean the user's oauth token has been revoked or similar + github_current_user_response: Err(GitHubError::Unauthorized(anyhow::anyhow!( + "Not allowed" + ))), + // Falling back to anonymous get_user_by_id shows no rename needed + github_user_by_id_response: Ok(github_user(GITHUB_ID, EXISTING_LOGIN)), + expected_username: EXISTING_LOGIN, + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_unauthorized_enterprise_user() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: "asmith_microsoft", + // This could mean the user's oauth token has been revoked or similar + github_current_user_response: Err(GitHubError::Unauthorized(anyhow::anyhow!( + "Not allowed" + ))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), + expected_username: "asmith_microsoft", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_unauthorized_fallback_success_yes_update() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // This could mean the user's oauth token has been revoked or similar + github_current_user_response: Err(GitHubError::Unauthorized(anyhow::anyhow!( + "Not allowed" + ))), + // Falling back to anonymous get_user_by_id shows yes rename needed + github_user_by_id_response: Ok(github_user(GITHUB_ID, "updated-username")), + expected_username: "updated-username", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_unauthorized_fallback_not_found_deleted() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // This could mean the user's oauth token has been revoked or similar + github_current_user_response: Err(GitHubError::Unauthorized(anyhow::anyhow!( + "Not allowed" + ))), + // Falling back to anonymous get_user_by_id shows the user has been deleted + github_user_by_id_response: Err(GitHubError::NotFound(anyhow::anyhow!("404 Not Found"))), + expected_username: "ghost_1", + expected_last_sync_updated: true, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_unauthorized_fallback_other_error_no_update() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // This could mean the user's oauth token has been revoked or similar + github_current_user_response: Err(GitHubError::Unauthorized(anyhow::anyhow!( + "Not allowed" + ))), + // Falling back to anonymous get_user_by_id fails too; try again later + github_user_by_id_response: Err(GitHubError::Other(anyhow::anyhow!( + "Over your rate limit" + ))), + expected_username: EXISTING_LOGIN, + expected_last_sync_updated: false, + } + .run() + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn github_forbidden_fallback_success_yes_update() { + UpdateTest { + dry_run: false, + existing_github_id: GITHUB_ID, + existing_username: EXISTING_LOGIN, + // This could mean the user's oauth token needs to be refreshed or similar + github_current_user_response: Err(GitHubError::Forbidden(anyhow::anyhow!("Not allowed"))), + // Falling back to anonymous get_user_by_id shows yes rename needed + github_user_by_id_response: Ok(github_user(GITHUB_ID, "updated-username")), + expected_username: "updated-username", + expected_last_sync_updated: true, + } + .run() + .await; +} + #[tokio::test(flavor = "multi_thread")] async fn github_unavailable() { UpdateTest { @@ -221,7 +360,11 @@ async fn github_unavailable() { existing_username: EXISTING_LOGIN, // If GitHub returns some error we haven't accounted for, we can't know anything about // this user. Try again later. - github_response: Err(GitHubError::Other(anyhow::anyhow!("9% uptime is one nine"))), + github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( + "9% uptime is one nine" + ))), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: EXISTING_LOGIN, expected_last_sync_updated: false, } @@ -237,7 +380,9 @@ async fn undeleted() { existing_username: "ghost_1", // Not sure how often this happens, but if we marked an account as ghost but we get a // valid user again, we should update their username - github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + github_current_user_response: Ok(github_user(GITHUB_ID, "my-new-username")), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: "my-new-username", expected_last_sync_updated: true, } @@ -251,7 +396,9 @@ async fn dry_run_mode_doesnt_update() { dry_run: true, existing_github_id: GITHUB_ID, existing_username: EXISTING_LOGIN, - github_response: Ok(github_user(GITHUB_ID, "my-new-username")), + github_current_user_response: Ok(github_user(GITHUB_ID, "my-new-username")), + // Shouldn't be called in this test + github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), expected_username: EXISTING_LOGIN, expected_last_sync_updated: false, } diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index d2b25291c9f..f598ccd756d 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -102,11 +102,37 @@ impl UpdateUserFromGithub { // If the user is not found, the account has been deleted. Update to the ghost // username. Err(GitHubError::NotFound(_)) => Ok(self.ghost_user()), - // Does unauthorized/forbidden mean the user has been deleted? - // Or does it mean the user removed crates.io's authorization? - // Or that the token we have for the user is bad? + // Unauthorized/forbidden could mean: + // + // - the token we have for this user is out-of-date + // - the user has revoked crates.io's oauth access + // + // In those cases, try to request the user's info via an unauthenticated GitHub + // API request, unless they are a GitHub Enterprise Managed User as indicated by an + // underscore in their username because we have to be authorized by the managing + // enterprise to see any information on enterprise managed users. Err(GitHubError::Unauthorized(_)) | Err(GitHubError::Forbidden(_)) => { - Ok(self.ghost_user()) + // Enterprise managed users are the only ones that should contain underscores. + if self.old_username.contains('_') { + // We can't get updated info, so keep what we have. + Ok(GitHubUser { + login: self.old_username.clone(), + id: self.account_id as i32, + // The other fields are not used in `apply_update`. + avatar_url: Default::default(), + email: Default::default(), + name: Default::default(), + }) + } else { + match github.get_user_by_id(self.account_id).await { + Ok(github_user) => Ok(github_user), + Err(GitHubError::NotFound(_)) => Ok(self.ghost_user()), + // Not sure how we could get Unauthorized/Forbidden for an anonymous + // API request. We could get rate limited though; if that's the case, + // stop and try this user again later. + Err(e) => Err(e.into()), + } + } } // If we get another sort of error, it may be transient; stop and try this user // again later. From 8440bc6e575b9704078b42a41e6e4d4993011308 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 16:09:11 -0400 Subject: [PATCH 08/12] Don't update the users table if the username hasn't changed --- src/worker/jobs/update_user_from_github.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index f598ccd756d..0b42a082a2c 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -151,12 +151,17 @@ impl UpdateUserFromGithub { .transaction(async |conn| { // This will be removed when we no longer sync crates.io usernames with GitHub. // (The transaction can be removed when this is removed as well) - diesel::update(users::table) - .filter(users::id.eq(self.user_id)) - .set(users::gh_login.eq(&github_user.login)) - .execute(conn) - .await?; + // It's only needed if there's a change in username. + if self.old_username != github_user.login { + diesel::update(users::table) + .filter(users::id.eq(self.user_id)) + .set(users::gh_login.eq(&github_user.login)) + .execute(conn) + .await?; + } + // This update is needed even if there's no change in username to set the + // `last_sync` time to `now`. diesel::update(oauth_github::table) .filter(oauth_github::user_id.eq(self.user_id)) .set(( From 7240582fc22c0f842b6122faa2b879720298da29 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 16:33:34 -0400 Subject: [PATCH 09/12] Only put dry_run and account_id in job params --- src/bin/crates-admin/enqueue_job.rs | 5 +- src/tests/worker/update_user_from_github.rs | 5 +- src/worker/jobs/update_user_from_github.rs | 91 ++++++++++----------- 3 files changed, 51 insertions(+), 50 deletions(-) diff --git a/src/bin/crates-admin/enqueue_job.rs b/src/bin/crates-admin/enqueue_job.rs index edc711640d1..12d28a349ac 100644 --- a/src/bin/crates-admin/enqueue_job.rs +++ b/src/bin/crates-admin/enqueue_job.rs @@ -194,7 +194,10 @@ pub async fn run(command: Command) -> Result<()> { let github_id = oauth_github.account_id; let old_username = oauth_github.login.clone(); - let job = jobs::UpdateUserFromGithub::new(dry_run, oauth_github); + let job = jobs::UpdateUserFromGithub { + dry_run, + account_id: oauth_github.account_id, + }; // Don't stop the whole batch if one user update errors, but do log the error if let Err(e) = job.enqueue(&conn).await { diff --git a/src/tests/worker/update_user_from_github.rs b/src/tests/worker/update_user_from_github.rs index 50a194c0fef..c9af5e1cda9 100644 --- a/src/tests/worker/update_user_from_github.rs +++ b/src/tests/worker/update_user_from_github.rs @@ -67,7 +67,10 @@ impl UpdateTest { .unwrap(); let last_sync_before_update = oauth_github_before_update.last_sync; - let job = jobs::UpdateUserFromGithub::new(dry_run, oauth_github_before_update); + let job = jobs::UpdateUserFromGithub { + dry_run, + account_id: oauth_github_before_update.account_id, + }; job.enqueue(&conn).await.unwrap(); let _ = app.try_run_pending_background_jobs().await; diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index 0b42a082a2c..128e5c2c06e 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -16,15 +16,9 @@ use tracing::{error, info}; pub struct UpdateUserFromGithub { /// Dry run will fetch updates from GitHub and log what it would change, but does not actually /// update the database. - dry_run: bool, - /// Crates.io user ID - user_id: i32, + pub dry_run: bool, /// GitHub ID - account_id: i64, - /// Encrypted GitHub token - encrypted_token: Vec, - /// Username currently in the database - old_username: String, + pub account_id: i64, } impl BackgroundJob for UpdateUserFromGithub { @@ -37,27 +31,35 @@ impl BackgroundJob for UpdateUserFromGithub { /// their account has been deleted or renamed. Update the `users` and `oauth_github` tables, /// saving the current time in `last_sync` even if the user information hasn't changed. async fn run(&self, ctx: Self::Context) -> anyhow::Result<()> { + let mut conn = ctx.deadpool.get().await?; + + // If no oauth_github info with this account id is found, then the record has been deleted + // since this job was enqueued. Stop and exit with success so we don't retry the job. + let oauth_github = oauth_github::table + .filter(oauth_github::account_id.eq(self.account_id)) + .first::(&mut conn) + .await?; + info!( dry_run = self.dry_run, - user_id = self.user_id, + user_id = oauth_github.user_id, github_id = self.account_id, - old_username = self.old_username, + old_username = oauth_github.login, "Starting UpdateUserFromGithub" ); - let mut conn = ctx.deadpool.get().await?; - - let github_user = self.refresh_user(&ctx).await?; + let github_user = self.refresh_user(&ctx, &oauth_github).await?; if self.dry_run { info!( - user_id = self.user_id, - old_username = self.old_username, + user_id = oauth_github.user_id, + old_username = oauth_github.login, new_username = github_user.login, "Dry run UpdateUserFromGithub proposed update" ); } else { - self.apply_update(&github_user, &mut conn).await; + self.apply_update(&oauth_github, &github_user, &mut conn) + .await; } Ok(()) @@ -65,43 +67,29 @@ impl BackgroundJob for UpdateUserFromGithub { } impl UpdateUserFromGithub { - pub fn new(dry_run: bool, oauth_github: OauthGithub) -> Self { - let OauthGithub { - user_id, - account_id, - encrypted_token, - login, - .. - } = oauth_github; - - Self { - dry_run, - user_id, - account_id, - encrypted_token, - old_username: login, - } - } - /// Given the current environment's context, request information from GitHub using the user's /// API token. - async fn refresh_user(&self, ctx: &Arc) -> anyhow::Result { + async fn refresh_user( + &self, + ctx: &Arc, + oauth_github: &OauthGithub, + ) -> anyhow::Result { // if the user's gh_id isn't positive, we don't even need to ask github about this, // we know this user is invalid. Just make sure their username is the ghost username. if self.account_id < 1 { - Ok(self.ghost_user()) + Ok(self.ghost_user(oauth_github.user_id)) } else { let github = ctx.github.as_ref(); let token = ctx .config .gh_token_encryption - .decrypt(&self.encrypted_token)?; + .decrypt(&oauth_github.encrypted_token)?; match github.current_user(&token).await { Ok(github_user) => Ok(github_user), // If the user is not found, the account has been deleted. Update to the ghost // username. - Err(GitHubError::NotFound(_)) => Ok(self.ghost_user()), + Err(GitHubError::NotFound(_)) => Ok(self.ghost_user(oauth_github.user_id)), // Unauthorized/forbidden could mean: // // - the token we have for this user is out-of-date @@ -113,10 +101,10 @@ impl UpdateUserFromGithub { // enterprise to see any information on enterprise managed users. Err(GitHubError::Unauthorized(_)) | Err(GitHubError::Forbidden(_)) => { // Enterprise managed users are the only ones that should contain underscores. - if self.old_username.contains('_') { + if oauth_github.login.contains('_') { // We can't get updated info, so keep what we have. Ok(GitHubUser { - login: self.old_username.clone(), + login: oauth_github.login.clone(), id: self.account_id as i32, // The other fields are not used in `apply_update`. avatar_url: Default::default(), @@ -126,7 +114,9 @@ impl UpdateUserFromGithub { } else { match github.get_user_by_id(self.account_id).await { Ok(github_user) => Ok(github_user), - Err(GitHubError::NotFound(_)) => Ok(self.ghost_user()), + Err(GitHubError::NotFound(_)) => { + Ok(self.ghost_user(oauth_github.user_id)) + } // Not sure how we could get Unauthorized/Forbidden for an anonymous // API request. We could get rate limited though; if that's the case, // stop and try this user again later. @@ -143,7 +133,12 @@ impl UpdateUserFromGithub { /// Given the information from GitHub about the current user, make the appropriate changes to /// the `users` and `oauth_github` tables. - async fn apply_update(&self, github_user: &GitHubUser, conn: &mut AsyncPgConnection) { + async fn apply_update( + &self, + oauth_github: &OauthGithub, + github_user: &GitHubUser, + conn: &mut AsyncPgConnection, + ) { // Use a transaction so that we either update both or neither the `users` record and the // corresponding `oauth_github` record. If neither are updated, log and continue to the // next user rather than stopping-- hopefully we'll get that user updated next time. @@ -152,9 +147,9 @@ impl UpdateUserFromGithub { // This will be removed when we no longer sync crates.io usernames with GitHub. // (The transaction can be removed when this is removed as well) // It's only needed if there's a change in username. - if self.old_username != github_user.login { + if oauth_github.login != github_user.login { diesel::update(users::table) - .filter(users::id.eq(self.user_id)) + .filter(users::id.eq(oauth_github.user_id)) .set(users::gh_login.eq(&github_user.login)) .execute(conn) .await?; @@ -163,7 +158,7 @@ impl UpdateUserFromGithub { // This update is needed even if there's no change in username to set the // `last_sync` time to `now`. diesel::update(oauth_github::table) - .filter(oauth_github::user_id.eq(self.user_id)) + .filter(oauth_github::account_id.eq(self.account_id)) .set(( oauth_github::login.eq(&github_user.login), oauth_github::last_sync.eq(Utc::now()), @@ -179,19 +174,19 @@ impl UpdateUserFromGithub { // Better luck next time. error!( "Could not update user ID {} from username {} to username {}: {e}", - self.user_id, self.old_username, github_user.login, + oauth_github.user_id, oauth_github.login, github_user.login, ); } } /// If this user has been deleted, ensure their username has been changed to /// `ghost_{crates.io id}` to ensure uniqueness by creating a `GitHubUser` by hand. - fn ghost_user(&self) -> GitHubUser { + fn ghost_user(&self, user_id: i32) -> GitHubUser { GitHubUser { avatar_url: None, email: None, id: self.account_id as i32, - login: format!("ghost_{}", self.user_id), + login: format!("ghost_{}", user_id), name: None, } } From 16330d214290e93d99fb5f082077030583757be4 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 16:38:57 -0400 Subject: [PATCH 10/12] Remove handling of github ids < 1; they don't have oauth_github records And oauth_github is the only source used to enqueue jobs. I'll update the `user.gh_login` records for users with gh_id < 1 manually once this whole strategy is approved, to get us to a state where `user.gh_login` is unique-- but that isn't what this job needs to do in perpetuity, so don't. --- src/tests/worker/update_user_from_github.rs | 63 -------------- src/worker/jobs/update_user_from_github.rs | 92 ++++++++++----------- 2 files changed, 42 insertions(+), 113 deletions(-) diff --git a/src/tests/worker/update_user_from_github.rs b/src/tests/worker/update_user_from_github.rs index c9af5e1cda9..e5504bf3ce9 100644 --- a/src/tests/worker/update_user_from_github.rs +++ b/src/tests/worker/update_user_from_github.rs @@ -143,69 +143,6 @@ async fn yes_updates_needed() { .await; } -#[tokio::test(flavor = "multi_thread")] -async fn negative_github_id() { - UpdateTest { - dry_run: false, - // The GitHub ID in our database is -1 because at some point we learned the GitHub user - // was no longer valid - existing_github_id: -1, - existing_username: EXISTING_LOGIN, - // We shouldn't even contact GitHub in this case, so error in case we do - github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( - "current_user shouldn't be called" - ))), - // Shouldn't be called in this test - github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), - expected_username: "ghost_1", - expected_last_sync_updated: true, - } - .run() - .await; -} - -#[tokio::test(flavor = "multi_thread")] -async fn zero_github_id() { - UpdateTest { - dry_run: false, - // Check that we're also treating 0 as invalid - existing_github_id: 0, - existing_username: EXISTING_LOGIN, - // We shouldn't even contact GitHub in this case, so error in case we do - github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( - "current_user shouldn't be called" - ))), - // Shouldn't be called in this test - github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), - expected_username: "ghost_1", - expected_last_sync_updated: true, - } - .run() - .await; -} - -#[tokio::test(flavor = "multi_thread")] -async fn negative_github_id_with_ghost_username() { - UpdateTest { - dry_run: false, - // The GitHub ID in our database is negative because at some point we learned the GitHub - // user was no longer valid - existing_github_id: -9000, - // We've already set this username to ghost, but set it and update `last_sync` anyway - existing_username: "ghost_1", - // We shouldn't even contact GitHub in this case, so error in case we do - github_current_user_response: Err(GitHubError::Other(anyhow::anyhow!( - "Shouldn't be called" - ))), - // Shouldn't be called in this test - github_user_by_id_response: Ok(github_user(GITHUB_ID, "wrong-user-info")), - expected_username: "ghost_1", - expected_last_sync_updated: true, - } - .run() - .await; -} - #[tokio::test(flavor = "multi_thread")] async fn github_deleted() { UpdateTest { diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index 128e5c2c06e..dcf6a213596 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -74,60 +74,52 @@ impl UpdateUserFromGithub { ctx: &Arc, oauth_github: &OauthGithub, ) -> anyhow::Result { - // if the user's gh_id isn't positive, we don't even need to ask github about this, - // we know this user is invalid. Just make sure their username is the ghost username. - if self.account_id < 1 { - Ok(self.ghost_user(oauth_github.user_id)) - } else { - let github = ctx.github.as_ref(); - let token = ctx - .config - .gh_token_encryption - .decrypt(&oauth_github.encrypted_token)?; + let github = ctx.github.as_ref(); + let token = ctx + .config + .gh_token_encryption + .decrypt(&oauth_github.encrypted_token)?; - match github.current_user(&token).await { - Ok(github_user) => Ok(github_user), - // If the user is not found, the account has been deleted. Update to the ghost - // username. - Err(GitHubError::NotFound(_)) => Ok(self.ghost_user(oauth_github.user_id)), - // Unauthorized/forbidden could mean: - // - // - the token we have for this user is out-of-date - // - the user has revoked crates.io's oauth access - // - // In those cases, try to request the user's info via an unauthenticated GitHub - // API request, unless they are a GitHub Enterprise Managed User as indicated by an - // underscore in their username because we have to be authorized by the managing - // enterprise to see any information on enterprise managed users. - Err(GitHubError::Unauthorized(_)) | Err(GitHubError::Forbidden(_)) => { - // Enterprise managed users are the only ones that should contain underscores. - if oauth_github.login.contains('_') { - // We can't get updated info, so keep what we have. - Ok(GitHubUser { - login: oauth_github.login.clone(), - id: self.account_id as i32, - // The other fields are not used in `apply_update`. - avatar_url: Default::default(), - email: Default::default(), - name: Default::default(), - }) - } else { - match github.get_user_by_id(self.account_id).await { - Ok(github_user) => Ok(github_user), - Err(GitHubError::NotFound(_)) => { - Ok(self.ghost_user(oauth_github.user_id)) - } - // Not sure how we could get Unauthorized/Forbidden for an anonymous - // API request. We could get rate limited though; if that's the case, - // stop and try this user again later. - Err(e) => Err(e.into()), - } + match github.current_user(&token).await { + Ok(github_user) => Ok(github_user), + // If the user is not found, the account has been deleted. Update to the ghost + // username. + Err(GitHubError::NotFound(_)) => Ok(self.ghost_user(oauth_github.user_id)), + // Unauthorized/forbidden could mean: + // + // - the token we have for this user is out-of-date + // - the user has revoked crates.io's oauth access + // + // In those cases, try to request the user's info via an unauthenticated GitHub + // API request, unless they are a GitHub Enterprise Managed User as indicated by an + // underscore in their username because we have to be authorized by the managing + // enterprise to see any information on enterprise managed users. + Err(GitHubError::Unauthorized(_)) | Err(GitHubError::Forbidden(_)) => { + // Enterprise managed users are the only ones that should contain underscores. + if oauth_github.login.contains('_') { + // We can't get updated info, so keep what we have. + Ok(GitHubUser { + login: oauth_github.login.clone(), + id: self.account_id as i32, + // The other fields are not used in `apply_update`. + avatar_url: Default::default(), + email: Default::default(), + name: Default::default(), + }) + } else { + match github.get_user_by_id(self.account_id).await { + Ok(github_user) => Ok(github_user), + Err(GitHubError::NotFound(_)) => Ok(self.ghost_user(oauth_github.user_id)), + // Not sure how we could get Unauthorized/Forbidden for an anonymous + // API request. We could get rate limited though; if that's the case, + // stop and try this user again later. + Err(e) => Err(e.into()), } } - // If we get another sort of error, it may be transient; stop and try this user - // again later. - Err(e @ GitHubError::Other(_)) => Err(e.into()), } + // If we get another sort of error, it may be transient; stop and try this user + // again later. + Err(e @ GitHubError::Other(_)) => Err(e.into()), } } From dc44cff2307db4823458ddba03158ed747a2f4bc Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 16:47:40 -0400 Subject: [PATCH 11/12] Put info in text of log messages, not structured fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks DataDog 😞 --- src/worker/jobs/update_user_from_github.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index dcf6a213596..487284945ca 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -41,21 +41,20 @@ impl BackgroundJob for UpdateUserFromGithub { .await?; info!( - dry_run = self.dry_run, - user_id = oauth_github.user_id, - github_id = self.account_id, - old_username = oauth_github.login, - "Starting UpdateUserFromGithub" + "Starting UpdateUserFromGithub ({}): user_id {}, github_id {}, old username {}", + if self.dry_run { "DRY RUN" } else { "FOR REAL" }, + oauth_github.user_id, + self.account_id, + oauth_github.login, ); let github_user = self.refresh_user(&ctx, &oauth_github).await?; if self.dry_run { info!( - user_id = oauth_github.user_id, - old_username = oauth_github.login, - new_username = github_user.login, - "Dry run UpdateUserFromGithub proposed update" + "Dry run UpdateUserFromGithub proposed update for crates.io user {} \ + from username `{}` to username `{}`", + oauth_github.user_id, oauth_github.login, github_user.login, ); } else { self.apply_update(&oauth_github, &github_user, &mut conn) From 7efc1be7e55c9bff2d1d276541c13d8e9fce28a8 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 13 May 2026 16:54:12 -0400 Subject: [PATCH 12/12] Make priority of username updating bg jobs negative --- src/worker/jobs/update_user_from_github.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/worker/jobs/update_user_from_github.rs b/src/worker/jobs/update_user_from_github.rs index 487284945ca..908cf6bb41e 100644 --- a/src/worker/jobs/update_user_from_github.rs +++ b/src/worker/jobs/update_user_from_github.rs @@ -24,6 +24,8 @@ pub struct UpdateUserFromGithub { impl BackgroundJob for UpdateUserFromGithub { const JOB_NAME: &'static str = "update_user_from_github"; const DEDUPLICATED: bool = true; + // These jobs aren't urgent and shouldn't page anyone if they take a long time. + const PRIORITY: i16 = -15; type Context = Arc;