diff --git a/R/cr_citation_count.r b/R/cr_citation_count.r index 62ba008..2eecb6b 100644 --- a/R/cr_citation_count.r +++ b/R/cr_citation_count.r @@ -1,75 +1,71 @@ -#' Get a citation count via CrossRef OpenURL +#' Get a citation count via CrossRef #' #' @export #' -#' @param doi (character) One or more digital object identifiers. If -#' `async=FALSE` we do synchronous HTTP requests in an `lapply` call, but -#' if `async=TRUE`, we do asynchronous HTTP requests. -#' @param url (character) the url for the function (should be left to default) -#' @param key your Crossref OpenURL email address, either enter, or loads -#' from `.Rprofile`. We use a default, so you don't need to pass this. -#' @param async (logical) use async HTTP requests. Default: `FALSE` -#' @param ... Curl options passed on to [crul::HttpClient()] +#' @param doi (character) One or more digital object identifiers. +#' @param url (character) the url for the OpenURL function (used when no Plus +#' token is configured) +#' @param key your Crossref OpenURL email address. Used as the `pid` parameter +#' for the legacy OpenURL API when `plus_token` is not set. +#' @param async (logical) use async HTTP requests for the legacy OpenURL path. +#' Default: `FALSE`. Ignored when `plus_token` is set. +#' @param plus_token (character) Crossref Metadata Plus API token. When set, +#' uses the REST API (`https://api.crossref.org/works/{doi}`) with the +#' `Crossref-Plus-API-Token: Bearer ` header and sends requests in +#' async batches. Defaults to the `crossref_plus_token` environment variable. +#' @param batch_size (integer) number of concurrent async requests per batch +#' when using the Plus REST path. Defaults to the `crossref_batch_size` +#' environment variable, or 50 if unset. Should not exceed the +#' `x-concurrency-limit` returned by the Crossref API. +#' @param max_retries (integer) maximum number of retry attempts for failed +#' requests (HTTP 429 or 5xx) on the Plus REST path. Default: 3. +#' @param ... Curl options passed on to [crul::HttpClient()] or +#' [crul::HttpRequest()] #' -#' @return a data.frame, with columns `doi` and `count`. The count column -#' has numeric values that are the citation count for that DOI, or `NA` if -#' not found or no count available -#' -#' @details See https://www.crossref.org/labs/openurl/ for more info on this -#' Crossref API service. +#' @return a data.frame with columns `doi` and `count`. `count` is numeric or +#' `NA` when not found. #' -#' This number is also known as **cited-by** +#' @details +#' When `plus_token` is provided (or set via the `crossref_plus_token` env +#' var), the function uses the Crossref REST API with your Metadata Plus +#' subscription, which provides higher rate limits. Requests are sent in +#' async batches of `batch_size` concurrent connections. #' -#' Note that this number may be out of sync/may not match that that the -#' publisher is showing (if they show it) for the same DOI/article. -#' -#' We've contacted Crossref about this, and they have confirmed this. -#' Unfortunately, we can not do anything about this. -#' -#' I would imagine it's best to use this data instead of from the publishers, -#' and this data you can get programatically :) -#' -#' @section failure behavior: -#' When a DOI does not exist, we may not get a proper HTTP status code -#' to throw a proper stop status, so we grep on the text itself, and throw -#' a stop if only one DOI passed and not using async, or warning if more -#' than one DOI passed or if using async. +#' Without a Plus token the legacy OpenURL API is used +#' (`https://doi.crossref.org/openurl/`). Set `async = TRUE` to fire all +#' OpenURL requests concurrently (no batching). #' #' @seealso [cr_search()], [cr_r()] -#' @author Carl Boettiger \email{cboettig@@gmail.com}, -#' Scott Chamberlain +#' @author Carl Boettiger \email{cboettig@@gmail.com}, Scott Chamberlain #' @examples \dontrun{ -#' cr_citation_count(doi="10.1371/journal.pone.0042793") -#' cr_citation_count(doi="10.1016/j.fbr.2012.01.001") -#' ## many +#' cr_citation_count(doi = "10.1371/journal.pone.0042793") +#' +#' # Many DOIs, legacy async #' dois <- c("10.1016/j.fbr.2012.01.001", "10.1371/journal.pone.0042793") +#' cr_citation_count(doi = dois, async = TRUE) +#' +#' # Metadata Plus path (token read from crossref_plus_token env var) +#' Sys.setenv(crossref_plus_token = "my-token") #' cr_citation_count(doi = dois) -#' # DOI not found -#' cr_citation_count(doi="10.1016/j.fbr.2012") -#' -#' # asyc -#' dois <- c("10.1016/j.fbr.2012.01.001", "10.1371/journal.pone.0042793", -#' "10.1016/j.fbr.2012", "10.1109/tsp.2006.874779", "10.1007/bf02231542", -#' "10.1007/s00277-016-2782-z", "10.1002/9781118339893.wbeccp020", -#' "10.1177/011542659200700105", "10.1002/chin.197444438", -#' "10.1002/9781118619599.ch4", "10.1007/s00466-012-0724-8", -#' "10.1017/s0376892900029477", "10.1167/16.12.824") -#' res <- cr_citation_count(doi = dois, async = TRUE) -#' ## verbose curl -#' res <- cr_citation_count(doi = dois, async = TRUE, verbose = TRUE) -#' res -#' ## time comparison -#' system.time(cr_citation_count(doi = dois, async = TRUE)) -#' system.time(cr_citation_count(doi = dois, async = FALSE)) -#' -#' # from a set of random DOIs -#' cr_citation_count(cr_r(50), async = TRUE) +#' +#' # Explicit token and batch size +#' cr_citation_count(doi = dois, plus_token = "my-token", batch_size = 25L) #' } -cr_citation_count <- function(doi, url = "http://www.crossref.org/openurl/", - key = "cboettig@ropensci.org", async = FALSE, ...) { +cr_citation_count <- function(doi, + url = "http://www.crossref.org/openurl/", + key = "cboettig@ropensci.org", + async = FALSE, + plus_token = Sys.getenv("crossref_plus_token"), + batch_size = as.integer( + Sys.getenv("crossref_batch_size", unset = "50")), + max_retries = 3L, + ...) { - if (async) { + if (nchar(plus_token) > 0) { + cr_cc_rest_batched(doi, plus_token = plus_token, batch_size = batch_size, + max_retries = max_retries, ...) + } else if (async) { cr_cc_async(doi, url, key, ...) } else { out <- lapply(doi, cr_cc, ur = url, key = key, ...) @@ -77,6 +73,139 @@ cr_citation_count <- function(doi, url = "http://www.crossref.org/openurl/", } } +# Parse Crossref interval strings ("1s", "200ms") -> numeric seconds +parse_rl_interval <- function(s) { + if (is.null(s) || is.na(s) || !nzchar(trimws(s))) return(1) + s <- trimws(s) + if (grepl("ms$", s)) as.numeric(sub("ms$", "", s)) / 1000 + else if (grepl("s$", s)) as.numeric(sub("s$", "", s)) + else 1 +} + +# Extract x-rate-limit-* and x-concurrency-limit from the first response +# that carries them; returns a named list or NULL. +extract_rl_headers <- function(resps) { + for (r in resps) { + h <- r$response_headers + lim <- h[["x-rate-limit-limit"]] + if (!is.null(lim)) { + return(list( + limit = as.integer(lim), + interval = parse_rl_interval(h[["x-rate-limit-interval"]]), + concurrency = suppressWarnings(as.integer(h[["x-concurrency-limit"]])) + )) + } + } + NULL +} + +# Send one concurrent batch of DOIs. +# Returns list(data = , rl = ). +cr_cc_rest_async <- function(doi, plus_token, max_retries = 3L, ...) { + headers <- list( + `User-Agent` = rcrossref_ua(), + `X-USER-AGENT` = rcrossref_ua(), + `Crossref-Plus-API-Token` = paste0("Bearer ", plus_token) + ) + + make_reqs <- function(dois) lapply(dois, function(d) { + crul::HttpRequest$new( + url = sprintf("https://api.crossref.org/works/%s", + utils::URLencode(d, reserved = TRUE)), + headers = headers, + opts = list(...) + )$get() + }) + + pending <- doi + resolved <- list() # named by doi -> one-row data.frame + rl <- NULL + backoff <- 1 # seconds, doubles on each retry + + for (attempt in seq_len(max_retries + 1L)) { + if (attempt > 1L) Sys.sleep(backoff) + + cli <- crul::AsyncVaried$new(.list = make_reqs(pending)) + cli$request() + statuses <- cli$status_code() + resps <- cli$responses() + bodies <- cli$parse() + + if (is.null(rl)) rl <- extract_rl_headers(resps) + + retry_dois <- character(0) + retry_after <- NULL + + for (i in seq_along(pending)) { + d <- pending[[i]] + sc <- statuses[[i]] + + if (sc == 200L) { + resolved[[d]] <- tryCatch({ + r <- jsonlite::fromJSON(bodies[[i]]) + data.frame(doi = d, + count = as.numeric(r$message$`is-referenced-by-count`), + stringsAsFactors = FALSE) + }, error = function(e) { + warning("Failed to parse response for DOI: ", d, call. = FALSE) + data.frame(doi = d, count = NA_integer_, stringsAsFactors = FALSE) + }) + } else if (sc == 429L || sc >= 500L) { + retry_dois <- c(retry_dois, d) + if (sc == 429L && is.null(retry_after)) { + ra <- resps[[i]]$response_headers[["retry-after"]] + if (!is.null(ra)) retry_after <- as.numeric(ra) + } + } else { + warning("HTTP ", sc, " for DOI: ", d, call. = FALSE) + resolved[[d]] <- data.frame(doi = d, count = NA_integer_, + stringsAsFactors = FALSE) + } + } + + pending <- retry_dois + if (length(pending) == 0L) break + + backoff <- if (!is.null(retry_after)) retry_after + else min(backoff * 2, 60) + retry_after <- NULL + } + + # Any DOIs that exhausted retries + for (d in pending) { + warning("Max retries exceeded for DOI: ", d, call. = FALSE) + resolved[[d]] <- data.frame(doi = d, count = NA_integer_, + stringsAsFactors = FALSE) + } + + list(data = do.call(rbind, resolved[doi]), rl = rl) +} + +# Splits DOIs into batches and calls cr_cc_rest_async for each. +# Uses rate-limit headers from each batch to compute inter-batch sleep. +cr_cc_rest_batched <- function(doi, plus_token, batch_size = 50L, + max_retries = 3L, ...) { + if (is.na(batch_size) || batch_size < 1L) batch_size <- 50L + batches <- split(doi, ceiling(seq_along(doi) / batch_size)) + all_results <- vector("list", length(batches)) + rl <- NULL + + for (i in seq_along(batches)) { + # Sleep before this batch based on rate-limit headers from the previous one + if (i > 1L && !is.null(rl) && !is.na(rl$limit) && rl$limit > 0L) { + sleep_secs <- (length(batches[[i - 1L]]) / rl$limit) * rl$interval + if (sleep_secs > 0) Sys.sleep(sleep_secs) + } + + out <- cr_cc_rest_async(batches[[i]], plus_token = plus_token, + max_retries = max_retries, ...) + all_results[[i]] <- out$data + if (is.null(rl) && !is.null(out$rl)) rl <- out$rl + } + + do.call(rbind, all_results) +} + cr_cc <- function(doi, url, key, ...) { args <- list(id = paste("doi:", doi, sep = ""), pid = as.character(key), noredirect = as.logical(TRUE)) diff --git a/R/zzz.R b/R/zzz.R index 3305835..7b01ec8 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -21,16 +21,23 @@ rcrossref_ua <- function() { paste0(versions, collapse = " ") } +get_plus_token <- function() { + token <- Sys.getenv("crossref_plus_token") + if (identical(token, "")) NULL else token +} + cr_GET <- function(endpoint, args, todf = TRUE, on_error = warning, parse = TRUE, ...) { url <- sprintf("https://api.crossref.org/%s", endpoint) - cli <- crul::HttpClient$new( - url = url, - headers = list( - `User-Agent` = rcrossref_ua(), - `X-USER-AGENT` = rcrossref_ua() - ) + headers <- list( + `User-Agent` = rcrossref_ua(), + `X-USER-AGENT` = rcrossref_ua() ) + token <- get_plus_token() + if (!is.null(token)) { + headers[["Crossref-Plus-API-Token"]] <- paste0("Bearer ", token) + } + cli <- crul::HttpClient$new(url = url, headers = headers) if (length(args) == 0) { res <- cli$get(...) } else { diff --git a/tests/testthat/helper-rcrossref.R b/tests/testthat/helper-rcrossref.R index 1d4423a..ef15a41 100644 --- a/tests/testthat/helper-rcrossref.R +++ b/tests/testthat/helper-rcrossref.R @@ -3,7 +3,8 @@ library("vcr") invisible(vcr::vcr_configure( dir = "../fixtures", filter_sensitive_data = list( - "" = Sys.getenv("crossref_email") + "" = Sys.getenv("crossref_email"), + "" = Sys.getenv("crossref_plus_token") ) )) diff --git a/tests/testthat/test-cr_citation_count_plus.R b/tests/testthat/test-cr_citation_count_plus.R new file mode 100644 index 0000000..454759b --- /dev/null +++ b/tests/testthat/test-cr_citation_count_plus.R @@ -0,0 +1,280 @@ +context("testing cr_citation_count with Metadata Plus REST API") + +# --------------------------------------------------------------------------- +# Pure unit tests — no HTTP required +# --------------------------------------------------------------------------- + +test_that("get_plus_token returns NULL when env var is empty", { + withr::with_envvar(c(crossref_plus_token = ""), { + expect_null(rcrossref:::get_plus_token()) + }) +}) + +test_that("get_plus_token returns the token when env var is set", { + withr::with_envvar(c(crossref_plus_token = "my-secret-token"), { + expect_equal(rcrossref:::get_plus_token(), "my-secret-token") + }) +}) + +test_that("cr_citation_count has plus_token parameter that reads from env var", { + default_args <- formals(cr_citation_count) + expect_true("plus_token" %in% names(default_args)) + expect_true("batch_size" %in% names(default_args)) + expect_match(deparse(default_args$plus_token), "Sys.getenv") + expect_match(deparse(default_args$batch_size), "Sys.getenv") +}) + +test_that("batch splitting produces correct number of batches and sizes", { + dois <- paste0("10.1000/doi.", seq_len(7)) + batch_size <- 3L + batches <- split(dois, ceiling(seq_along(dois) / batch_size)) + expect_equal(length(batches), 3L) + expect_equal(as.integer(lengths(batches)), c(3L, 3L, 1L)) + # All original DOIs present, no duplicates or drops + expect_equal(sort(unlist(batches, use.names = FALSE)), sort(dois)) +}) + +test_that("batch splitting with batch_size >= length gives one batch", { + dois <- paste0("10.1000/doi.", seq_len(5)) + batches <- split(dois, ceiling(seq_along(dois) / 10L)) + expect_equal(length(batches), 1L) + expect_equal(length(batches[[1]]), 5L) +}) + +test_that("batch splitting with batch_size = 1 gives one batch per DOI", { + dois <- paste0("10.1000/doi.", seq_len(4)) + batches <- split(dois, ceiling(seq_along(dois) / 1L)) + expect_equal(length(batches), 4L) + expect_true(all(lengths(batches) == 1L)) +}) + +test_that("cr_cc_rest_batched guard clamps NA batch_size to 50", { + fn_src <- paste(deparse(body(rcrossref:::cr_cc_rest_batched)), collapse = " ") + expect_match(fn_src, "is.na\\(batch_size\\)") +}) + +test_that("cr_citation_count has max_retries parameter defaulting to 3", { + default_args <- formals(cr_citation_count) + expect_true("max_retries" %in% names(default_args)) + expect_equal(default_args$max_retries, 3L) +}) + +# --------------------------------------------------------------------------- +# parse_rl_interval — pure unit tests, no HTTP +# --------------------------------------------------------------------------- + +test_that("parse_rl_interval parses seconds correctly", { + expect_equal(rcrossref:::parse_rl_interval("1s"), 1) + expect_equal(rcrossref:::parse_rl_interval("60s"), 60) + expect_equal(rcrossref:::parse_rl_interval("0s"), 0) +}) + +test_that("parse_rl_interval parses milliseconds correctly", { + expect_equal(rcrossref:::parse_rl_interval("200ms"), 0.2) + expect_equal(rcrossref:::parse_rl_interval("1000ms"), 1) +}) + +test_that("parse_rl_interval defaults to 1 for NULL, NA, empty string", { + expect_equal(rcrossref:::parse_rl_interval(NULL), 1) + expect_equal(rcrossref:::parse_rl_interval(NA), 1) + expect_equal(rcrossref:::parse_rl_interval(""), 1) +}) + +test_that("parse_rl_interval defaults to 1 for unrecognised format", { + expect_equal(rcrossref:::parse_rl_interval("1m"), 1) +}) + +# --------------------------------------------------------------------------- +# extract_rl_headers — unit tests with mock response-like objects +# --------------------------------------------------------------------------- + +test_that("extract_rl_headers reads rate limit fields from response headers", { + mock_resps <- list( + list(response_headers = list( + `x-rate-limit-limit` = "50", + `x-rate-limit-interval` = "1s", + `x-concurrency-limit` = "5" + )) + ) + rl <- rcrossref:::extract_rl_headers(mock_resps) + expect_equal(rl$limit, 50L) + expect_equal(rl$interval, 1) + expect_equal(rl$concurrency, 5L) +}) + +test_that("extract_rl_headers returns NULL when headers are absent", { + mock_resps <- list(list(response_headers = list(`content-type` = "application/json"))) + expect_null(rcrossref:::extract_rl_headers(mock_resps)) +}) + +test_that("extract_rl_headers uses first response that has rate limit headers", { + mock_resps <- list( + list(response_headers = list(`content-type` = "text/plain")), + list(response_headers = list( + `x-rate-limit-limit` = "100", + `x-rate-limit-interval` = "500ms", + `x-concurrency-limit` = "10" + )) + ) + rl <- rcrossref:::extract_rl_headers(mock_resps) + expect_equal(rl$limit, 100L) + expect_equal(rl$interval, 0.5) +}) + +# --------------------------------------------------------------------------- +# HTTP tests via webmockr stubs — no cassette files needed. +# +# Note: webmockr patches crul::HttpRequest (used by AsyncVaried internally). +# If these tests fail with "Real HTTP connections are disabled" it means +# webmockr is not intercepting AsyncVaried — record cassettes inside Docker +# instead using vcr::use_cassette(..., record = "new_episodes"). +# --------------------------------------------------------------------------- + +make_rest_body <- function(doi, count) { + jsonlite::toJSON(list( + status = "ok", + `message-type` = jsonlite::unbox("work"), + `message-version` = jsonlite::unbox("1.0.0"), + message = list( + DOI = jsonlite::unbox(doi), + `is-referenced-by-count` = jsonlite::unbox(as.integer(count)) + ) + )) +} + +test_that("cr_citation_count REST path: single DOI returns correct data.frame", { + webmockr::enable("crul") + on.exit({ + webmockr::stub_registry_clear() + webmockr::disable("crul") + }) + + webmockr::stub_request("get", uri_regex = "api\\.crossref\\.org/works") %>% + webmockr::to_return( + body = make_rest_body("10.1371/journal.pone.0042793", 42L), + status = 200L, + headers = list(`content-type` = "application/json;charset=UTF-8") + ) + + result <- cr_citation_count( + doi = "10.1371/journal.pone.0042793", + plus_token = "test-token" + ) + + expect_is(result, "data.frame") + expect_named(result, c("doi", "count")) + expect_equal(nrow(result), 1L) + expect_is(result$doi, "character") + expect_is(result$count, "numeric") + expect_equal(result$count, 42) +}) + +test_that("cr_citation_count REST path: multiple DOIs return one row each", { + webmockr::enable("crul") + on.exit({ + webmockr::stub_registry_clear() + webmockr::disable("crul") + }) + + dois <- c("10.1371/journal.pone.0042793", "10.1016/j.fbr.2012.01.001") + # Single stub matching any Crossref works URL; count value isn't checked here + webmockr::stub_request("get", uri_regex = "api\\.crossref\\.org/works") %>% + webmockr::to_return( + body = make_rest_body(dois[[1]], 42L), + status = 200L, + headers = list(`content-type` = "application/json;charset=UTF-8") + ) + + result <- cr_citation_count(doi = dois, plus_token = "test-token") + + expect_is(result, "data.frame") + expect_equal(nrow(result), 2L) + expect_named(result, c("doi", "count")) + # doi column uses actual DOIs, not whatever the JSON response says + expect_equal(sort(result$doi), sort(dois)) +}) + +test_that("cr_citation_count REST path: non-retryable 4xx returns NA with HTTP warning", { + webmockr::enable("crul") + on.exit({ + webmockr::stub_registry_clear() + webmockr::disable("crul") + }) + + webmockr::stub_request("get", uri_regex = "api\\.crossref\\.org/works") %>% + webmockr::to_return( + body = "Resource not found.", + status = 404L, + headers = list(`content-type` = "text/plain") + ) + + expect_warning( + result <- cr_citation_count(doi = "10.9999/not.a.doi", plus_token = "test-token"), + regexp = "HTTP 404" + ) + expect_equal(nrow(result), 1L) + expect_true(is.na(result$count)) +}) + +test_that("cr_citation_count REST path: 429 with max_retries=0 returns NA with warning", { + webmockr::enable("crul") + on.exit({ + webmockr::stub_registry_clear() + webmockr::disable("crul") + }) + + webmockr::stub_request("get", uri_regex = "api\\.crossref\\.org/works") %>% + webmockr::to_return( + body = "", + status = 429L, + headers = list(`content-type` = "text/plain") + ) + + expect_warning( + result <- cr_citation_count( + doi = "10.1371/journal.pone.0042793", + plus_token = "test-token", + max_retries = 0L + ), + regexp = "Max retries exceeded" + ) + expect_equal(nrow(result), 1L) + expect_true(is.na(result$count)) +}) + +test_that("cr_citation_count REST path: batching collects all DOIs across rounds", { + webmockr::enable("crul") + on.exit({ + webmockr::stub_registry_clear() + webmockr::disable("crul") + }) + + n <- 5L + dois <- paste0("10.1000/doi.", seq_len(n)) + webmockr::stub_request("get", uri_regex = "api\\.crossref\\.org/works") %>% + webmockr::to_return( + body = make_rest_body(dois[[1]], 1L), + status = 200L, + headers = list(`content-type` = "application/json;charset=UTF-8") + ) + + # batch_size = 2 → ceil(5/2) = 3 rounds; all 5 DOIs should appear in result + result <- cr_citation_count(doi = dois, plus_token = "test-token", + batch_size = 2L) + + expect_equal(nrow(result), n) + expect_equal(sort(result$doi), sort(dois)) +}) + +test_that("cr_citation_count does NOT use REST path when plus_token is empty", { + # Confirm dispatch logic: when plus_token is "" the REST branch is skipped. + # We check this structurally rather than via HTTP, since making the legacy + # call would require the OpenURL endpoint. + withr::with_envvar(c(crossref_plus_token = ""), { + fn_src <- paste(deparse(body(cr_citation_count)), collapse = " ") + # The Plus branch is guarded by nchar(plus_token) > 0 + expect_match(fn_src, "nchar\\(plus_token\\).*>.*0") + # When that guard is FALSE, cr_cc_async or cr_cc is reached + expect_match(fn_src, "cr_cc_async|cr_cc\\b") + }) +})