In this report, we extract information about published JOSS papers
and generate
graphics as well as a summary table that can be downloaded and used for
further analyses.
suppressPackageStartupMessages({
library(tibble)
library(rcrossref)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(gh)
library(purrr)
library(jsonlite)
library(DT)
library(plotly)
library(citecorp)
library(readr)
library(rworldmap)
library(gt)
library(stringr)
library(openalexR)
})
## Keep track of the source of each column
source_track <- c()
## Determine whether to add a caption with today's date to the (non-interactive) plots
add_date_caption <- TRUE
## dcap is used as the plot caption: today's date if requested, else ""
dcap <- if (add_date_caption) lubridate::today() else ""
## Get list of countries and populations (2022) from the rworldmap/gt packages
## countrySynonyms (rworldmap) has one row per country (ISO3 code) with
## several alternative-name columns; pivot to long form so we get one row
## per (ISO3, name) pair, dropping empty name slots.
data("countrySynonyms")
country_names <- countrySynonyms |>
  select(-ID) |>
  pivot_longer(names_to = "tmp", values_to = "name", -ISO3) |>
  filter(name != "") |>
  select(-tmp)
## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL),
## distributed via the gt R package
## Keep only the 2022 population estimates
country_populations <- countrypops |>
  filter(year == 2022)
## Read archived version of summary data frame, to use for filling in
## information about software repositories (due to limit on API requests)
## Sort by the date when software repo info was last obtained; rows with
## no recorded date sort first so they are refreshed first.
papers_archive <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))) %>%
  dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained)
## Similarly for citation analysis, to avoid having to pull down the
## same information multiple times
## All columns are read as character to avoid column-type guessing
citations_archive <- readr::read_delim(
  url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
  col_types = cols(.default = "c"), col_names = TRUE,
  delim = "\t")
We get the information about published JOSS papers from Crossref,
using the rcrossref R package. The openalexR R
package is used to extract citation counts from OpenAlex.
## First check how many records there are in Crossref
issn <- "2475-9066"  # JOSS ISSN
joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>%
  pluck("data")
(total_dois <- joss_details$total_dois)
## [1] 3457
## Pull down all records from Crossref
## (cursor_max set generously, to twice the reported number of DOIs)
papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*",
                                 cursor_max = joss_details$total_dois * 2) %>%
  pluck("data")
## Only keep articles
papers <- papers %>%
  dplyr::filter(type == "journal-article")
dim(papers)
## [1] 3457 28
dim(papers %>% distinct())
## [1] 3457 28
## Check that all papers were pulled down and stop otherwise
if (!(nrow(papers %>% distinct()) >= total_dois)) {
  stop("Not all papers were pulled down from Crossref!")
}
## A few papers don't have alternative.ids - generate them from the DOI
noaltid <- which(is.na(papers$alternative.id))
papers$alternative.id[noaltid] <- papers$doi[noaltid]
## Get citation info from Crossref and merge with paper details
## (disabled; citation counts are obtained from OpenAlex below instead)
# cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
# papers <- papers %>% dplyr::left_join(
#   cit %>% dplyr::rename(citation_count = count),
#   by = c("alternative.id" = "doi")
# )
## Remove one duplicated paper
papers <- papers %>% dplyr::filter(alternative.id != "10.21105/joss.00688")
dim(papers)
## [1] 3456 28
dim(papers %>% distinct())
## [1] 3456 28
papers$alternative.id[duplicated(papers$alternative.id)]
## character(0)
## Record that all columns so far were obtained from Crossref
source_track <- c(source_track,
                  structure(rep("crossref", ncol(papers)),
                            names = colnames(papers)))
## Get info from openalexR and merge with paper details
## Helper function to extract countries from affiliations. Note that this
## information is not available for all papers.
## `df` is one paper's 'authorships' entry as returned by openalexR
## (assumed: a data frame with one row per author, with an `id` column
## - presumably the author id - and a nested `affiliations` data frame
## containing a country_code column; TODO confirm against the openalexR
## version in use).
## Returns a single string of unique country codes separated by ";",
## or "" when no affiliation/country information is available.
## wh = "first": keep only each author's first affiliation row (rows with
## a duplicated `id` are dropped). NOTE(review): an author whose first
## affiliation row has an NA country code contributes nothing in this
## mode. Any other `wh` value keeps all affiliations.
.get_countries <- function(df, wh = "first") {
  if ((length(df) == 1 && is.na(df)) || is.null(df$affiliations)) {
    ## No affiliation information for this paper
    ""
  } else {
    if (wh == "first") {
      ## Only first affiliation for each author
      tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |>
        dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |>
        pull(affiliations_country_code)
    } else {
      ## All affiliations
      tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |>
        dplyr::filter(!is.na(affiliations_country_code)) |>
        pull(affiliations_country_code)
    }
    if (length(tmp) > 0) {
      ## Collapse the unique country codes into one ";"-separated string
      tmp |>
        unique() |>
        paste(collapse = ";")
    } else {
      ""
    }
  }
}
## Fetch all JOSS works from OpenAlex (s4210214273 is the source id used
## here for JOSS) and derive affiliation-country columns per paper
oa <- oa_fetch(entity = "works",
               primary_location.source.id = "s4210214273") |>
  mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"),
         affil_countries_first = vapply(authorships, .get_countries, "", wh = "first"))
dim(oa)
## [1] 3456 45
length(unique(oa$doi))
## [1] 3455
## Merge OpenAlex citation counts and countries into 'papers', matching
## on the bare DOI (OpenAlex stores DOIs as full https://doi.org/ URLs)
papers <- papers %>% dplyr::left_join(
  oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>%
    dplyr::select(alternative.id, cited_by_count, id,
                  affil_countries_all, affil_countries_first) %>%
    dplyr::rename(citation_count = cited_by_count,
                  openalex_id = id),
  by = "alternative.id"
)
dim(papers)
## [1] 3457 32
dim(papers %>% distinct())
## [1] 3457 32
## Record that the newly added columns come from OpenAlex
source_track <- c(source_track,
                  structure(rep("OpenAlex", length(setdiff(colnames(papers),
                                                           names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))
For each published paper, we use the JOSS API to get information about pre-review and review issue numbers, corresponding software repository etc.
## Page through the JOSS API's published-papers endpoint, accumulating
## raw JSON records until a page is empty or identical to the previous
## one. NOTE(review): the identical-page check presumably guards against
## the API repeating the final page for out-of-range page numbers -
## confirm against the API behavior.
joss_api <- list()
p <- 1
a0 <- NULL  # previous page, used to detect a repeated (final) page
a <- jsonlite::fromJSON(
  url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
  simplifyDataFrame = FALSE
)
while (length(a) > 0 && !identical(a, a0)) {
  joss_api <- c(joss_api, a)
  p <- p + 1
  a0 <- a
  a <- tryCatch({
    jsonlite::fromJSON(
      url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
      simplifyDataFrame = FALSE
    )},
    ## A failed request terminates the loop (length-0 result)
    error = function(e) return(numeric(0))
  )
}
## Flatten the list of JSON records into a data frame, one row per paper
joss_api <- do.call(dplyr::bind_rows, lapply(joss_api, function(w) {
  data.frame(api_title = w$title,
             api_state = w$state,
             author_affiliations = paste(unique(unlist(lapply(w$authors, "[[", "affiliation"))), collapse = ";"),
             editor = paste(w$editor, collapse = ","),
             reviewers = paste(w$reviewers, collapse = ","),
             nbr_reviewers = length(w$reviewers),
             repo_url = w$software_repository,
             ## Keep only the issue number from the review issue URL
             review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/",
                                   "", w$paper_review),
             doi = w$doi,
             ## Not all records have a pre-review (meta review) issue
             prereview_issue_id = ifelse(!is.null(w$meta_review_issue_id),
                                         w$meta_review_issue_id, NA_integer_),
             languages = gsub(", ", ",", w$languages),
             archive_doi = w$software_archive)
}))
dim(joss_api)
## [1] 3457 12
dim(joss_api %>% distinct())
## [1] 3457 12
## Check that all papers were pulled down and stop otherwise
if (!(nrow(joss_api %>% distinct()) >= total_dois)) {
  stop("Not all papers were pulled down from the JOSS API!")
}
## Some software repositories are associated with more than one paper
joss_api$repo_url[duplicated(joss_api$repo_url)]
## [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [2] "https://github.com/nomad-coe/greenX"
## [3] "https://github.com/idaholab/moose"
## [4] "https://gitlab.com/libreumg/dataquier.git"
## [5] "https://github.com/idaholab/moose"
## [6] "https://github.com/dynamicslab/pysindy"
## [7] "https://github.com/landlab/landlab"
## [8] "https://github.com/landlab/landlab"
## [9] "https://github.com/symmy596/SurfinPy"
## [10] "https://github.com/arviz-devs/arviz"
## [11] "https://github.com/bcgov/ssdtools"
## [12] "https://github.com/landlab/landlab"
## [13] "https://github.com/pvlib/pvlib-python"
## [14] "https://github.com/mlpack/mlpack"
## [15] "https://github.com/julia-wrobel/registr"
## [16] "https://github.com/barbagroup/pygbe"
## Merge the JOSS API information into 'papers' via the DOI
papers <- papers %>% dplyr::left_join(joss_api, by = c("alternative.id" = "doi"))
dim(papers)
## [1] 3457 43
dim(papers %>% distinct())
## [1] 3457 43
papers$repo_url[duplicated(papers$repo_url)]
## [1] "https://github.com/mlpack/mlpack"
## [2] "https://github.com/QTC-UMD/rydiqule"
## [3] "https://github.com/nomad-coe/greenX"
## [4] "https://github.com/bcgov/ssdtools"
## [5] "https://github.com/barbagroup/pygbe"
## [6] "https://github.com/dynamicslab/pysindy"
## [7] "https://github.com/landlab/landlab"
## [8] "https://github.com/idaholab/moose"
## [9] "https://github.com/idaholab/moose"
## [10] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [11] "https://github.com/landlab/landlab"
## [12] "https://github.com/arviz-devs/arviz"
## [13] "https://github.com/symmy596/SurfinPy"
## [14] "https://github.com/julia-wrobel/registr"
## [15] "https://github.com/pvlib/pvlib-python"
## [16] "https://github.com/landlab/landlab"
## [17] "https://gitlab.com/libreumg/dataquier.git"
## Record that the newly added columns come from the JOSS API
source_track <- c(source_track,
                  structure(rep("JOSS_API", length(setdiff(colnames(papers),
                                                           names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))
From each pre-review and review issue, we extract information about review times and assigned labels.
## Pull down info on all issues in the joss-reviews repository
## (open and closed; .limit set high enough to cover all issues)
issues <- gh("/repos/openjournals/joss-reviews/issues",
             .limit = 15000, state = "all")
## From each issue, extract required information
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
  data.frame(title = i$title,
             number = i$number,
             state = i$state,
             opened = i$created_at,
             ## Open issues have no closed_at timestamp
             closed = ifelse(!is.null(i$closed_at),
                             i$closed_at, NA_character_),
             ncomments = i$comments,
             ## Comma-separated labels, excluding generic workflow labels
             labels = paste(setdiff(
               vapply(i$labels, getElement,
                      name = "name", character(1L)),
               c("review", "pre-review", "query-scope", "paused")),
               collapse = ","))
}))
## Split into REVIEW, PRE-REVIEW, and other issues (the latter category
## is discarded)
issother <- iss %>% dplyr::filter(!grepl("\\[PRE REVIEW\\]", title) &
                                    !grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 198 7
head(issother)
## title
## 1 Update wording of collaborative effort reviewer checklist item
## 2 Create Web App
## 3 [JOSS] zoom-lod-engine: A zoom-aware level-of-detail resolver using hysteresis
## 4 Review Comments
## 5 Paper Review comments
## 6 Invalid rejection SiA-WD: An R Shiny Application for Systematic Evaluation of Wearables in Behavioural and Stress Research
## number state opened closed ncomments labels
## 1 10126 closed 2026-02-27T15:51:09Z 2026-03-01T08:26:48Z 2
## 2 10121 closed 2026-02-26T15:48:21Z 2026-02-26T15:48:23Z 1
## 3 9931 closed 2026-02-04T10:22:06Z 2026-02-04T10:22:09Z 1
## 4 9927 closed 2026-02-02T16:49:54Z 2026-02-02T16:49:57Z 1
## 5 9920 closed 2026-01-30T07:05:08Z 2026-01-30T07:05:10Z 1
## 6 9911 closed 2026-01-28T12:00:02Z 2026-01-28T12:00:05Z 1
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the zero-padding needed to left-pad `s` (an issue number) to
## five digits, since JOSS DOIs have the form 10.21105/joss.NNNNN.
## Returns "" for numbers with five or more digits. (The previous
## rep()-based version raised an error for numbers longer than five
## digits, e.g. a future issue 100000.)
getnbrzeros <- function(s) {
  strrep("0", max(0L, 5L - nchar(s)))
}
## For REVIEW issues: derive the paper DOI (10.21105/joss.NNNNN, zero-
## padded to five digits) from the issue number, strip the "[REVIEW]: "
## prefix from the title, and prefix all other columns with "review_"
issrev <- iss %>% dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
  dplyr::mutate(nbrzeros = purrr::map_chr(number, getnbrzeros)) %>%
  dplyr::mutate(alternative.id = paste0("10.21105/joss.",
                                        nbrzeros,
                                        number)) %>%
  dplyr::select(-nbrzeros) %>%
  dplyr::mutate(title = gsub("\\[REVIEW\\]: ", "", title)) %>%
  dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .))
## For pre-review and review issues, respectively, get the number of
## issues closed each month, and the number of those that have the
## 'rejected' label
## Helper: for issues whose title matches `pattern`, count the issues
## closed in each calendar month and how many of those carry the
## 'rejected' label; `type_label` is stored in the 'itype' column.
## (Replaces two copy-pasted pipelines that differed only in the title
## pattern and label.)
.count_rejections <- function(issdf, pattern, type_label) {
  issdf %>%
    dplyr::filter(grepl(pattern, title)) %>%
    dplyr::filter(!is.na(closed)) %>%
    dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>%
    dplyr::group_by(closedmonth) %>%
    dplyr::summarize(nbr_issues_closed = length(labels),
                     nbr_rejections = sum(grepl("rejected", labels))) %>%
    dplyr::mutate(itype = type_label)
}
review_rejected <- .count_rejections(iss, "\\[REVIEW\\]", "review")
prereview_rejected <- .count_rejections(iss, "\\[PRE REVIEW\\]", "pre-review")
all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected)
## Get only pre-review issues plus review issues opened before 2016-09-18,
## will use these as a proxy for the number of submissions
## (the earliest submissions have no associated pre-review issue, so the
## early review issues stand in for them)
pi1 <- iss |>
  dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) |>
  dplyr::mutate(opened = as.Date(opened))
dim(pi1)
## [1] 6019 7
pi2 <- iss |>
  dplyr::filter(grepl("\\[REVIEW\\]", title)) |>
  dplyr::mutate(opened = as.Date(opened)) |>
  dplyr::filter(opened <= as.Date("2016-09-18"))
dim(pi2)
## [1] 49 7
prereview_issues <- dplyr::bind_rows(pi1, pi2)
## For PRE-REVIEW issues, add information about the corresponding REVIEW
## issue number
## Exclude withdrawn and rejected submissions
isspre <- iss %>% dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>%
  dplyr::filter(!grepl("withdrawn", labels)) %>%
  dplyr::filter(!grepl("rejected", labels))
## Some titles have multiple pre-review issues. In these cases, keep the latest
## (highest issue number); then prefix all columns with "prerev_"
isspre <- isspre %>% dplyr::arrange(desc(number)) %>%
  dplyr::filter(!duplicated(title)) %>%
  dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>%
  dplyr::rename_all(~ paste0("prerev_", .))
## Join the review/pre-review issue info into 'papers' and compute the
## number of days spent in each stage
papers <- papers %>% dplyr::left_join(issrev, by = "alternative.id") %>%
  dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
  dplyr::mutate(prerev_opened = as.Date(prerev_opened),
                prerev_closed = as.Date(prerev_closed),
                review_opened = as.Date(review_opened),
                review_closed = as.Date(review_closed)) %>%
  dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
                days_in_rev = review_closed - review_opened,
                to_review = !is.na(review_opened))
dim(papers)
## [1] 3457 59
dim(papers %>% distinct())
## [1] 3457 59
## Record that the newly added columns come from the joss-reviews repo
source_track <- c(source_track,
                  structure(rep("joss-github", length(setdiff(colnames(papers),
                                                              names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))
## Reorder so that software repositories that were interrogated longest
## ago are checked first
## (papers_archive is sorted by repo_info_obtained above; papers absent
## from the archive get NA match positions and sort first)
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
                  na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
software_urls[duplicated(software_urls)]
## [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [2] "https://gitlab.com/libreumg/dataquier.git"
## [3] "https://github.com/QTC-UMD/rydiqule"
## [4] "https://github.com/mlpack/mlpack"
## [5] "https://github.com/nomad-coe/greenX"
## [6] "https://github.com/bcgov/ssdtools"
## [7] "https://github.com/barbagroup/pygbe"
## [8] "https://github.com/dynamicslab/pysindy"
## [9] "https://github.com/landlab/landlab"
## [10] "https://github.com/idaholab/moose"
## [11] "https://github.com/idaholab/moose"
## [12] "https://github.com/arviz-devs/arviz"
## [13] "https://github.com/landlab/landlab"
## [14] "https://github.com/symmy596/SurfinPy"
## [15] "https://github.com/julia-wrobel/registr"
## [16] "https://github.com/pvlib/pvlib-python"
## [17] "https://github.com/landlab/landlab"
## Only GitHub-hosted repositories can be queried via the GitHub API below
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 3457
sum(is_github)
## [1] 3277
software_urls[!is_github]
## [1] "https://gitlab.kuleuven.be/ITSCreaLab/public-toolboxes/dyntapy"
## [2] "https://gitlab.com/morikawa-lab-osakau/vibir-parallel-compute"
## [3] "https://gitlab.com/ENKI-portal/ThermoCodegen"
## [4] "https://bitbucket.org/orionmhdteam/orion2_release1/src/master/"
## [5] "https://gitlab.com/oali/dxtr"
## [6] "https://gitlab.dune-project.org/copasi/dune-copasi"
## [7] "https://gitlab.com/bonsamurais/bonsai/util/ipcc"
## [8] "https://codebase.helmholtz.cloud/mussel/netlogo-northsea-species.git"
## [9] "https://gitlab.com/ffaucher/hawen"
## [10] "https://gitlab.com/cmbm-ethz/miop"
## [11] "https://gitlab.com/cosmograil/starred"
## [12] "https://gitlab.com/emd-dev/emd"
## [13] "https://gite.lirmm.fr/doccy/RedOak"
## [14] "https://bitbucket.org/rram/dvrlib/src/joss/"
## [15] "https://gitlab.com/mantik-ai/mantik"
## [16] "https://gitlab.com/djsmithbham/cnearest"
## [17] "https://gitlab.com/sails-dev/sails"
## [18] "https://gitlab.kitware.com/LBM/lattice-boltzmann-solver"
## [19] "https://gitlab.com/dsbowen/conditional-inference"
## [20] "https://gitlab.com/soleil-data-treatment/soleil-software-projects/remote-desktop"
## [21] "https://bitbucket.org/berkeleylab/esdr-pygdh/"
## [22] "https://code.europa.eu/kada/mafw"
## [23] "https://gitlab.com/drti/basic-tools"
## [24] "https://gitlab.com/moorepants/skijumpdesign"
## [25] "https://bitbucket.org/ocellarisproject/ocellaris"
## [26] "https://git.iws.uni-stuttgart.de/tools/frackit"
## [27] "https://gitlab.com/cmbm-ethz/pourbaix-diagrams"
## [28] "https://bitbucket.org/cloopsy/android/"
## [29] "https://gitlab.com/pythia-uq/pythia"
## [30] "https://gitlab.com/fduchate/predihood"
## [31] "https://jugit.fz-juelich.de/compflu/swalbe.jl/"
## [32] "https://gitlab.dune-project.org/dorie/dorie"
## [33] "https://gitlab.com/micromorph/ratel"
## [34] "https://gitlab.com/dmt-development/dmt-core"
## [35] "https://gitlab.com/wpettersson/kep_solver"
## [36] "https://gitlab.com/myqueue/myqueue"
## [37] "https://gitlab.com/dlr-ve/esy/remix/framework"
## [38] "https://gitlab.com/utopia-project/dantro"
## [39] "https://gitlab.com/gdetor/genetic_alg"
## [40] "https://framagit.org/GustaveCoste/off-product-environmental-impact/"
## [41] "https://gitlab.com/InspectorCell/inspectorcell"
## [42] "https://plmlab.math.cnrs.fr/lmrs/statistique/smmR"
## [43] "https://gitlab.com/dlr-dw/ontocode"
## [44] "https://gitlab.com/dlr-ve/esy/amiris/amiris"
## [45] "https://bitbucket.org/glotzer/rowan"
## [46] "https://gitlab.com/fame-framework/fame-io"
## [47] "https://code.usgs.gov/umesc/quant-ecology/fishstan/"
## [48] "https://gitlab.com/fame-framework/fame-core"
## [49] "https://gitlab.com/thartwig/asloth"
## [50] "https://gitlab.com/habermann_lab/phasik"
## [51] "https://gitlab.com/dlr-ve/autumn/"
## [52] "https://gitlab.com/ags-data-format-wg/ags-python-library"
## [53] "https://zivgitlab.uni-muenster.de/ag-salinga/fastatomstruct"
## [54] "https://gitlab.com/tue-umphy/software/parmesan"
## [55] "https://gitlab.com/celliern/scikit-fdiff/"
## [56] "https://gitlab.com/datafold-dev/datafold/"
## [57] "https://gitlab.com/tesch1/cppduals"
## [58] "https://gitlab.com/open-darts/open-darts"
## [59] "https://gitlab.com/materials-modeling/calorine"
## [60] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [61] "https://gitlab.mpikg.mpg.de/curcuraci/bmiptools"
## [62] "https://gitlab.com/grogra/groimp-plugins/api"
## [63] "https://gitlab.inria.fr/miet/miet"
## [64] "https://savannah.nongnu.org/projects/complot/"
## [65] "http://mutabit.com/repos.fossil/grafoscopio/"
## [66] "https://gitlab.com/cerfacs/batman"
## [67] "https://gitlab.com/manchester_qbi/manchester_qbi_public/madym_cxx/"
## [68] "https://gitlab.com/akantu/akantu"
## [69] "https://bitbucket.org/cardosan/brightway2-temporalis"
## [70] "https://gitlab.com/marinvaders/marinvaders"
## [71] "https://gitlab.kuleuven.be/gelenslab/publications/pycline"
## [72] "https://gitlab.gwdg.de/mpievolbio-it/crbhits"
## [73] "https://bitbucket.org/bmskinner/nuclear_morphology"
## [74] "https://git.rwth-aachen.de/ants/sensorlab/imea"
## [75] "https://bitbucket.org/sciencecapsule/sciencecapsule"
## [76] "https://gitlab.com/bioeconomy/forobs/biotrade/"
## [77] "https://gitlab.com/lheea/CN-AeroModels"
## [78] "https://gitlab.ruhr-uni-bochum.de/ee/cd2es"
## [79] "https://www.idpoisson.fr/fullswof/"
## [80] "https://gitlab.com/uniluxembourg/hpc/research/cadom/serializable-simpy"
## [81] "https://codeberg.org/benmagill/deflake.rs"
## [82] "https://gitlab.ifremer.fr/resourcecode/resourcecode"
## [83] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"
## [84] "https://bitbucket.org/dolfin-adjoint/pyadjoint"
## [85] "https://gitlab.com/LMSAL_HUB/aia_hub/aiapy"
## [86] "https://gitlab.eudat.eu/coccon-kit/proffastpylot"
## [87] "https://git.mpib-berlin.mpg.de/castellum/castellum"
## [88] "https://gitlab.com/programgreg/tagginglatencyestimator"
## [89] "https://gitlab.com/gims-developers/gims"
## [90] "https://gitlab.awi.de/sicopolis/sicopolis"
## [91] "https://forgemia.inra.fr/migale/easy16s"
## [92] "https://gitlab.com/tamaas/tamaas"
## [93] "https://gitlab.com/ampere2/metalwalls"
## [94] "https://gitlab.pasteur.fr/vlegrand/ROCK"
## [95] "https://git.ligo.org/asimov/asimov"
## [96] "https://bitbucket.org/berkeleylab/hardware-control/src/main/"
## [97] "https://gitlab.inria.fr/bcoye/game-engine-scheduling-simulation"
## [98] "https://gitlab.com/petsc/petsc"
## [99] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"
## [100] "https://gitlab.com/jason-rumengan/pyarma"
## [101] "https://gitlab.com/culturalcartography/text2map"
## [102] "https://bitbucket.org/clhaley/Multitaper.jl"
## [103] "https://gitlab.com/project-dare/dare-platform"
## [104] "https://gitlab.com/sissopp_developers/sissopp"
## [105] "https://bitbucket.org/sbarbot/motorcycle/src/master/"
## [106] "https://earth.bsc.es/gitlab/wuruchi/autosubmitreact"
## [107] "https://gitlab.com/remram44/taguette"
## [108] "https://forgemia.inra.fr/pherosensor/pherosensor-toolbox"
## [109] "https://bitbucket.org/mpi4py/mpi4py-fft"
## [110] "https://gitlab.com/sigcorr/sigcorr"
## [111] "https://git.geomar.de/digital-earth/dasf/dasf-messaging-python"
## [112] "https://gitlab.inria.fr/mosaic/bvpy"
## [113] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"
## [114] "https://bitbucket.org/dghoshal/frieda"
## [115] "https://gitlab.ruhr-uni-bochum.de/reichp2y/proppy"
## [116] "https://gitlab.com/grogra/groimp-plugins/Pointcloud"
## [117] "https://gitlab.com/permafrostnet/teaspoon"
## [118] "https://gitlab.com/free-astro/siril"
## [119] "https://doi.org/10.17605/OSF.IO/3DS6A"
## [120] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [121] "https://gitlab.com/costrouc/pysrim"
## [122] "https://gitlab.com/ComputationalScience/idinn"
## [123] "https://gitlab.com/jtagusari/hrisk-noisemodelling"
## [124] "https://gitlab.com/moerman1/fhi-cc4s"
## [125] "https://gitlab.com/pyFBS/pyFBS"
## [126] "https://codeberg.org/cepsInria/ceps"
## [127] "https://gitlab.com/fibreglass/pivc"
## [128] "https://bitbucket.org/manuela_s/hcp/"
## [129] "https://gitlab.com/ProjectRHEA/flowsolverrhea"
## [130] "https://gitlab.com/libreumg/dataquier.git"
## [131] "https://gitlab.ethz.ch/holukas/dyco-dynamic-lag-compensation"
## [132] "https://gitlab.inria.fr/melissa/melissa"
## [133] "https://gitlab.com/cosapp/cosapp"
## [134] "https://gitlab.com/dlr-ve/esy/vencopy/vencopy"
## [135] "https://gitlab.com/jesseds/apav"
## [136] "https://gitlab.com/qc-devs/aqcnes"
## [137] "https://gitlab.com/vibes-developers/vibes"
## [138] "https://gitlab.uliege.be/smart_grids/public/gboml"
## [139] "https://gricad-gitlab.univ-grenoble-alpes.fr/deformvis/insarviz"
## [140] "https://gitlab.com/eidheim/Simple-Web-Server"
## [141] "https://bitbucket.org/basicsums/basicsums"
## [142] "https://framagit.org/GustaveCoste/eldam"
## [143] "https://gitlab.com/cracklet/cracklet.git"
## [144] "https://gitlab.com/cosmology-ethz/galsbi"
## [145] "https://git.ufz.de/despot/pysewer/"
## [146] "https://codebase.helmholtz.cloud/taimur.khan/DeepTrees"
## [147] "https://gitlab.com/materials-modeling/wulffpack"
## [148] "https://gitlab.com/EliseLei/easychem"
## [149] "https://gitlab.eclipse.org/eclipse/comma/comma"
## [150] "https://codeberg.org/JPHackstein/GREOPy"
## [151] "https://gitlab.com/robizzard/libcdict"
## [152] "https://bitbucket.org/robmoss/particle-filter-for-python/"
## [153] "https://bitbucket.org/mituq/muq2.git"
## [154] "https://gitlab.com/energyincities/besos/"
## [155] "https://gitlab.com/mmartin-lagarde/exonoodle-exoplanets/-/tree/master/"
## [156] "https://gitlab.com/utopia-project/utopia"
## [157] "https://gitlab.inria.fr/bramas/tbfmm"
## [158] "https://bitbucket.org/meg/cbcbeat"
## [159] "https://bitbucket.org/hammurabicode/hamx"
## [160] "https://gitlab.com/davidwoodburn/itrm"
## [161] "https://gitlab.com/tum-ciip/elsa"
## [162] "https://gitlab.com/cosmology-ethz/ufig"
## [163] "https://gitlab.com/picos-api/picos"
## [164] "https://gitlab.com/binary_c/binary_c-python/"
## [165] "https://bitbucket.org/cdegroot/wediff"
## [166] "https://gitlab.com/QComms/cqptoolkit"
## [167] "https://gitlab.com/toposens/public/ros-packages"
## [168] "https://gitlab.inria.fr/azais/treex"
## [169] "https://gitlab.com/pvst/asi"
## [170] "https://gitlab.com/chaver/choco-mining"
## [171] "https://gitlab.com/cosmograil/PyCS3"
## [172] "https://gitlab.com/MartinBeseda/sa-oo-vqe-qiskit.git"
## [173] "https://gitlab.com/davidtourigny/dynamic-fba"
## [174] "https://bitbucket.org/likask/mofem-cephas"
## [175] "https://bitbucket.org/cmutel/brightway2"
## [176] "https://gitlab.com/geekysquirrel/bigx"
## [177] "https://gitlab.com/dglaeser/fieldcompare"
## [178] "https://gitlab.com/dlr-ve/esy/sfctools/framework/"
## [179] "https://gitlab.com/davidwoodburn/r3f"
## [180] "https://gitlab.com/libreumg/dataquier.git"
## Query the GitHub API for each unique GitHub-hosted repository and
## assemble repository metadata (dates, stars, languages, topics,
## license, contributor counts) into one row per repository
df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) {
  ## Normalize the URL: https scheme, no trailing slash or .git suffix
  u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
  ## Strip /tree/... and /blob/... components pointing inside the repo
  if (grepl("/tree/", u0)) {
    u0 <- strsplit(u0, "/tree/")[[1]][1]
  }
  if (grepl("/blob/", u0)) {
    u0 <- strsplit(u0, "/blob/")[[1]][1]
  }
  ## The four API queries below may each fail (e.g. for moved or deleted
  ## repositories); failures are tolerated and handled individually
  info <- try({
    gh(gsub("(https://)?(www.)?github.com/", "/repos/", u0))
  })
  languages <- try({
    gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/languages"),
       .limit = 500)
  })
  topics <- try({
    gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/topics"),
       .accept = "application/vnd.github.mercy-preview+json", .limit = 500)
  })
  contribs <- try({
    gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/contributors"),
       .limit = 500)
  })
  if (!is(info, "try-error") && length(info) > 1) {
    ## Contributor counts: overall, and with at least two contributions
    if (!is(contribs, "try-error")) {
      if (length(contribs) == 0) {
        repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
      } else {
        repo_nbr_contribs <- length(contribs)
        repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_))
        if (is.na(repo_nbr_contribs_2ormore)) {
          ## Debug output for unexpected NA contribution counts
          print(contribs)
        }
      }
    } else {
      repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
    }
    ## Language byte counts, formatted as "lang1:bytes1,lang2:bytes2,..."
    if (!is(languages, "try-error")) {
      if (length(languages) == 0) {
        repolang <- ""
      } else {
        repolang <- paste(paste(names(unlist(languages)),
                                unlist(languages), sep = ":"), collapse = ",")
      }
    } else {
      repolang <- ""
    }
    ## Comma-separated repository topics
    if (!is(topics, "try-error")) {
      if (length(topics$names) == 0) {
        repotopics <- ""
      } else {
        repotopics <- paste(unlist(topics$names), collapse = ",")
      }
    } else {
      repotopics <- ""
    }
    ## One-row data frame keyed by the ORIGINAL (un-normalized) URL `u`,
    ## so it can be joined back to papers$repo_url
    data.frame(repo_url = u,
               repo_created = info$created_at,
               repo_updated = info$updated_at,
               repo_pushed = info$pushed_at,
               repo_nbr_stars = info$stargazers_count,
               repo_language = ifelse(!is.null(info$language),
                                      info$language, NA_character_),
               repo_languages_bytes = repolang,
               repo_topics = repotopics,
               repo_license = ifelse(!is.null(info$license),
                                     info$license$key, NA_character_),
               repo_nbr_contribs = repo_nbr_contribs,
               repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
    )
  } else {
    ## Main repository query failed - skip this repository
    NULL
  }
})) %>%
  dplyr::mutate(repo_created = as.Date(repo_created),
                repo_updated = as.Date(repo_updated),
                repo_pushed = as.Date(repo_pushed)) %>%
  dplyr::distinct() %>%
  dplyr::mutate(repo_info_obtained = lubridate::today())
## Diagnostic output if any repo_url unexpectedly appears more than once
if (length(unique(df$repo_url)) != length(df$repo_url)) {
  print(length(unique(df$repo_url)))
  print(length(df$repo_url))
  print(df$repo_url[duplicated(df$repo_url)])
}
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 2864 12
## For papers not in df (i.e., for which we didn't get a valid response
## from the GitHub API query), use information from the archived data frame
## (keeping, for each repo, the most recently obtained archive record)
dfarchive <- papers_archive %>%
  dplyr::select(colnames(df)[colnames(df) %in% colnames(papers_archive)]) %>%
  dplyr::filter(!(repo_url %in% df$repo_url)) %>%
  dplyr::arrange(desc(repo_info_obtained)) %>%
  dplyr::filter(!duplicated(repo_url))
head(dfarchive)
## # A tibble: 6 × 12
## repo_url repo_created repo_updated repo_pushed repo_nbr_stars repo_language
## <chr> <date> <date> <date> <int> <chr>
## 1 https://gi… 2016-09-25 2025-10-07 2024-02-03 21 R
## 2 https://gi… 2018-12-06 2025-11-02 2023-10-05 10 C#
## 3 https://gi… 2018-10-05 2025-10-20 2021-11-17 8 JavaScript
## 4 https://gi… 2024-04-15 2026-01-26 2025-11-19 11 Jupyter Note…
## 5 https://gi… 2020-06-02 2026-02-28 2025-02-10 112 Python
## 6 https://gi… 2022-10-13 2026-03-03 2026-03-03 73 Python
## # ℹ 6 more variables: repo_languages_bytes <chr>, repo_topics <chr>,
## # repo_license <chr>, repo_nbr_contribs <int>,
## # repo_nbr_contribs_2ormore <int>, repo_info_obtained <date>
dim(dfarchive)
## [1] 576 12
## Combine fresh and archived repository info and merge into 'papers'
df <- dplyr::bind_rows(df, dfarchive)
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 3440 12
papers <- papers %>% dplyr::left_join(df, by = "repo_url")
dim(papers)
## [1] 3457 70
## Record that the newly added columns come from the software repos on GitHub
source_track <- c(source_track,
                  structure(rep("sw-github", length(setdiff(colnames(papers),
                                                            names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))
## Convert publication date to Date format
## Add information about the half year (H1, H2) of publication
## Count number of authors
## Columns reference/license/link are dropped as not needed downstream
papers <- papers %>% dplyr::select(-reference, -license, -link) %>%
  dplyr::mutate(published.date = as.Date(published.print)) %>%
  dplyr::mutate(
    halfyear = paste0(year(published.date),
                      ifelse(month(published.date) <= 6, "H1", "H2"))
  ) %>% dplyr::mutate(
    ## Factor with levels spanning every half year present, in order
    halfyear = factor(halfyear,
                      levels = paste0(rep(sort(unique(year(published.date))),
                                          each = 2), c("H1", "H2")))
  ) %>% dplyr::mutate(nbr_authors = vapply(author, function(a) nrow(a), NA_integer_))
dim(papers)
## [1] 3457 70
## Inspect (and then remove) any fully duplicated rows
dupidx <- which(papers$alternative.id %in% papers$alternative.id[duplicated(papers)])
papers[dupidx, ] %>% arrange(alternative.id) %>% head(n = 10)
## # A tibble: 0 × 70
## # ℹ 70 variables: alternative.id <chr>, container.title <chr>, created <chr>,
## # deposited <chr>, published.print <chr>, doi <chr>, indexed <chr>,
## # issn <chr>, issue <chr>, issued <chr>, member <chr>, page <chr>,
## # prefix <chr>, publisher <chr>, score <chr>, source <chr>,
## # reference.count <chr>, references.count <chr>,
## # is.referenced.by.count <chr>, title <chr>, type <chr>, url <chr>,
## # volume <chr>, short.container.title <chr>, author <list>, …
papers <- papers %>% dplyr::distinct()
dim(papers)
## [1] 3457 70
## Record that the newly added columns come from this cleanup step
source_track <- c(source_track,
                  structure(rep("cleanup", length(setdiff(colnames(papers),
                                                          names(source_track)))),
                            names = setdiff(colnames(papers), names(source_track))))
In some cases, fetching information from (e.g.) the GitHub API fails for a subset of the publications. There are also other reasons for missing values (for example, the earliest submissions do not have an associated pre-review issue). The table below lists the number of missing values for each of the variables in the data frame.
## Interactive table: number of missing values per column, together with
## the source each column was obtained from (source_track)
DT::datatable(
  data.frame(variable = colnames(papers),
             nbr_missing = colSums(is.na(papers))) %>%
    dplyr::mutate(source = source_track[variable]),
  escape = FALSE, rownames = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
## Number of published papers per month: barplot and interactive table
monthly_pubs <- papers %>%
  dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
  dplyr::group_by(pubmonth) %>%
  dplyr::summarize(npub = n())
ggplot(monthly_pubs,
       aes(x = factor(pubmonth), y = npub)) +
  geom_bar(stat = "identity") + theme_minimal() +
  labs(x = "", y = "Number of published papers per month", caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
DT::datatable(
  monthly_pubs %>%
    dplyr::rename("Number of papers" = "npub",
                  "Month of publication" = "pubmonth"),
  escape = FALSE, rownames = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
## Number of published papers per year: barplot and interactive table
yearly_pubs <- papers |>
  dplyr::mutate(pubyear = lubridate::year(published.date)) |>
  dplyr::count(pubyear, name = "npub")
ggplot(yearly_pubs, aes(x = factor(pubyear), y = npub)) +
  geom_col() +
  theme_minimal() +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = "", y = "Number of published papers per year", caption = dcap)
DT::datatable(
  yearly_pubs |>
    dplyr::rename("Number of papers" = "npub",
                  "Year of publication" = "pubyear"),
  escape = FALSE, rownames = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
We use the number of pre-review issues opened in a month (plus review issues opened before 2016-09-18, since the earliest submissions have no pre-review issue) as a proxy for the number of submissions.
## Number of pre-review issues opened per month (submission proxy):
monthly_subs <- prereview_issues |>
  dplyr::mutate(submonth = lubridate::floor_date(opened, "month")) |>
  dplyr::count(submonth, name = "nsub")
ggplot(monthly_subs, aes(x = factor(submonth), y = nsub)) +
  geom_col() +
  theme_minimal() +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = "", y = "Number of submissions per month", caption = dcap)