In this report, we extract information about published JOSS papers
and generate
graphics as well as a summary table that can be downloaded and used for
further analyses.
suppressPackageStartupMessages({
library(tibble)
library(rcrossref)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(gh)
library(purrr)
library(jsonlite)
library(DT)
library(plotly)
library(citecorp)
library(readr)
library(rworldmap)
library(gt)
library(stringr)
library(openalexR)
})
## Keep track of the source of each column
source_track <- c()
## Switch controlling whether the (non-interactive) plots are stamped with
## today's date.
add_date_caption <- TRUE
## Caption handed to labs(): the current date when the switch is on,
## otherwise an empty string.
dcap <- if (add_date_caption) lubridate::today() else ""
## Get list of countries and populations (2022) from the rworldmap/gt packages
data("countrySynonyms")
country_names <- countrySynonyms |>
select(-ID) |>
pivot_longer(names_to = "tmp", values_to = "name", -ISO3) |>
filter(name != "") |>
select(-tmp)
## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL),
## distributed via the gt R package
country_populations <- countrypops |>
filter(year == 2022)
## Read archived version of summary data frame, to use for filling in
## information about software repositories (due to limit on API requests)
## Sort by the date when software repo info was last obtained
papers_archive <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))) %>%
dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained)
## Similarly for citation analysis, to avoid having to pull down the
## same information multiple times
citations_archive <- readr::read_delim(
url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
col_types = cols(.default = "c"), col_names = TRUE,
delim = "\t")
We get the information about published JOSS papers from Crossref,
using the rcrossref
R package. The openalexR
R
package is used to extract citation counts from OpenAlex.
## First check how many records there are in Crossref
issn <- "2475-9066"
joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>%
pluck("data")
joss_details$total_dois
## [1] 2968
## Pull down all records from Crossref
papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*",
cursor_max = joss_details$total_dois * 2) %>%
pluck("data")
## Only keep articles
papers <- papers %>%
dplyr::filter(type == "journal-article")
dim(papers)
## [1] 2969 28
dim(papers %>% distinct())
## [1] 2969 28
## A few papers don't have alternative.ids - generate them from the DOI
noaltid <- which(is.na(papers$alternative.id))
papers$alternative.id[noaltid] <- papers$doi[noaltid]
## Get citation info from Crossref and merge with paper details
# cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
# papers <- papers %>% dplyr::left_join(
# cit %>% dplyr::rename(citation_count = count),
# by = c("alternative.id" = "doi")
# )
## Remove one duplicated paper
papers <- papers %>% dplyr::filter(alternative.id != "10.21105/joss.00688")
dim(papers)
## [1] 2968 28
dim(papers %>% distinct())
## [1] 2968 28
papers$alternative.id[duplicated(papers$alternative.id)]
## character(0)
source_track <- c(source_track,
structure(rep("crossref", ncol(papers)),
names = colnames(papers)))
## Get info from openalexR and merge with paper details
## Helper function to extract countries from affiliations. Note that this
## information is not available for all papers.
## Collapse the affiliation country codes of one paper's authors into a
## single string of unique codes separated by ";".
##
## df: authorship table for one paper. The code reads an `id` column and a
##     list-column `affiliations` that unnests to (among others) a column
##     `affiliations_country_code`. NULL or a single NA marks a paper
##     without authorship information.
## wh: "first" keeps only the first affiliation row of each author id; any
##     other value keeps all affiliations.
##
## Returns "" when there is no authorship table, no `affiliations` column,
## or no non-missing country code.
.get_countries <- function(df, wh = "first") {
  ## Only test for NA on atomic input: is.na() on a data frame yields a
  ## matrix, and `&&` raises an error (R >= 4.3) when that matrix has more
  ## than one element, e.g. for a one-column table with several rows.
  no_authorships <- is.null(df) ||
    (is.atomic(df) && length(df) == 1 && is.na(df))
  if (no_authorships || is.null(df$affiliations)) {
    return("")
  }

  ## One row per (author, affiliation) pair
  affils <- unnest(df, cols = c(affiliations), names_sep = "_")
  if (wh == "first") {
    ## Only first affiliation for each author
    codes <- affils |>
      dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |>
      pull(affiliations_country_code)
  } else {
    ## All affiliations
    codes <- affils |>
      dplyr::filter(!is.na(affiliations_country_code)) |>
      pull(affiliations_country_code)
  }

  if (length(codes) > 0) {
    paste(unique(codes), collapse = ";")
  } else {
    ""
  }
}
oa <- oa_fetch(entity = "works",
primary_location.source.id = "s4210214273") |>
mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"),
affil_countries_first = vapply(authorships, .get_countries, "", wh = "first"))
## Warning in oa_request(oa_query(filter = filter_i, multiple_id = multiple_id, :
## The following work(s) have truncated lists of authors: W3005984879.
## Query each work separately by its identifier to get full list of authors.
## For example:
## lapply(c("W3005984879"), \(x) oa_fetch(identifier = x))
## Details at https://docs.openalex.org/api-entities/authors/limitations.
dim(oa)
## [1] 2972 45
length(unique(oa$doi))
## [1] 2972
papers <- papers %>% dplyr::left_join(
oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>%
dplyr::select(alternative.id, cited_by_count, id,
affil_countries_all, affil_countries_first) %>%
dplyr::rename(citation_count = cited_by_count,
openalex_id = id),
by = "alternative.id"
)
dim(papers)
## [1] 2968 32
dim(papers %>% distinct())
## [1] 2968 32
source_track <- c(source_track,
structure(rep("OpenAlex", length(setdiff(colnames(papers),
names(source_track)))),
names = setdiff(colnames(papers), names(source_track))))
For each published paper, we use the JOSS API to get information about pre-review and review issue numbers, corresponding software repository etc.
## Walk through the pages of the JOSS "published papers" endpoint and
## append the records of each page to joss_api. The walk ends at the first
## empty page, at a page identical to the previous one, or when a request
## for a later page fails (a failure is treated as an empty page).
joss_api <- list()
p <- 1
a0 <- NULL
a <- jsonlite::fromJSON(
  url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
  simplifyDataFrame = FALSE
)
repeat {
  if (length(a) == 0 || identical(a, a0)) {
    break
  }
  joss_api <- c(joss_api, a)
  ## Remember the page just stored, then request the next one
  a0 <- a
  p <- p + 1
  a <- tryCatch(
    jsonlite::fromJSON(
      url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
      simplifyDataFrame = FALSE
    ),
    error = function(e) numeric(0)
  )
}
## Turn the list of per-paper API records into a data frame with one row
## per record. Editors and reviewers are collapsed into comma-separated
## strings, the review issue id is the review URL with the joss-reviews
## issue prefix removed, and a missing pre-review issue id becomes NA.
joss_api <- dplyr::bind_rows(lapply(joss_api, function(w) {
  data.frame(
    api_title = w$title,
    api_state = w$state,
    editor = paste(w$editor, collapse = ","),
    reviewers = paste(w$reviewers, collapse = ","),
    nbr_reviewers = length(w$reviewers),
    repo_url = w$software_repository,
    review_issue_id = sub(
      "https://github.com/openjournals/joss-reviews/issues/",
      "", w$paper_review
    ),
    doi = w$doi,
    ## Scalar test, so plain if/else rather than the vectorized ifelse()
    prereview_issue_id = if (is.null(w$meta_review_issue_id)) {
      NA_integer_
    } else {
      w$meta_review_issue_id
    },
    languages = gsub(", ", ",", w$languages),
    archive_doi = w$software_archive
  )
}))
dim(joss_api)
## [1] 2969 11
dim(joss_api %>% distinct())
## [1] 2969 11
joss_api$repo_url[duplicated(joss_api$repo_url)]
## [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [2] "https://github.com/idaholab/moose"
## [3] "https://gitlab.com/libreumg/dataquier.git"
## [4] "https://github.com/idaholab/moose"
## [5] "https://github.com/dynamicslab/pysindy"
## [6] "https://github.com/landlab/landlab"
## [7] "https://github.com/landlab/landlab"
## [8] "https://github.com/symmy596/SurfinPy"
## [9] "https://github.com/bcgov/ssdtools"
## [10] "https://github.com/landlab/landlab"
## [11] "https://github.com/pvlib/pvlib-python"
## [12] "https://github.com/mlpack/mlpack"
## [13] "https://github.com/julia-wrobel/registr"
## [14] "https://github.com/barbagroup/pygbe"
papers <- papers %>% dplyr::left_join(joss_api, by = c("alternative.id" = "doi"))
dim(papers)
## [1] 2968 42
dim(papers %>% distinct())
## [1] 2968 42
papers$repo_url[duplicated(papers$repo_url)]
## [1] "https://github.com/mlpack/mlpack"
## [2] "https://github.com/bcgov/ssdtools"
## [3] "https://github.com/barbagroup/pygbe"
## [4] "https://github.com/dynamicslab/pysindy"
## [5] "https://github.com/landlab/landlab"
## [6] "https://github.com/idaholab/moose"
## [7] "https://github.com/idaholab/moose"
## [8] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [9] "https://github.com/landlab/landlab"
## [10] "https://github.com/symmy596/SurfinPy"
## [11] "https://github.com/julia-wrobel/registr"
## [12] "https://github.com/pvlib/pvlib-python"
## [13] "https://gitlab.com/libreumg/dataquier.git"
## [14] "https://github.com/landlab/landlab"
source_track <- c(source_track,
structure(rep("JOSS_API", length(setdiff(colnames(papers),
names(source_track)))),
names = setdiff(colnames(papers), names(source_track))))
From each pre-review and review issue, we extract information about review times and assigned labels.
## Pull down info on all issues in the joss-reviews repository
issues <- gh("/repos/openjournals/joss-reviews/issues",
.limit = 15000, state = "all")
## From each issue, extract required information: title, number, state,
## opening/closing timestamps, number of comments and a comma-separated
## list of labels. The labels "review", "pre-review", "query-scope" and
## "paused" are left out of that list.
iss <- dplyr::bind_rows(lapply(issues, function(i) {
  label_names <- vapply(i$labels, getElement, character(1L), name = "name")
  kept_labels <- setdiff(
    label_names,
    c("review", "pre-review", "query-scope", "paused")
  )
  data.frame(
    title = i$title,
    number = i$number,
    state = i$state,
    opened = i$created_at,
    ## Scalar test, so plain if/else rather than the vectorized ifelse();
    ## issues without a closing timestamp get NA
    closed = if (is.null(i$closed_at)) NA_character_ else i$closed_at,
    ncomments = i$comments,
    labels = paste(kept_labels, collapse = ",")
  )
}))
## Split into REVIEW, PRE-REVIEW, and other issues (the latter category
## is discarded)
issother <- iss %>% dplyr::filter(!grepl("\\[PRE REVIEW\\]", title) &
!grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 163 7
head(issother)
## title number
## 1 from pydynpd import regression causes error. no solution so far. 8165
## 2 This repository can’t be reached 8161
## 3 Update reviewer_checklist.md 8090
## 4 README and Vignette 7970
## 5 Comments on Siciliani et al. 2025 7878
## 6 Comments are not showing up & editorial bot is not reacting 7724
## state opened closed ncomments labels
## 1 closed 2025-05-06T05:52:04Z 2025-05-06T05:52:07Z 1
## 2 closed 2025-05-04T16:41:18Z 2025-05-04T16:41:21Z 1
## 3 closed 2025-04-22T22:17:11Z 2025-04-23T07:55:15Z 1
## 4 closed 2025-04-01T14:31:53Z 2025-04-01T14:31:56Z 1
## 5 closed 2025-03-06T17:18:06Z 2025-03-06T17:18:08Z 1
## 6 closed 2025-01-27T10:55:47Z 2025-01-29T06:32:05Z 0
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the run of "0" characters needed to left-pad `s` (an issue
## number) to a width of five characters; "" when `s` is already five or
## more characters wide.
getnbrzeros <- function(s) {
  ## Clamp at zero: without it a number with more than five digits gives a
  ## negative repeat count, which is an error.
  strrep("0", max(0L, 5L - nchar(s)))
}
## Keep the REVIEW issues, build the paper DOI ("10.21105/joss." followed by
## the issue number zero-padded via getnbrzeros), strip the "[REVIEW]: "
## tag from the title and prefix every column except the DOI with "review_".
issrev <- iss %>%
  dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
  dplyr::mutate(
    alternative.id = paste0("10.21105/joss.",
                            purrr::map_chr(number, getnbrzeros),
                            number),
    title = gsub("\\[REVIEW\\]: ", "", title)
  ) %>%
  ## rename_with() is the current replacement for the superseded rename_at()
  dplyr::rename_with(~ paste0("review_", .x), .cols = -alternative.id)
## For pre-review and review issues, respectively, get the number of
## issues closed each month, and the number of those that have the
## 'rejected' label
## Monthly closing statistics for the issues whose title matches
## `title_pattern`: the number of issues closed in each month and how many
## of them carry a label containing "rejected". `issue_type` is stored in
## the `itype` column of the result.
.count_monthly_rejections <- function(issue_df, title_pattern, issue_type) {
  issue_df %>%
    dplyr::filter(grepl(title_pattern, title)) %>%
    dplyr::filter(!is.na(closed)) %>%
    dplyr::mutate(
      closedmonth = lubridate::floor_date(as.Date(closed), "month")
    ) %>%
    dplyr::group_by(closedmonth) %>%
    dplyr::summarize(nbr_issues_closed = length(labels),
                     nbr_rejections = sum(grepl("rejected", labels))) %>%
    dplyr::mutate(itype = issue_type)
}
review_rejected <- .count_monthly_rejections(iss, "\\[REVIEW\\]", "review")
prereview_rejected <- .count_monthly_rejections(iss, "\\[PRE REVIEW\\]",
                                                "pre-review")
all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected)
## For PRE-REVIEW issues, add information about the corresponding REVIEW
## issue number
## PRE-REVIEW issues that are labelled neither "withdrawn" nor "rejected"
isspre <- iss %>%
  dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>%
  dplyr::filter(!grepl("withdrawn", labels)) %>%
  dplyr::filter(!grepl("rejected", labels))
## Some titles have multiple pre-review issues. In these cases, keep the
## latest (highest issue number). Then strip the "[PRE REVIEW]: " tag from
## the title and prefix all columns with "prerev_".
isspre <- isspre %>%
  dplyr::arrange(dplyr::desc(number)) %>%
  dplyr::filter(!duplicated(title)) %>%
  dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>%
  ## rename_with() is the current replacement for the superseded rename_all()
  dplyr::rename_with(~ paste0("prerev_", .x))
papers <- papers %>% dplyr::left_join(issrev, by = "alternative.id") %>%
dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
dplyr::mutate(prerev_opened = as.Date(prerev_opened),
prerev_closed = as.Date(prerev_closed),
review_opened = as.Date(review_opened),
review_closed = as.Date(review_closed)) %>%
dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
days_in_rev = review_closed - review_opened,
to_review = !is.na(review_opened))
dim(papers)
## [1] 2968 58
dim(papers %>% distinct())
## [1] 2968 58
source_track <- c(source_track,
structure(rep("joss-github", length(setdiff(colnames(papers),
names(source_track)))),
names = setdiff(colnames(papers), names(source_track))))
## Reorder so that software repositories that were interrogated longest
## ago are checked first
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
software_urls[duplicated(software_urls)]
## [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [2] "https://gitlab.com/libreumg/dataquier.git"
## [3] "https://github.com/bcgov/ssdtools"
## [4] "https://github.com/barbagroup/pygbe"
## [5] "https://github.com/symmy596/SurfinPy"
## [6] "https://github.com/pvlib/pvlib-python"
## [7] "https://github.com/mlpack/mlpack"
## [8] "https://github.com/dynamicslab/pysindy"
## [9] "https://github.com/landlab/landlab"
## [10] "https://github.com/idaholab/moose"
## [11] "https://github.com/idaholab/moose"
## [12] "https://github.com/landlab/landlab"
## [13] "https://github.com/julia-wrobel/registr"
## [14] "https://github.com/landlab/landlab"
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 2968
sum(is_github)
## [1] 2807
software_urls[!is_github]
## [1] "https://bitbucket.org/orionmhdteam/orion2_release1/src/master/"
## [2] "https://gitlab.kuleuven.be/ITSCreaLab/public-toolboxes/dyntapy"
## [3] "https://gitlab.com/morikawa-lab-osakau/vibir-parallel-compute"
## [4] "https://gitlab.com/ENKI-portal/ThermoCodegen"
## [5] "https://gitlab.dune-project.org/copasi/dune-copasi"
## [6] "https://gitlab.com/bonsamurais/bonsai/util/ipcc"
## [7] "https://gitlab.com/cosmograil/starred"
## [8] "https://gitlab.com/emd-dev/emd"
## [9] "https://gitlab.com/ffaucher/hawen"
## [10] "https://codebase.helmholtz.cloud/mussel/netlogo-northsea-species.git"
## [11] "https://gite.lirmm.fr/doccy/RedOak"
## [12] "https://gitlab.com/sails-dev/sails"
## [13] "https://bitbucket.org/rram/dvrlib/src/joss/"
## [14] "https://gitlab.com/mantik-ai/mantik"
## [15] "https://gitlab.kitware.com/LBM/lattice-boltzmann-solver"
## [16] "https://gitlab.com/dsbowen/conditional-inference"
## [17] "https://gitlab.com/soleil-data-treatment/soleil-software-projects/remote-desktop"
## [18] "https://bitbucket.org/ocellarisproject/ocellaris"
## [19] "https://git.iws.uni-stuttgart.de/tools/frackit"
## [20] "https://bitbucket.org/berkeleylab/esdr-pygdh/"
## [21] "https://gitlab.com/moorepants/skijumpdesign"
## [22] "https://gitlab.com/drti/basic-tools"
## [23] "https://gitlab.com/cmbm-ethz/pourbaix-diagrams"
## [24] "https://bitbucket.org/cloopsy/android/"
## [25] "https://gitlab.com/pythia-uq/pythia"
## [26] "https://jugit.fz-juelich.de/compflu/swalbe.jl/"
## [27] "https://gitlab.com/fduchate/predihood"
## [28] "https://gitlab.com/dmt-development/dmt-core"
## [29] "https://gitlab.dune-project.org/dorie/dorie"
## [30] "https://gitlab.com/wpettersson/kep_solver"
## [31] "https://gitlab.com/myqueue/myqueue"
## [32] "https://gitlab.com/dlr-ve/esy/remix/framework"
## [33] "https://gitlab.com/gdetor/genetic_alg"
## [34] "https://gitlab.com/utopia-project/dantro"
## [35] "https://gitlab.com/dlr-dw/ontocode"
## [36] "https://gitlab.com/InspectorCell/inspectorcell"
## [37] "https://framagit.org/GustaveCoste/off-product-environmental-impact/"
## [38] "https://plmlab.math.cnrs.fr/lmrs/statistique/smmR"
## [39] "https://gitlab.com/dlr-ve/esy/amiris/amiris"
## [40] "https://gitlab.com/fame-framework/fame-io"
## [41] "https://gitlab.com/fame-framework/fame-core"
## [42] "https://bitbucket.org/glotzer/rowan"
## [43] "https://code.usgs.gov/umesc/quant-ecology/fishstan/"
## [44] "https://gitlab.com/thartwig/asloth"
## [45] "https://gitlab.com/habermann_lab/phasik"
## [46] "https://gitlab.com/dlr-ve/autumn/"
## [47] "https://gitlab.com/ags-data-format-wg/ags-python-library"
## [48] "https://gitlab.com/datafold-dev/datafold/"
## [49] "https://gitlab.com/tesch1/cppduals"
## [50] "https://gitlab.com/tue-umphy/software/parmesan"
## [51] "https://gitlab.com/open-darts/open-darts"
## [52] "https://gitlab.com/materials-modeling/calorine"
## [53] "https://gitlab.com/celliern/scikit-fdiff/"
## [54] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [55] "https://gitlab.mpikg.mpg.de/curcuraci/bmiptools"
## [56] "https://gitlab.com/akantu/akantu"
## [57] "https://savannah.nongnu.org/projects/complot/"
## [58] "http://mutabit.com/repos.fossil/grafoscopio/"
## [59] "https://gitlab.com/cerfacs/batman"
## [60] "https://bitbucket.org/cardosan/brightway2-temporalis"
## [61] "https://gitlab.inria.fr/miet/miet"
## [62] "https://gitlab.com/manchester_qbi/manchester_qbi_public/madym_cxx/"
## [63] "https://gitlab.gwdg.de/mpievolbio-it/crbhits"
## [64] "https://git.rwth-aachen.de/ants/sensorlab/imea"
## [65] "https://gitlab.com/marinvaders/marinvaders"
## [66] "https://bitbucket.org/bmskinner/nuclear_morphology"
## [67] "https://gitlab.com/lheea/CN-AeroModels"
## [68] "https://bitbucket.org/sciencecapsule/sciencecapsule"
## [69] "https://gitlab.com/bioeconomy/forobs/biotrade/"
## [70] "https://gitlab.ifremer.fr/resourcecode/resourcecode"
## [71] "https://gitlab.ruhr-uni-bochum.de/ee/cd2es"
## [72] "https://www.idpoisson.fr/fullswof/"
## [73] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"
## [74] "https://bitbucket.org/dolfin-adjoint/pyadjoint"
## [75] "https://gitlab.com/LMSAL_HUB/aia_hub/aiapy"
## [76] "https://gitlab.com/gims-developers/gims"
## [77] "https://gitlab.awi.de/sicopolis/sicopolis"
## [78] "https://forgemia.inra.fr/migale/easy16s"
## [79] "https://gitlab.com/programgreg/tagginglatencyestimator"
## [80] "https://git.mpib-berlin.mpg.de/castellum/castellum"
## [81] "https://gitlab.eudat.eu/coccon-kit/proffastpylot"
## [82] "https://c4science.ch/source/tamaas/"
## [83] "https://gitlab.com/ampere2/metalwalls"
## [84] "https://gitlab.pasteur.fr/vlegrand/ROCK"
## [85] "https://git.ligo.org/asimov/asimov"
## [86] "https://gitlab.inria.fr/bcoye/game-engine-scheduling-simulation"
## [87] "https://gitlab.com/petsc/petsc"
## [88] "https://bitbucket.org/berkeleylab/hardware-control/src/main/"
## [89] "https://gitlab.com/jason-rumengan/pyarma"
## [90] "https://gitlab.com/culturalcartography/text2map"
## [91] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"
## [92] "https://gitlab.com/sissopp_developers/sissopp"
## [93] "https://gitlab.com/project-dare/dare-platform"
## [94] "https://earth.bsc.es/gitlab/wuruchi/autosubmitreact"
## [95] "https://bitbucket.org/sbarbot/motorcycle/src/master/"
## [96] "https://forgemia.inra.fr/pherosensor/pherosensor-toolbox"
## [97] "https://bitbucket.org/clhaley/Multitaper.jl"
## [98] "https://gitlab.com/remram44/taguette"
## [99] "https://git.geomar.de/digital-earth/dasf/dasf-messaging-python"
## [100] "https://bitbucket.org/mpi4py/mpi4py-fft"
## [101] "https://gitlab.com/sigcorr/sigcorr"
## [102] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"
## [103] "https://gitlab.inria.fr/mosaic/bvpy"
## [104] "https://gitlab.com/costrouc/pysrim"
## [105] "https://bitbucket.org/dghoshal/frieda"
## [106] "https://doi.org/10.17605/OSF.IO/3DS6A"
## [107] "https://gitlab.com/permafrostnet/teaspoon"
## [108] "https://gitlab.com/free-astro/siril"
## [109] "https://gitlab.ruhr-uni-bochum.de/reichp2y/proppy"
## [110] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [111] "https://gitlab.com/moerman1/fhi-cc4s"
## [112] "https://gitlab.com/jtagusari/hrisk-noisemodelling"
## [113] "https://gitlab.com/pyFBS/pyFBS"
## [114] "https://bitbucket.org/manuela_s/hcp/"
## [115] "https://gitlab.com/libreumg/dataquier.git"
## [116] "https://gitlab.com/fibreglass/pivc"
## [117] "https://gitlab.com/ProjectRHEA/flowsolverrhea"
## [118] "https://gitlab.ethz.ch/holukas/dyco-dynamic-lag-compensation"
## [119] "https://gitlab.inria.fr/melissa/melissa"
## [120] "https://gitlab.com/qc-devs/aqcnes"
## [121] "https://gitlab.com/jesseds/apav"
## [122] "https://gitlab.com/cosapp/cosapp"
## [123] "https://gitlab.com/vibes-developers/vibes"
## [124] "https://gitlab.com/dlr-ve/esy/vencopy/vencopy"
## [125] "https://gitlab.uliege.be/smart_grids/public/gboml"
## [126] "https://bitbucket.org/basicsums/basicsums"
## [127] "https://gitlab.com/eidheim/Simple-Web-Server"
## [128] "https://framagit.org/GustaveCoste/eldam"
## [129] "https://gricad-gitlab.univ-grenoble-alpes.fr/deformvis/insarviz"
## [130] "https://gitlab.com/cracklet/cracklet.git"
## [131] "https://git.ufz.de/despot/pysewer/"
## [132] "https://gitlab.com/materials-modeling/wulffpack"
## [133] "https://bitbucket.org/robmoss/particle-filter-for-python/"
## [134] "https://gitlab.com/energyincities/besos/"
## [135] "https://bitbucket.org/mituq/muq2.git"
## [136] "https://gitlab.com/robizzard/libcdict"
## [137] "https://gitlab.inria.fr/bramas/tbfmm"
## [138] "https://bitbucket.org/meg/cbcbeat"
## [139] "https://gitlab.com/mmartin-lagarde/exonoodle-exoplanets/-/tree/master/"
## [140] "https://gitlab.com/utopia-project/utopia"
## [141] "https://bitbucket.org/hammurabicode/hamx"
## [142] "https://gitlab.com/davidwoodburn/itrm"
## [143] "https://gitlab.com/tum-ciip/elsa"
## [144] "https://gitlab.com/binary_c/binary_c-python/"
## [145] "https://gitlab.com/picos-api/picos"
## [146] "https://bitbucket.org/cdegroot/wediff"
## [147] "https://gitlab.com/QComms/cqptoolkit"
## [148] "https://gitlab.com/toposens/public/ros-packages"
## [149] "https://gitlab.inria.fr/azais/treex"
## [150] "https://gitlab.com/pvst/asi"
## [151] "https://gitlab.com/chaver/choco-mining"
## [152] "https://gitlab.com/cosmograil/PyCS3"
## [153] "https://gitlab.com/davidtourigny/dynamic-fba"
## [154] "https://gitlab.com/MartinBeseda/sa-oo-vqe-qiskit.git"
## [155] "https://bitbucket.org/likask/mofem-cephas"
## [156] "https://bitbucket.org/cmutel/brightway2"
## [157] "https://gitlab.com/davidwoodburn/r3f"
## [158] "https://gitlab.com/libreumg/dataquier.git"
## [159] "https://gitlab.com/geekysquirrel/bigx"
## [160] "https://gitlab.com/dglaeser/fieldcompare"
## [161] "https://gitlab.com/dlr-ve/esy/sfctools/framework/"
## Query the GitHub API for every unique GitHub-hosted software repository
## and collect one row of metadata per repository URL: creation/update/push
## dates, stars, main language, bytes per language, topics, license key and
## contributor counts. Repositories whose main lookup fails contribute no
## row. The result is stamped with today's date in `repo_info_obtained`.
df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) {
  ## Normalize the URL: drop a trailing slash and a trailing ".git", force
  ## https, and cut off anything from "/tree/" or "/blob/" onwards.
  u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
  if (grepl("/tree/", u0)) {
    u0 <- strsplit(u0, "/tree/")[[1]][1]
  }
  if (grepl("/blob/", u0)) {
    u0 <- strsplit(u0, "/blob/")[[1]][1]
  }

  ## API endpoint of the repository, computed once and reused below
  endpoint <- gsub("(https://)?(www\\.)?github\\.com/", "/repos/", u0)

  info <- try({
    gh(endpoint)
  })
  ## Without usable repository information no row is produced, so stop here
  ## instead of spending three more API requests on this repository.
  if (inherits(info, "try-error") || length(info) <= 1) {
    return(NULL)
  }

  languages <- try({
    gh(paste0(endpoint, "/languages"), .limit = 500)
  })
  topics <- try({
    gh(paste0(endpoint, "/topics"),
       .accept = "application/vnd.github.mercy-preview+json", .limit = 500)
  })
  contribs <- try({
    gh(paste0(endpoint, "/contributors"), .limit = 500)
  })

  ## Contributor counts; NA when the request failed or returned nothing
  repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
  if (!inherits(contribs, "try-error") && length(contribs) > 0) {
    repo_nbr_contribs <- length(contribs)
    repo_nbr_contribs_2ormore <- sum(vapply(
      contribs, function(x) x$contributions >= 2, logical(1)
    ))
    if (is.na(repo_nbr_contribs_2ormore)) {
      print(contribs)
    }
  }

  ## "language:bytes" pairs joined by ","; "" when unavailable
  repolang <- ""
  if (!inherits(languages, "try-error") && length(languages) > 0) {
    lang_bytes <- unlist(languages)
    repolang <- paste(paste(names(lang_bytes), lang_bytes, sep = ":"),
                      collapse = ",")
  }

  ## Topic names joined by ","; "" when unavailable
  repotopics <- ""
  if (!inherits(topics, "try-error") && length(topics$names) > 0) {
    repotopics <- paste(unlist(topics$names), collapse = ",")
  }

  data.frame(
    repo_url = u,
    repo_created = info$created_at,
    repo_updated = info$updated_at,
    repo_pushed = info$pushed_at,
    repo_nbr_stars = info$stargazers_count,
    repo_language = if (is.null(info$language)) {
      NA_character_
    } else {
      info$language
    },
    repo_languages_bytes = repolang,
    repo_topics = repotopics,
    repo_license = if (is.null(info$license)) {
      NA_character_
    } else {
      info$license$key
    },
    repo_nbr_contribs = repo_nbr_contribs,
    repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
  )
})) %>%
  dplyr::mutate(repo_created = as.Date(repo_created),
                repo_updated = as.Date(repo_updated),
                repo_pushed = as.Date(repo_pushed)) %>%
  dplyr::distinct() %>%
  dplyr::mutate(repo_info_obtained = lubridate::today())
if (length(unique(df$repo_url)) != length(df$repo_url)) {
print(length(unique(df$repo_url)))
print(length(df$repo_url))
print(df$repo_url[duplicated(df$repo_url)])
}
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 1671 12
## For papers not in df (i.e., for which we didn't get a valid response
## from the GitHub API query), use information from the archived data frame
dfarchive <- papers_archive %>%
dplyr::select(colnames(df)[colnames(df) %in% colnames(papers_archive)]) %>%
dplyr::filter(!(repo_url %in% df$repo_url)) %>%
dplyr::arrange(desc(repo_info_obtained)) %>%
dplyr::filter(!duplicated(repo_url))
head(dfarchive)
## # A tibble: 6 × 12
## repo_url repo_created repo_updated repo_pushed repo_nbr_stars repo_language
## <chr> <date> <date> <date> <int> <chr>
## 1 https://gi… 2020-01-24 2024-10-20 2025-01-22 80 Python
## 2 https://gi… 2021-01-26 2025-04-10 2025-04-28 34 Python
## 3 https://gi… 2020-04-14 2025-04-17 2025-03-07 58 Python
## 4 https://gi… 2020-10-01 2025-03-21 2025-03-21 9 Python
## 5 https://gi… 2022-05-30 2025-01-23 2023-10-01 8 Python
## 6 https://gi… 2021-05-26 2025-04-19 2025-04-25 35 Python
## # ℹ 6 more variables: repo_languages_bytes <chr>, repo_topics <chr>,
## # repo_license <chr>, repo_nbr_contribs <int>,
## # repo_nbr_contribs_2ormore <int>, repo_info_obtained <date>
dim(dfarchive)
## [1] 1283 12
df <- dplyr::bind_rows(df, dfarchive)
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 2954 12
papers <- papers %>% dplyr::left_join(df, by = "repo_url")
dim(papers)
## [1] 2968 69
source_track <- c(source_track,
structure(rep("sw-github", length(setdiff(colnames(papers),
names(source_track)))),
names = setdiff(colnames(papers), names(source_track))))
## Convert publication date to Date format
## Add information about the half year (H1, H2) of publication
## Count number of authors
papers <- papers %>% dplyr::select(-reference, -license, -link) %>%
dplyr::mutate(published.date = as.Date(published.print)) %>%
dplyr::mutate(
halfyear = paste0(year(published.date),
ifelse(month(published.date) <= 6, "H1", "H2"))
) %>% dplyr::mutate(
halfyear = factor(halfyear,
levels = paste0(rep(sort(unique(year(published.date))),
each = 2), c("H1", "H2")))
) %>% dplyr::mutate(nbr_authors = vapply(author, function(a) nrow(a), NA_integer_))
dim(papers)
## [1] 2968 69
dupidx <- which(papers$alternative.id %in% papers$alternative.id[duplicated(papers)])
papers[dupidx, ] %>% arrange(alternative.id) %>% head(n = 10)
## # A tibble: 0 × 69
## # ℹ 69 variables: alternative.id <chr>, container.title <chr>, created <chr>,
## # deposited <chr>, published.print <chr>, doi <chr>, indexed <chr>,
## # issn <chr>, issue <chr>, issued <chr>, member <chr>, page <chr>,
## # prefix <chr>, publisher <chr>, score <chr>, source <chr>,
## # reference.count <chr>, references.count <chr>,
## # is.referenced.by.count <chr>, title <chr>, type <chr>, url <chr>,
## # volume <chr>, short.container.title <chr>, author <list>, …
papers <- papers %>% dplyr::distinct()
dim(papers)
## [1] 2968 69
source_track <- c(source_track,
structure(rep("cleanup", length(setdiff(colnames(papers),
names(source_track)))),
names = setdiff(colnames(papers), names(source_track))))
In some cases, fetching information from (e.g.) the GitHub API fails for a subset of the publications. There are also other reasons for missing values (for example, the earliest submissions do not have an associated pre-review issue). The table below lists the number of missing values for each of the variables in the data frame.
DT::datatable(
data.frame(variable = colnames(papers),
nbr_missing = colSums(is.na(papers))) %>%
dplyr::mutate(source = source_track[variable]),
escape = FALSE, rownames = FALSE,
filter = list(position = 'top', clear = FALSE),
options = list(scrollX = TRUE)
)
monthly_pubs <- papers %>%
dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
dplyr::group_by(pubmonth) %>%
dplyr::summarize(npub = n())
ggplot(monthly_pubs,
aes(x = factor(pubmonth), y = npub)) +
geom_bar(stat = "identity") + theme_minimal() +
labs(x = "", y = "Number of published papers per month", caption = dcap) +
theme(axis.title = element_text(size = 15),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
DT::datatable(
monthly_pubs %>%
dplyr::rename("Number of papers" = "npub",
"Month of publications" = "pubmonth"),
escape = FALSE, rownames = FALSE,
filter = list(position = 'top', clear = FALSE),
options = list(scrollX = TRUE)
)
yearly_pubs <- papers %>%
dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
dplyr::group_by(pubyear) %>%
dplyr::summarize(npub = n())
ggplot(yearly_pubs,
aes(x = factor(pubyear), y = npub)) +
geom_bar(stat = "identity") + theme_minimal() +
labs(x = "", y = "Number of published papers per year", caption = dcap) +
theme(axis.title = element_text(size = 15),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
DT::datatable(
yearly_pubs %>%
dplyr::rename("Number of papers" = "npub",
"Year of publications" = "pubyear"),
escape = FALSE, rownames = FALSE,
filter = list(position = 'top', clear = FALSE),
options = list(scrollX = TRUE)
)
The plots below illustrate the fraction of pre-review and review issues closed during each month that have the ‘rejected’ label attached.
ggplot(all_rejected,
aes(x = factor(closedmonth), y = nbr_rejections/nbr_issues_closed)) +
geom_bar(stat = "identity") +
theme_minimal() +
facet_wrap(~ itype, ncol = 1) +
labs(x = "Month of issue closing", y = "Fraction of issues rejected",
caption = dcap) +
theme(axis.title = element_text(size = 15),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Papers with 20 or more citations are grouped in the “>=20” category.
## Bar chart of the number of papers per OpenAlex citation count. Counts of
## 20 or more are recoded to ">=20", so the remaining individual levels are
## 0 to 19 (a separate "20" level could never be populated).
ggplot(papers %>%
         dplyr::mutate(citation_count = replace(citation_count,
                                                citation_count >= 20, ">=20")) %>%
         dplyr::mutate(citation_count = factor(citation_count,
                                               levels = c(0:19, ">=20"))) %>%
         dplyr::group_by(citation_count) %>%
         dplyr::tally(),
       aes(x = citation_count, y = n)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "OpenAlex citation count", y = "Number of publications", caption = dcap)
The table below sorts the JOSS papers in decreasing order by the number of citations in OpenAlex.
DT::datatable(
papers %>%
dplyr::mutate(url = paste0("<a href='", url, "' target='_blank'>",
url,"</a>")) %>%
dplyr::arrange(desc(citation_count)) %>%
dplyr::select(title, url, published.date, citation_count),
escape = FALSE,
filter = list(position = 'top', clear = FALSE),
options = list(scrollX = TRUE)
)
plotly::ggplotly(
ggplot(papers, aes(x = published.date, y = citation_count, label = title)) +
geom_point(alpha = 0.5) + theme_bw() + scale_y_sqrt() +
geom_smooth() +
labs(x = "Date of publication", y = "OpenAlex citation count", caption = dcap) +
theme(axis.title = element_text(size = 15)),
tooltip = c("label", "x", "y")
)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Here, we plot the citation count for all papers published within each half year, sorted in decreasing order.
## Rank the papers by decreasing citation count and number them within
## each half year; each half year gets its own panel with free axes
ggplot(
  papers %>%
    dplyr::arrange(desc(citation_count)) %>%
    dplyr::group_by(halfyear) %>%
    dplyr::mutate(idx = dplyr::row_number()),
  aes(x = idx, y = citation_count)
) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ halfyear, scales = "free") +
  labs(x = "Index", y = "OpenAlex citation count", caption = dcap) +
  theme_bw()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
In these plots we investigate whether the time a submission spends in the pre-review or review stage (or their sum) has changed over time. The blue curve is a rolling median: for each opening date, it shows the median over all submissions opened within a 120-day window centered on that date.
## Helper functions (modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)
## Rolling-median smoother, usable as `method` in ggplot2::geom_smooth().
##
## Arguments:
##   formula  Accepted for compatibility with the geom_smooth() calling
##            convention, but not used; the columns `x` and `y` of `data`
##            are always used.
##   data     Data frame with columns `x` and `y`.
##   xwindow  Total width of the window (in units of `x`). The median at a
##            given x-value is taken over all observations whose x-value lies
##            strictly within xwindow/2 of it.
##   ...      Ignored.
##
## Returns an object of class "rollmed": a list with the sorted x-values (`x`),
## the rolling median at each of them (`y`), and a linear interpolation
## function (`f`) used by predict.rollmed().
rolling_median <- function(formula, data, xwindow = 120, ...) {
  ## Get order of x-values and sort x/y
  ordr <- order(data$x)
  x <- data$x[ordr]
  y <- data$y[ordr]
  half_window <- xwindow / 2

  ## Calculate the median once per unique (non-missing) x-value
  ux <- unique(x[!is.na(x)])
  uy <- vapply(ux, function(xs) {
    in_window <- ((xs - half_window) < x) & (x < (xs + half_window))
    median(y[in_window], na.rm = TRUE)
  }, numeric(1))

  ## Map the medians back to all observations (NA where x is missing)
  ys <- uy[match(x, ux)]

  ## Interpolate over the unique x-values only: passing tied x-values to
  ## approxfun() triggers a "collapsing to unique 'x' values" warning.
  ## approxfun() needs at least two points; otherwise predict NA everywhere.
  ok <- !is.na(uy)
  f <- if (sum(ok) >= 2) {
    approxfun(ux[ok], uy[ok])
  } else {
    function(v) rep(NA_real_, length(v))
  }

  structure(list(x = x, y = ys, f = f), class = "rollmed")
}
## predict() method for "rollmed" objects: evaluates the stored interpolation
## function at the x-values in `newdata` and names the result by those values
predict.rollmed <- function(mod, newdata, ...) {
  xnew <- newdata$x
  fitted <- mod$f(xnew)
  names(fitted) <- xnew
  fitted
}
## Days spent in pre-review, by pre-review opening date
ggplot(
  data = papers,
  mapping = aes(x = prerev_opened, y = as.numeric(days_in_pre))
) +
  geom_point() +
  geom_smooth(
    method = "rolling_median", method.args = list(xwindow = 120),
    formula = y ~ x, se = FALSE
  ) +
  labs(
    x = "Date of pre-review opening",
    y = "Number of days in pre-review",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
## Days spent in review, by review opening date
ggplot(
  data = papers,
  mapping = aes(x = review_opened, y = as.numeric(days_in_rev))
) +
  geom_point() +
  geom_smooth(
    method = "rolling_median", method.args = list(xwindow = 120),
    formula = y ~ x, se = FALSE
  ) +
  labs(
    x = "Date of review opening",
    y = "Number of days in review",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
## Total days spent in pre-review and review, by pre-review opening date
ggplot(
  data = papers,
  mapping = aes(
    x = prerev_opened,
    y = as.numeric(days_in_pre) + as.numeric(days_in_rev)
  )
) +
  geom_point() +
  geom_smooth(
    method = "rolling_median", method.args = list(xwindow = 120),
    formula = y ~ x, se = FALSE
  ) +
  labs(
    x = "Date of pre-review opening",
    y = "Number of days in pre-review + review",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
Next, we consider the languages used by the submissions, both as reported by JOSS and based on the information encoded in available GitHub repositories (for the latter, we also record the number of bytes of code written in each language). Note that a given submission can use multiple languages.
## Language information from JOSS: for each language, count the submissions
## that list it in their comma-separated `languages` field
sspl <- strsplit(papers$languages, ",")
all_languages <- unique(unlist(sspl))
langs <- dplyr::bind_rows(lapply(all_languages, function(l) {
  data.frame(
    language = l,
    nbr_submissions_JOSS_API = sum(vapply(sspl, function(v) l %in% v,
                                          numeric(1)))
  )
}))
## Language information from GitHub software repos. Each non-missing entry of
## `repo_languages_bytes` is a comma-separated list of "language:bytes" pairs.
## Flatten all pairs into one vector and split each pair directly, rather
## than going through a data frame with one column per pair.
a <- as.character(unlist(strsplit(papers$repo_languages_bytes, ",",
                                  fixed = TRUE), use.names = FALSE))
a <- strsplit(a[!is.na(a)], ":", fixed = TRUE)
langbytes <- tibble::tibble(
  language = vapply(a, function(p) p[1], character(1)),
  bytes = as.numeric(vapply(a, function(p) p[2], character(1)))
) %>%
  dplyr::filter(!is.na(language)) %>%
  dplyr::group_by(language) %>%
  dplyr::summarize(nbr_bytes_GitHub = sum(bytes),
                   nbr_repos_GitHub = dplyr::n()) %>%
  dplyr::arrange(desc(nbr_bytes_GitHub))
## Combine the two sources; a language may be present in only one of them
langs <- dplyr::full_join(langs, langbytes, by = "language")
## Languages reported by JOSS for more than 10 submissions, most common first
ggplot(
  langs %>%
    dplyr::filter(nbr_submissions_JOSS_API > 10) %>%
    dplyr::arrange(desc(nbr_submissions_JOSS_API)) %>%
    dplyr::mutate(language = factor(language, levels = language)),
  aes(x = language, y = nbr_submissions_JOSS_API)
) +
  geom_col() +
  labs(x = "", y = "Number of submissions", caption = dcap) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title = element_text(size = 15)
  )
## Full language table, sorted by the total number of bytes on GitHub
DT::datatable(
  langs %>% dplyr::arrange(desc(nbr_bytes_GitHub)),
  options = list(scrollX = TRUE),
  filter = list(position = "top", clear = FALSE),
  escape = FALSE
)
## Total bytes of code vs number of repos per language (log-log scale)
ggplot(
  data = langs,
  mapping = aes(x = nbr_repos_GitHub, y = nbr_bytes_GitHub)
) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Number of repos using the language",
    y = "Total number of bytes of code\nwritten in the language",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
## GitHub stars vs citation count (both on square-root scales); the tooltip
## shows the paper title
ggplotly(
  p = ggplot(
    data = papers,
    mapping = aes(x = citation_count, y = repo_nbr_stars, label = title)
  ) +
    geom_point(alpha = 0.5) +
    scale_x_sqrt() +
    scale_y_sqrt() +
    labs(
      x = "OpenAlex citation count",
      y = "Number of stars, GitHub repo",
      caption = dcap
    ) +
    theme_bw() +
    theme(axis.title = element_text(size = 15)),
  tooltip = c("label", "x", "y")
)
## Distribution of the difference between pre-review opening and repo creation
ggplot(
  data = papers,
  mapping = aes(x = as.numeric(prerev_opened - repo_created))
) +
  geom_histogram(bins = 50) +
  labs(
    x = "Time (days) from repo creation to JOSS pre-review start",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
## Distribution of the difference between the latest push to the repo and the
## closing of the review, with one panel per publication year
ggplot(
  data = papers,
  mapping = aes(x = as.numeric(repo_pushed - review_closed))
) +
  geom_histogram(bins = 50) +
  facet_wrap(~ year(published.date), scales = "free_y") +
  labs(
    x = "Time (days) from closure of JOSS review to most recent commit in repo",
    caption = dcap
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 15))
Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.
## Number of reviewers per submission, by publication year; submissions whose
## pre-review labels mention rOpenSci or pyOpenSci are left out
ggplot(
  papers %>%
    dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
    dplyr::mutate(year = year(published.date)),
  aes(x = nbr_reviewers)
) +
  geom_bar() +
  facet_wrap(~ year) +
  labs(
    x = "Number of reviewers",
    y = "Number of submissions",
    caption = dcap
  ) +
  theme_bw()
Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.
## One row per (reviewer, submission) pair, excluding submissions whose
## pre-review labels mention rOpenSci or pyOpenSci
reviewers <- papers %>%
  dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
  dplyr::mutate(year = year(published.date)) %>%
  dplyr::select(reviewers, year) %>%
  tidyr::separate_rows(reviewers, sep = ",")
## Most active reviewers: number of reviews and the first/last publication
## year of the reviewed papers (a single year if both coincide)
DT::datatable(
  reviewers %>%
    dplyr::group_by(reviewers) %>%
    dplyr::summarize(
      nbr_reviews = dplyr::n(),
      timespan = paste(unique(range(year)), collapse = " - ")
    ) %>%
    dplyr::arrange(desc(nbr_reviews)),
  options = list(scrollX = TRUE),
  filter = list(position = "top", clear = FALSE),
  rownames = FALSE,
  escape = FALSE
)
## Number of submissions handled by each editor, per publication year; the
## fill marks whether the pre-review labels mention rOpenSci or pyOpenSci
ggplot(
  papers %>%
    dplyr::mutate(
      year = year(published.date),
      `r/pyOpenSci` = factor(
        grepl("rOpenSci|pyOpenSci", prerev_labels),
        levels = c("TRUE", "FALSE")
      )
    ),
  aes(x = editor)
) +
  geom_bar(aes(fill = `r/pyOpenSci`)) +
  facet_wrap(~ year, ncol = 1) +
  scale_fill_manual(values = c(`TRUE` = "grey65", `FALSE` = "grey35")) +
  labs(x = "Editor", y = "Number of submissions", caption = dcap) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
all_licenses <- sort(unique(papers$repo_license))
## Order the licenses so that the listed license families come first, followed
## by all remaining licenses. unique() guards against a license matching more
## than one family pattern, since factor() rejects duplicated levels.
license_levels <- unique(unlist(lapply(
  c("apache", "bsd", "mit", "gpl", "mpl"),
  function(family) grep(family, all_licenses, value = TRUE)
)))
license_levels <- c(license_levels, setdiff(all_licenses, license_levels))
## Number of submissions per license, with one panel per publication year
ggplot(
  papers %>%
    dplyr::mutate(repo_license = factor(repo_license,
                                        levels = license_levels)),
  aes(x = repo_license)
) +
  geom_bar() +
  theme_bw() +
  labs(x = "Software license", y = "Number of submissions", caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  facet_wrap(~ year(published.date), scales = "free_y")
## For the plots below, licenses that account for at most 2.5% of the
## submissions are relabelled as 'other'
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])
## Number of submissions per year and license (stacked areas). Year is
## temporarily a factor so that empty year/license combinations are kept
## (.drop = FALSE) and counted as zero.
ggplot(
  papers %>%
    dplyr::mutate(
      year = factor(lubridate::year(published.date)),
      repo_license = replace(repo_license,
                             repo_license %in% to_replace,
                             "other"),
      repo_license = factor(
        repo_license,
        levels = license_levels[license_levels %in% repo_license]
      )
    ) %>%
    dplyr::group_by(year, repo_license, .drop = FALSE) %>%
    dplyr::count() %>%
    dplyr::mutate(year = as.integer(as.character(year))),
  aes(x = year, y = n, fill = repo_license)
) +
  geom_area() +
  scale_fill_brewer(name = "Software\nlicense", palette = "Set1",
                    na.value = "grey") +
  labs(x = "Year", y = "Number of submissions", caption = dcap) +
  theme_minimal() +
  theme(axis.title = element_text(size = 15))
## Same data, shown as the fraction of each year's submissions per license
ggplot(
  papers %>%
    dplyr::mutate(
      year = factor(lubridate::year(published.date)),
      repo_license = replace(repo_license,
                             repo_license %in% to_replace,
                             "other"),
      repo_license = factor(
        repo_license,
        levels = license_levels[license_levels %in% repo_license]
      )
    ) %>%
    dplyr::group_by(year, repo_license, .drop = FALSE) %>%
    dplyr::summarize(n = n()) %>%
    ## summarize() leaves the data grouped by year, so the sum is per year
    dplyr::mutate(freq = n/sum(n)) %>%
    dplyr::mutate(year = as.integer(as.character(year))),
  aes(x = year, y = freq, fill = repo_license)
) +
  geom_area() +
  scale_fill_brewer(name = "Software\nlicense", palette = "Set1",
                    na.value = "grey") +
  labs(x = "Year", y = "Fraction of submissions", caption = dcap) +
  theme_minimal() +
  theme(axis.title = element_text(size = 15))
## Tabulate the repository topics (comma-separated per paper), ignoring
## missing values
a <- unlist(strsplit(papers$repo_topics, ","))
a <- a[!is.na(a)]
topicfreq <- table(a)
colors <- viridis::viridis(100)
## Fixed seed so that the word cloud layout and colors are reproducible
set.seed(1234)
## Word sizes are based on the square root of the topic frequency
wordcloud::wordcloud(
  words = names(topicfreq),
  freq = sqrt(topicfreq),
  scale = c(10, 0.1),
  min.freq = 1,
  max.words = 300,
  random.order = FALSE,
  random.color = TRUE,
  rot.per = 0.05,
  colors = colors,
  ordered.colors = FALSE,
  use.r.layout = FALSE,
  vfont = c("serif", "plain")
)
## Table of topics, most frequent first
DT::datatable(
  as.data.frame(topicfreq) %>%
    dplyr::rename(topic = a, nbr_repos = Freq) %>%
    dplyr::arrange(desc(nbr_repos)),
  options = list(scrollX = TRUE),
  filter = list(position = "top", clear = FALSE),
  rownames = FALSE,
  escape = FALSE
)
Here, we take a more detailed look at the papers that cite JOSS papers, using data from the Open Citations Corpus.
## Split into several queries of (at most) 50 papers each
## Randomize the splitting since a whole query may fail if one ID is not recognized
papidx <- seq_len(nrow(papers))
idxL <- split(sample(papidx, length(papidx), replace = FALSE), ceiling(papidx / 50))
## Best effort: a failed query does not stop the report. The error is kept
## in place of the result so that failures can be reported below instead of
## being silently discarded.
citationsL <- lapply(idxL, function(idx) {
  tryCatch({
    citecorp::oc_coci_cites(doi = papers$alternative.id[idx]) %>%
      dplyr::distinct() %>%
      dplyr::mutate(citation_info_obtained = as.character(lubridate::today()))
  }, error = function(e) {
    e
  })
})
failed <- vapply(citationsL, function(res) inherits(res, "error"), logical(1))
if (any(failed)) {
  message(sum(failed), " of ", length(citationsL),
          " Open Citations queries failed; first error: ",
          conditionMessage(citationsL[[which(failed)[1]]]))
}
## Keep only the queries that returned a non-empty data frame
citationsL <- citationsL[!failed]
citationsL <- citationsL[vapply(citationsL, function(df) {
  is.data.frame(df) && nrow(df) > 0
}, logical(1))]
if (length(citationsL) > 0) {
  citations <- dplyr::bind_rows(citationsL)
} else {
  ## No new citation data could be retrieved
  citations <- NULL
}
dim(citations)
## NULL
## Keep only the retrieved citations that are not yet in the archive
if (is.data.frame(citations) && "oci" %in% colnames(citations)) {
  citations <- citations %>%
    dplyr::filter(!(oci %in% citations_archive$oci))
}
## Only query Crossref if there is at least one new citation; otherwise
## cr_works() would be called with an empty vector of DOIs
if (is.data.frame(citations) && "oci" %in% colnames(citations) &&
    nrow(citations) > 0) {
  ## Get information about the citing works from Crossref
  tmpj <- rcrossref::cr_works(dois = unique(citations$citing))$data %>%
    dplyr::select(contains("doi"), contains("container.title"), contains("issn"),
                  contains("type"), contains("publisher"), contains("prefix"))
  citations <- citations %>% dplyr::left_join(tmpj, by = c("citing" = "doi"))
  ## bioRxiv preprints don't have a 'container.title' or 'issn', but we'll assume
  ## that they can be
  ## identified from the prefix 10.1101 - set the container.title
  ## for these records manually; we may or may not want to count these
  ## (would it count citations twice, both preprint and publication?)
  citations$container.title[citations$prefix %in% "10.1101"] <- "bioRxiv"
  ## JOSS is represented by 'The Journal of Open Source Software' as well as
  ## 'Journal of Open Source Software'
  citations$container.title[citations$container.title %in%
                              "Journal of Open Source Software"] <-
    "The Journal of Open Source Software"
  ## Remove real self citations (cited DOI = citing DOI)
  citations <- citations %>% dplyr::filter(cited != citing)
  ## Merge with the archive
  citations <- dplyr::bind_rows(citations, citations_archive)
} else {
  ## Nothing new to add: fall back to the archived citations
  citations <- citations_archive
  if (is.null(citations[["citation_info_obtained"]])) {
    citations$citation_info_obtained <- NA_character_
  }
}
## Records without a retrieval date are assigned a fixed date
citations$citation_info_obtained[is.na(citations$citation_info_obtained)] <-
  "2021-08-11"
write.table(citations, file = "joss_submission_citations.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
## Date of the most recent successful retrieval of new citation data
max(as.Date(citations$citation_info_obtained))
## [1] "2025-04-12"
## Number of distinct cited DOIs in this collection, i.e. the number of
## JOSS papers with >0 citations
length(unique(citations$cited))
## [1] 1844
## Number of JOSS papers with >0 citations according to OpenAlex
## (papers with a missing citation count are not counted)
length(which(papers$citation_count > 0))
## [1] 2221
## Number of citations from Open Citations Corpus (n) vs OpenAlex
## (citation_count), one row per DOI
df0 <- papers %>% dplyr::select(doi, citation_count) %>%
  dplyr::full_join(citations %>% dplyr::group_by(cited) %>%
                     dplyr::tally(),
                   by = c("doi" = "cited")) %>%
  ## Papers without any record in the Open Citations data only get a missing
  ## count after the join, so the replacement by 0 has to be done here
  dplyr::mutate(n = replace(n, is.na(n), 0L))
## Total citation count OpenAlex
sum(df0$citation_count, na.rm = TRUE)
## [1] 92892
## Total citation count Open Citations Corpus
sum(df0$n, na.rm = TRUE)
## [1] 92735
## Ratio of total citation count Open Citations Corpus/OpenAlex
sum(df0$n, na.rm = TRUE)/sum(df0$citation_count, na.rm = TRUE)
## [1] 0.9983099
## Per-paper comparison of the two citation counts; the line marks y = x
ggplot(data = df0, mapping = aes(x = citation_count, y = n)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(alpha = 0.5, size = 3) +
  theme_bw() +
  labs(
    x = "OpenAlex citation count",
    y = "Open Citations Corpus citation count",
    caption = dcap
  )
## Zoom in
ggplot(data = df0, mapping = aes(x = citation_count, y = n)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(alpha = 0.5, size = 3) +
  coord_cartesian(xlim = c(0, 75), ylim = c(0, 75)) +
  theme_bw() +
  labs(
    x = "OpenAlex citation count",
    y = "Open Citations Corpus citation count",
    caption = dcap
  )
## Number of journals citing JOSS papers, counted as the number of distinct
## container titles (a missing title, if present, counts as one value)
length(unique(citations$container.title))
## [1] 10467
## Number of distinct values of the `issn` field among the citation records
length(unique(citations$issn))
## [1] 7585
## Per citing venue (container title): number of citations of JOSS papers,
## number of distinct JOSS papers cited, number of distinct citing papers,
## and number/fraction of citations flagged as author self-citations
topcit <- citations %>%
  dplyr::group_by(container.title) %>%
  dplyr::summarize(
    nbr_citations_of_joss_papers = dplyr::n(),
    nbr_cited_joss_papers = dplyr::n_distinct(cited),
    nbr_citing_papers = dplyr::n_distinct(citing),
    nbr_selfcitations_of_joss_papers = sum(author_sc == "yes"),
    fraction_selfcitations = signif(
      nbr_selfcitations_of_joss_papers / nbr_citations_of_joss_papers,
      digits = 3
    )
  ) %>%
  dplyr::arrange(desc(nbr_cited_joss_papers))
DT::datatable(
  topcit,
  options = list(scrollX = TRUE),
  filter = list(position = "top", clear = FALSE),
  rownames = FALSE,
  escape = FALSE
)
## Number of distinct cited JOSS papers vs number of citations, one point per
## citing venue; the dashed line marks y = x
plotly::ggplotly(
  p = ggplot(
    data = topcit,
    mapping = aes(
      x = nbr_citations_of_joss_papers,
      y = nbr_cited_joss_papers,
      label = container.title
    )
  ) +
    geom_abline(intercept = 0, slope = 1, color = "grey", linetype = "dashed") +
    geom_point(alpha = 0.5, size = 3) +
    labs(
      x = "Number of citations of JOSS papers",
      y = "Number of cited JOSS papers",
      caption = dcap
    ) +
    theme_bw()
)
## Same plot, zoomed in on the lower left corner
plotly::ggplotly(
  p = ggplot(
    data = topcit,
    mapping = aes(
      x = nbr_citations_of_joss_papers,
      y = nbr_cited_joss_papers,
      label = container.title
    )
  ) +
    geom_abline(intercept = 0, slope = 1, color = "grey", linetype = "dashed") +
    geom_point(alpha = 0.5, size = 3) +
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 50)) +
    labs(
      x = "Number of citations of JOSS papers",
      y = "Number of cited JOSS papers",
      caption = dcap
    ) +
    theme_bw()
)
## Save the per-venue summary as a tab-separated file
write.table(
  topcit,
  file = "joss_submission_citations_byjournal.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
The tibble object with all data collected above is serialized to a file that can be downloaded and reused.
head(papers) %>% as.data.frame()
## alternative.id container.title created deposited
## 1 10.21105/joss.05467 Journal of Open Source Software 2024-01-23 2024-01-23
## 2 10.21105/joss.05854 Journal of Open Source Software 2024-01-11 2024-01-11
## 3 10.21105/joss.03596 Journal of Open Source Software 2022-02-10 2022-02-10
## 4 10.21105/joss.05074 Journal of Open Source Software 2023-06-13 2023-06-13
## 5 10.21105/joss.05451 Journal of Open Source Software 2023-09-20 2023-09-20
## 6 10.21105/joss.05149 Journal of Open Source Software 2023-04-20 2023-04-20
## published.print doi indexed issn issue issued
## 1 2024-01-23 10.21105/joss.05467 2024-09-19 2475-9066 93 2024-01-23
## 2 2024-01-11 10.21105/joss.05854 2024-05-29 2475-9066 93 2024-01-11
## 3 2022-02-10 10.21105/joss.03596 2025-03-19 2475-9066 70 2022-02-10
## 4 2023-06-13 10.21105/joss.05074 2024-03-03 2475-9066 86 2023-06-13
## 5 2023-09-20 10.21105/joss.05451 2024-03-03 2475-9066 89 2023-09-20
## 6 2023-04-20 10.21105/joss.05149 2024-03-03 2475-9066 84 2023-04-20
## member page prefix publisher score source reference.count
## 1 8722 5467 10.21105 The Open Journal 0 Crossref 16
## 2 8722 5854 10.21105 The Open Journal 0 Crossref 33
## 3 8722 3596 10.21105 The Open Journal 0 Crossref 26
## 4 8722 5074 10.21105 The Open Journal 0 Crossref 9
## 5 8722 5451 10.21105 The Open Journal 0 Crossref 17
## 6 8722 5149 10.21105 The Open Journal 0 Crossref 12
## references.count is.referenced.by.count
## 1 16 1
## 2 33 1
## 3 26 4
## 4 9 0
## 5 17 0
## 6 12 0
## title
## 1 Foundry-ML - Software and Services to Simplify Access\nto Machine Learning Datasets in Materials Science
## 2 PhysioLabXR: A Python Platform for Real-Time,\nMulti-modal, Brain–Computer Interfaces and Extended Reality\nExperiments
## 3 Nempy: A Python package for modelling the Australian National Electricity Market dispatch procedure
## 4 pycoxmunk: A python package for computing sea surface\nreflectance
## 5 Spikeometric: Linear Non-Linear Cascade Spiking Neural\nNetworks with Pytorch Geometric
## 6 PyExperimenter: Easily distribute experiments and track\nresults
## type url volume
## 1 journal-article https://doi.org/10.21105/joss.05467 9
## 2 journal-article https://doi.org/10.21105/joss.05854 9
## 3 journal-article https://doi.org/10.21105/joss.03596 7
## 4 journal-article https://doi.org/10.21105/joss.05074 8
## 5 journal-article https://doi.org/10.21105/joss.05451 8
## 6 journal-article https://doi.org/10.21105/joss.05149 8
## short.container.title
## 1 JOSS
## 2 JOSS
## 3 JOSS
## 4 JOSS
## 5 JOSS
## 6 JOSS
## author
## 1 http://orcid.org/0000-0002-9373-0058, http://orcid.org/0000-0002-3917-605X, http://orcid.org/0000-0002-1323-5939, NA, http://orcid.org/0000-0001-6817-7265, NA, NA, NA, NA, NA, NA, NA, http://orcid.org/0000-0003-2229-6730, NA, NA, http://orcid.org/0000-0002-7652-6776, http://orcid.org/0000-0001-9438-4284, http://orcid.org/0000-0002-4911-0046, http://orcid.org/0000-0003-2129-5269, http://orcid.org/0000-0002-5326-4902, FALSE, FALSE, FALSE, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, FALSE, NA, NA, FALSE, FALSE, FALSE, FALSE, FALSE, KJ, Aristana, Logan, Steve, Marcus, Isaac, Ethan, Aadit, Ribhav, Zoa, Jingrui, Xiangguo, Ryan, Lane, Doyeon, Michael, Paul M., Dane, Ian, Ben, Schmidt, Scourtas, Ward, Wangen, Schwarting, Darling, Truelove, Ambadkar, Bose, Katok, Wei, Li, Jacobs, Schultz, Kim, Ferris, Voyles, Morgan, Foster, Blaiszik, first, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional, additional
## 2 http://orcid.org/0000-0001-5187-200X, http://orcid.org/0000-0003-1856-5627, http://orcid.org/0009-0006-2304-7591, http://orcid.org/0009-0000-1824-970X, http://orcid.org/0009-0005-1211-6101, http://orcid.org/0000-0001-9978-7090, http://orcid.org/0000-0002-9738-1342, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, Ziheng ‘Leo’, Haowen ‘John’, Ziwen, Yunxiang, June Pyo, Steven, Paul, Li, Wei, Xie, Peng, Suh, Feiner, Sajda, first, additional, additional, additional, additional, additional, additional
## 3 Nicholas, Anna, Iain, Gorman, Bruce, MacGill, first, additional, additional
## 4 http://orcid.org/0000-0003-3880-6774, FALSE, Simon R., Proud, first
## 5 http://orcid.org/0009-0009-0584-9293, NA, http://orcid.org/0000-0002-4262-5549, FALSE, NA, FALSE, Jakob L., Herman, Mikkel Elle, Sønstebø, Brunborg, Lepperød, first, additional, additional
## 6 http://orcid.org/0000-0001-9954-462X, http://orcid.org/0000-0002-2415-2186, http://orcid.org/0000-0001-8057-4650, NA, http://orcid.org/0000-0001-9447-0609, http://orcid.org/0000-0002-1231-4985, http://orcid.org/0000-0002-9293-2424, http://orcid.org/0000-0001-9782-6818, FALSE, FALSE, FALSE, NA, FALSE, FALSE, FALSE, FALSE, Tanja, Alexander, Lukas, Lukas, Helena, Jonas, Felix, Marcel, Tornede, Tornede, Fehring, Gehring, Graf, Hanselle, Mohr, Wever, first, additional, additional, additional, additional, additional, additional, additional
## citation_count openalex_id affil_countries_all
## 1 3 https://openalex.org/W4391136690
## 2 1 https://openalex.org/W4390732915
## 3 4 https://openalex.org/W4211164564 AU
## 4 0 https://openalex.org/W4380629387 GB
## 5 0 https://openalex.org/W4386852434 NO
## 6 0 https://openalex.org/W4366590441
## affil_countries_first
## 1
## 2
## 3 AU
## 4 GB
## 5 NO
## 6
## api_title
## 1 Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science
## 2 PhysioLabXR: A Python Platform for Real-Time, Multi-modal, Brain–Computer Interfaces and Extended Reality Experiments
## 3 Nempy: A Python package for modelling the Australian National Electricity Market dispatch procedure
## 4 pycoxmunk: A python package for computing sea surface reflectance
## 5 Spikeometric: Linear Non-Linear Cascade Spiking Neural Networks with Pytorch Geometric
## 6 PyExperimenter: Easily distribute experiments and track results
## api_state editor reviewers nbr_reviewers
## 1 accepted @Fei-Tao @duhd1993,@marshallmcdonnell 2
## 2 accepted @mstimberg @lucask07,@nastaran62 2
## 3 accepted @timtroendle @noah80,@robinroche 2
## 4 accepted @pdebuyl @arthur-e,@molinav 2
## 5 accepted @jbytecode @clinssen,@Saran-nns 2
## 6 accepted @timtroendle @ArsamAryandoust,@schnorr 2
## repo_url review_issue_id prereview_issue_id
## 1 https://github.com/MLMI2-CSSI/foundry 5467 5412
## 2 https://github.com/PhysioLabXR/PhysioLabXR 5854 5838
## 3 https://github.com/UNSW-CEEM/nempy 3596 3576
## 4 https://github.com/simonrp84/PyCoxMunk 5074 4919
## 5 https://github.com/bioAI-Oslo/Spikeometric 5451 5207
## 6 https://github.com/tornede/py_experimenter 5149 4974
## languages archive_doi
## 1 Python https://doi.org/10.5281/zenodo.10494644
## 2 Python,C++,Cython https://doi.org/10.5281/zenodo.10471500
## 3 Python https://doi.org/10.5281/zenodo.5989170
## 4 Python https://doi.org/10.5281/zenodo.8020079
## 5 Python https://doi.org/10.5281/zenodo.8358903
## 6 Python https://doi.org/10.5281/zenodo.7838280
## review_title
## 1 Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science
## 2 PhysioLabXR: A Python Platform for Real-Time, Multi-modal, Brain–Computer Interfaces and Extended Reality Experiments
## 3 Nempy: A Python package for modelling the Australian National Electricity Market dispatch procedure
## 4 pycoxmunk: A python package for computing sea surface reflectance
## 5 Spikeometric - Linear Non-Linear Cascade Spiking Neural Networks with PyTorch Geometric
## 6 PyExperimenter: Easily distribute experiments and track results
## review_number review_state review_opened review_closed review_ncomments
## 1 5467 closed 2023-05-16 2024-01-23 95
## 2 5854 closed 2023-09-18 2024-01-11 96
## 3 3596 closed 2021-08-10 2022-02-10 83
## 4 5074 closed 2023-01-13 2023-06-13 94
## 5 5451 closed 2023-05-08 2023-09-20 79
## 6 5149 closed 2023-02-13 2023-04-20 59
## review_labels
## 1 accepted,TeX,Python,recommend-accept,published,Track: 2 (BCM)
## 2 accepted,TeX,Python,C++,recommend-accept,published,Track: 2 (BCM)
## 3 accepted,Python,recommend-accept,published
## 4 accepted,TeX,Python,recommend-accept,published,waitlisted,Track: 6 (ESE)
## 5 accepted,TeX,Python,recommend-accept,published,Track: 5 (DSAIS)
## 6 accepted,TeX,Python,recommend-accept,published,Track: 5 (DSAIS)
## prerev_title
## 1 Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science
## 2 PhysioLabXR: A Python Platform for Real-Time, Multi-modal, Brain–Computer Interfaces and Extended Reality Experiments
## 3 Nempy: A Python package for modelling the Australian National Electricity Market dispatch procedure
## 4 pycoxmunk: A python package for computing sea surface reflectance
## 5 Spikeometric - Linear Non-Linear Cascade Spiking Neural Networks with PyTorch Geometric
## 6 PyExperimenter: Easily distribute experiments and track results
## prerev_state prerev_opened prerev_closed prerev_ncomments
## 1 closed 2023-04-26 2023-05-16 36
## 2 closed 2023-09-11 2023-09-18 26
## 3 closed 2021-08-06 2021-08-10 23
## 4 closed 2022-11-08 2023-01-13 20
## 5 closed 2023-03-02 2023-05-08 31
## 6 closed 2022-11-27 2023-02-13 34
## prerev_labels days_in_pre days_in_rev to_review
## 1 TeX,Python,Track: 2 (BCM) 20 days 252 days TRUE
## 2 TeX,Python,C++,Track: 2 (BCM) 7 days 115 days TRUE
## 3 Python,waitlisted 4 days 184 days TRUE
## 4 TeX,Python,waitlisted,Track: 6 (ESE) 66 days 151 days TRUE
## 5 TeX,Python,Track: 5 (DSAIS) 67 days 135 days TRUE
## 6 TeX,Python,waitlisted,Track: 5 (DSAIS) 78 days 66 days TRUE
## repo_created repo_updated repo_pushed repo_nbr_stars repo_language
## 1 2020-01-24 2024-10-20 2025-01-22 80 Python
## 2 2021-01-26 2025-04-10 2025-04-28 34 Python
## 3 2020-04-14 2025-04-17 2025-03-07 58 Python
## 4 2020-10-01 2025-03-21 2025-03-21 9 Python
## 5 2022-05-30 2025-01-23 2023-10-01 8 Python
## 6 2021-05-26 2025-04-19 2025-04-25 35 Python
## repo_languages_bytes
## 1 Python:131974
## 2 Python:1561419,Cython:13346,C:12644,C++:1827,Shell:1786
## 3 Python:892252
## 4 Python:117779
## 5 Python:131497
## 6 Python:156927,TeX:6250
## repo_topics
## 1 data-science,machine-learning,materials-science,datasets,chemistry
## 2
## 3
## 4 earth-observation,oceanography,reflectance,remote-sensing,satellite
## 5
## 6 database,executor,experiments,python
## repo_license repo_nbr_contribs repo_nbr_contribs_2ormore repo_info_obtained
## 1 mit 19 19 2025-04-30
## 2 gpl-3.0 17 14 2025-04-30
## 3 bsd-3-clause 7 5 2025-04-30
## 4 gpl-3.0 2 2 2025-04-30
## 5 gpl-3.0 3 3 2025-04-30
## 6 mit 5 4 2025-04-30
## published.date halfyear nbr_authors
## 1 2024-01-23 2024H1 20
## 2 2024-01-11 2024H1 7
## 3 2022-02-10 2022H1 3
## 4 2023-06-13 2023H1 1
## 5 2023-09-20 2023H2 3
## 6 2023-04-20 2023H1 8
saveRDS(papers, file = "joss_submission_analytics.rds")
To read the current version of this file directly from GitHub, use the following code:
papers <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true")))
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.7.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: UTC
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] openalexR_2.0.1 stringr_1.5.1 gt_1.0.0 rworldmap_1.3-8
## [5] sp_2.2-0 readr_2.1.5 citecorp_0.3.0 plotly_4.10.4
## [9] DT_0.33 jsonlite_2.0.0 purrr_1.0.4 gh_1.4.1
## [13] lubridate_1.9.4 ggplot2_3.5.2 tidyr_1.3.1 dplyr_1.1.4
## [17] rcrossref_1.2.009 tibble_3.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 farver_2.1.2 viridis_0.6.5
## [5] urltools_1.7.3 fields_16.3.1 fastmap_1.2.0 lazyeval_0.2.2
## [9] promises_1.3.2 digest_0.6.37 dotCall64_1.2 timechange_0.3.0
## [13] mime_0.13 lifecycle_1.0.4 terra_1.8-42 magrittr_2.0.3
## [17] compiler_4.5.0 rlang_1.1.6 sass_0.4.10 tools_4.5.0
## [21] wordcloud_2.6 utf8_1.2.5 yaml_2.3.10 data.table_1.17.0
## [25] knitr_1.50 labeling_0.4.3 fauxpas_0.5.2 htmlwidgets_1.6.4
## [29] bit_4.6.0 curl_6.2.2 plyr_1.8.9 xml2_1.3.8
## [33] RColorBrewer_1.1-3 httpcode_0.3.0 miniUI_0.1.2 withr_3.0.2
## [37] triebeard_0.4.1 grid_4.5.0 xtable_1.8-4 gitcreds_0.1.2
## [41] scales_1.4.0 crul_1.5.0 cli_3.6.5 rmarkdown_2.29
## [45] crayon_1.5.3 generics_0.1.3 httr_1.4.7 tzdb_0.5.0
## [49] cachem_1.1.0 splines_4.5.0 maps_3.4.2.1 parallel_4.5.0
## [53] vctrs_0.6.5 Matrix_1.7-3 hms_1.1.3 bit64_4.6.0-1
## [57] crosstalk_1.2.1 jquerylib_0.1.4 glue_1.8.0 spam_2.11-1
## [61] codetools_0.2-20 stringi_1.8.7 gtable_0.3.6 later_1.4.2
## [65] raster_3.6-32 pillar_1.10.2 rappdirs_0.3.3 htmltools_0.5.8.1
## [69] R6_2.6.1 httr2_1.1.2 vroom_1.6.5 evaluate_1.0.3
## [73] shiny_1.10.0 lattice_0.22-6 httpuv_1.6.16 bslib_0.9.0
## [77] Rcpp_1.0.14 gridExtra_2.3 nlme_3.1-168 mgcv_1.9-1
## [81] whisker_0.4.1 xfun_0.52 pkgconfig_2.0.3