Introduction

In this report, we extract information about published JOSS papers and generate
graphics as well as a summary table that can be downloaded and used for further analyses.

Load required R packages

suppressPackageStartupMessages({
    library(tibble)
    library(rcrossref)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(lubridate)
    library(gh)
    library(purrr)
    library(jsonlite)
    library(DT)
    library(plotly)
    library(citecorp)
    library(readr)
    library(rworldmap)
    library(gt)
    library(stringr)
    library(openalexR)
})
## Keep track of the source of each column
source_track <- c()

## Determine whether to add a caption with today's date to the (non-interactive) plots
add_date_caption <- TRUE
dcap <- if (add_date_caption) {
    lubridate::today()
} else {
    ""
}
## Get list of countries and populations (2022) from the rworldmap/gt packages
data("countrySynonyms")
country_names <- countrySynonyms %>%
    dplyr::select(-ID) %>%
    tidyr::pivot_longer(cols = -ISO3, names_to = "tmp", values_to = "name") %>%
    dplyr::filter(name != "") %>%
    dplyr::select(-tmp)

## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL),
## distributed via the gt R package
country_populations <- dplyr::filter(countrypops, year == 2022)
## Read archived version of summary data frame, to use for filling in 
## information about software repositories (due to limit on API requests)
## Sort by the date when software repo info was last obtained (papers with
## no recorded date first)
papers_archive <- readRDS(gzcon(url(
    "https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"
))) |>
    dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained)

## Similarly for citation analysis, to avoid having to pull down the 
## same information multiple times. All columns are read as character.
citations_archive <- readr::read_delim(
    url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
    delim = "\t",
    col_names = TRUE,
    col_types = readr::cols(.default = "c")
)

Collect information about papers

Pull down paper info from Crossref and citation information from OpenAlex

We get the information about published JOSS papers from Crossref, using the rcrossref R package. The openalexR R package is used to extract citation counts from OpenAlex.

## First check how many records there are in Crossref
## ISSN of the Journal of Open Source Software
issn <- "2475-9066"
## Journal-level summary metadata (works = FALSE returns journal info,
## including the total number of registered DOIs)
joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>%
    pluck("data")
## Store and print the total number of DOIs (outer parentheses auto-print)
(total_dois <- joss_details$total_dois)
## [1] 3457
## Pull down all records from Crossref
## cursor = "*" enables deep paging; cursor_max is set to twice the number
## of DOIs as a safety margin so that all records are retrieved
papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*",
    cursor_max = joss_details$total_dois * 2) %>%
    pluck("data")

## Only keep articles
papers <- dplyr::filter(papers, type == "journal-article")
dim(papers)
## [1] 3457   28
dim(dplyr::distinct(papers))
## [1] 3457   28
## Check that all papers were pulled down and stop otherwise
if (nrow(dplyr::distinct(papers)) < total_dois) {
    stop("Not all papers were pulled down from Crossref!")
}

## A few papers don't have alternative.ids - generate them from the DOI
missing_altid <- is.na(papers$alternative.id)
papers$alternative.id[missing_altid] <- papers$doi[missing_altid]

## Get citation info from Crossref and merge with paper details
# cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
# papers <- papers %>% dplyr::left_join(
#     cit %>% dplyr::rename(citation_count = count), 
#     by = c("alternative.id" = "doi")
# )

## Remove one duplicated paper
papers <- dplyr::filter(papers, alternative.id != "10.21105/joss.00688")
dim(papers)
## [1] 3456   28
dim(dplyr::distinct(papers))
## [1] 3456   28
papers$alternative.id[duplicated(papers$alternative.id)]
## character(0)
## All columns obtained so far come from Crossref
source_track <- c(source_track,
                  structure(rep("crossref", ncol(papers)),
                            names = colnames(papers)))
## Get info from openalexR and merge with paper details
## Helper function to extract countries from affiliations. Note that this 
## information is not available for all papers.
##
## @param df One paper's 'authorships' entry as returned by
##     openalexR::oa_fetch(); expected to be a data frame with an 'id'
##     column and an 'affiliations' list-column, or a scalar NA/NULL when
##     missing -- TODO confirm against the openalexR return format.
## @param wh If "first", keep only the first affiliation per author;
##     any other value keeps all affiliations.
##
## @return A single string of unique country codes separated by ";",
##     or "" when no country information is available.
.get_countries <- function(df, wh = "first") {
    ## No authorship information (scalar NA) or no affiliations recorded
    if ((length(df) == 1 && is.na(df)) || is.null(df$affiliations)) {
        ""
    } else {
        if (wh == "first") {
            ## Only first affiliation for each author
            ## (!duplicated(id) keeps the first unnested row per author id).
            ## NOTE(review): duplicated() is evaluated together with the NA
            ## filter, so an author whose first affiliation lacks a country
            ## code is dropped entirely rather than falling back to the next
            ## affiliation -- confirm this is intended.
            tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 
                dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |>
                pull(affiliations_country_code)
        } else {
            ## All affiliations
            tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 
                dplyr::filter(!is.na(affiliations_country_code)) |>
                pull(affiliations_country_code)
        }
        ## Collapse the unique country codes into one ";"-separated string
        if (length(tmp) > 0) {
            tmp |>
                unique() |>
                paste(collapse = ";")
        } else {
            ""
        }
    }
}

## Fetch all JOSS works from OpenAlex; "s4210214273" is presumably the
## OpenAlex source id for JOSS -- TODO confirm
oa <- oa_fetch(entity = "works", 
               primary_location.source.id = "s4210214273") |>
    ## Add ";"-separated affiliation country codes per paper: all
    ## affiliations, and first affiliation per author, respectively
    mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"),
           affil_countries_first = vapply(authorships, .get_countries, "", wh = "first"))
dim(oa)
## [1] 3456   45
length(unique(oa$doi))
## [1] 3455
## Merge citation counts and country info into the papers table. OpenAlex
## DOIs are full URLs, so strip the https://doi.org/ prefix to match
## alternative.id.
papers <- papers %>% dplyr::left_join(
    oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>%
        dplyr::select(alternative.id, cited_by_count, id,
                      affil_countries_all, affil_countries_first) %>%
        dplyr::rename(citation_count = cited_by_count, 
                      openalex_id = id),
    by = "alternative.id"
)
dim(papers)
## [1] 3457   32
dim(papers %>% distinct())
## [1] 3457   32
## Columns added in this step come from OpenAlex
source_track <- c(source_track, 
                  structure(rep("OpenAlex", length(setdiff(colnames(papers),
                                                           names(source_track)))), 
                            names = setdiff(colnames(papers), names(source_track))))

Pull down info from JOSS API

For each published paper, we use the JOSS API to get information about pre-review and review issue numbers, the corresponding software repository, and other submission metadata.

## Page through the JOSS API, accumulating the raw records. Paging stops
## when a page is empty, identical to the previous page, or a request
## error occurs (mapped to an empty result by tryCatch).
joss_api <- list()
page <- 1
prev_page <- NULL
current_page <- jsonlite::fromJSON(
    url(paste0("https://joss.theoj.org/papers/published.json?page=", page)),
    simplifyDataFrame = FALSE
)
while (length(current_page) > 0 && !identical(current_page, prev_page)) {
    joss_api <- c(joss_api, current_page)
    page <- page + 1
    prev_page <- current_page
    current_page <- tryCatch(
        jsonlite::fromJSON(
            url(paste0("https://joss.theoj.org/papers/published.json?page=", page)),
            simplifyDataFrame = FALSE
        ),
        error = function(e) numeric(0)
    )
}

## Flatten the per-paper JSON records into a data frame with one row per paper
joss_api <- do.call(dplyr::bind_rows, lapply(joss_api, function(w) {
    data.frame(api_title = w$title, 
               api_state = w$state,
               ## Unique affiliations across all authors, ";"-separated
               author_affiliations = paste(unique(unlist(lapply(w$authors, "[[", "affiliation"))), collapse = ";"),
               editor = paste(w$editor, collapse = ","),
               reviewers = paste(w$reviewers, collapse = ","),
               nbr_reviewers = length(w$reviewers),
               repo_url = w$software_repository,
               ## Keep only the issue number from the review issue URL
               review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/", 
                                     "", w$paper_review),
               doi = w$doi,
               ## if/else instead of ifelse() on a scalar condition: ifelse()
               ## would silently truncate a length > 1 value to its first
               ## element, and if/else is the idiomatic scalar form
               prereview_issue_id = if (!is.null(w$meta_review_issue_id)) {
                   w$meta_review_issue_id
               } else {
                   NA_integer_
               },
               languages = gsub(", ", ",", w$languages),
               archive_doi = w$software_archive)
}))
dim(joss_api)
## [1] 3457   12
dim(joss_api %>% distinct())
## [1] 3457   12
## Check that all papers were pulled down and stop otherwise
if (!(nrow(joss_api %>% distinct()) >= total_dois)) {
    stop("Not all papers were pulled down from the JOSS API!")
}
## Inspect software repositories shared by more than one published paper
joss_api$repo_url[duplicated(joss_api$repo_url)]
##  [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
##  [2] "https://github.com/nomad-coe/greenX"          
##  [3] "https://github.com/idaholab/moose"            
##  [4] "https://gitlab.com/libreumg/dataquier.git"    
##  [5] "https://github.com/idaholab/moose"            
##  [6] "https://github.com/dynamicslab/pysindy"       
##  [7] "https://github.com/landlab/landlab"           
##  [8] "https://github.com/landlab/landlab"           
##  [9] "https://github.com/symmy596/SurfinPy"         
## [10] "https://github.com/arviz-devs/arviz"          
## [11] "https://github.com/bcgov/ssdtools"            
## [12] "https://github.com/landlab/landlab"           
## [13] "https://github.com/pvlib/pvlib-python"        
## [14] "https://github.com/mlpack/mlpack"             
## [15] "https://github.com/julia-wrobel/registr"      
## [16] "https://github.com/barbagroup/pygbe"
## Merge the JOSS API information into the papers table (by DOI)
papers <- dplyr::left_join(papers, joss_api, by = c("alternative.id" = "doi"))
dim(papers)
## [1] 3457   43
dim(dplyr::distinct(papers))
## [1] 3457   43
papers$repo_url[duplicated(papers$repo_url)]
##  [1] "https://github.com/mlpack/mlpack"             
##  [2] "https://github.com/QTC-UMD/rydiqule"          
##  [3] "https://github.com/nomad-coe/greenX"          
##  [4] "https://github.com/bcgov/ssdtools"            
##  [5] "https://github.com/barbagroup/pygbe"          
##  [6] "https://github.com/dynamicslab/pysindy"       
##  [7] "https://github.com/landlab/landlab"           
##  [8] "https://github.com/idaholab/moose"            
##  [9] "https://github.com/idaholab/moose"            
## [10] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [11] "https://github.com/landlab/landlab"           
## [12] "https://github.com/arviz-devs/arviz"          
## [13] "https://github.com/symmy596/SurfinPy"         
## [14] "https://github.com/julia-wrobel/registr"      
## [15] "https://github.com/pvlib/pvlib-python"        
## [16] "https://github.com/landlab/landlab"           
## [17] "https://gitlab.com/libreumg/dataquier.git"
## Record that the columns added above come from the JOSS API
new_cols <- setdiff(colnames(papers), names(source_track))
source_track <- c(source_track,
                  structure(rep("JOSS_API", length(new_cols)),
                            names = new_cols))

Combine with info from GitHub issues

From each pre-review and review issue, we extract information about review times and assigned labels.

## Pull down info on all issues in the joss-reviews repository
issues <- gh("/repos/openjournals/joss-reviews/issues", 
             .limit = 15000, state = "all")
## From each issue, extract required information
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
    data.frame(title = i$title, 
               number = i$number,
               state = i$state,
               opened = i$created_at,
               ## if/else instead of ifelse() on a scalar condition: clearer
               ## and does not truncate a length > 1 value to its first element
               closed = if (!is.null(i$closed_at)) i$closed_at else NA_character_,
               ncomments = i$comments,
               ## All label names, excluding the generic workflow labels
               labels = paste(setdiff(
                   vapply(i$labels, getElement, 
                          name = "name", character(1L)),
                   c("review", "pre-review", "query-scope", "paused")),
                   collapse = ","))
}))

## Split into REVIEW, PRE-REVIEW, and other issues (the latter category 
## is discarded)
issother <- iss %>%
    dplyr::filter(!grepl("\\[PRE REVIEW\\]", title),
                  !grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 198   7
head(issother)
##                                                                                                                        title
## 1                                                             Update wording of collaborative effort reviewer checklist item
## 2                                                                                                             Create Web App
## 3                                             [JOSS] zoom-lod-engine: A zoom-aware level-of-detail resolver using hysteresis
## 4                                                                                                            Review Comments
## 5                                                                                                      Paper Review comments
## 6 Invalid rejection SiA-WD: An R Shiny Application for Systematic Evaluation of Wearables in Behavioural and Stress Research
##   number  state               opened               closed ncomments labels
## 1  10126 closed 2026-02-27T15:51:09Z 2026-03-01T08:26:48Z         2       
## 2  10121 closed 2026-02-26T15:48:21Z 2026-02-26T15:48:23Z         1       
## 3   9931 closed 2026-02-04T10:22:06Z 2026-02-04T10:22:09Z         1       
## 4   9927 closed 2026-02-02T16:49:54Z 2026-02-02T16:49:57Z         1       
## 5   9920 closed 2026-01-30T07:05:08Z 2026-01-30T07:05:10Z         1       
## 6   9911 closed 2026-01-28T12:00:02Z 2026-01-28T12:00:05Z         1
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the run of leading zeros needed to left-pad issue number `s` to
## five digits (e.g. 123 -> "00"). The previous rep(0, 5 - nchar(s)) form
## raised an error ("invalid 'times' argument") once issue numbers exceed
## five digits; clamping the width at zero makes it safe (returns "").
getnbrzeros <- function(s) {
    strrep("0", max(0L, 5L - nchar(s)))
}
## Build the REVIEW-issue table: derive the paper DOI from the zero-padded
## issue number, strip the "[REVIEW]: " prefix from the title, and prefix
## all columns except alternative.id with "review_"
issrev <- iss %>%
    dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
    dplyr::mutate(
        alternative.id = paste0("10.21105/joss.",
                                purrr::map_chr(number, getnbrzeros),
                                number),
        title = gsub("\\[REVIEW\\]: ", "", title)
    ) %>%
    dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .))
## For pre-review and review issues, respectively, get the number of 
## issues closed each month, and the number of those that have the 
## 'rejected' label
## Helper: monthly counts of closed issues (and of those carrying the
## 'rejected' label) for issues whose title matches `pattern`, tagged
## with issue type `issue_type`. Factors out the previously duplicated
## review/pre-review pipelines.
.monthly_rejections <- function(issue_df, pattern, issue_type) {
    issue_df %>% 
        dplyr::filter(grepl(pattern, title)) %>% 
        dplyr::filter(!is.na(closed)) %>%
        dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>%
        dplyr::group_by(closedmonth) %>%
        dplyr::summarize(nbr_issues_closed = length(labels),
                         nbr_rejections = sum(grepl("rejected", labels))) %>%
        dplyr::mutate(itype = issue_type)
}

review_rejected <- .monthly_rejections(iss, "\\[REVIEW\\]", "review")
prereview_rejected <- .monthly_rejections(iss, "\\[PRE REVIEW\\]", "pre-review")

all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected)
## Get only pre-review issues plus review issues opened before 2016-09-18, 
## will use these as a proxy for the number of submissions
pi1 <- iss %>%
    dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>%
    dplyr::mutate(opened = as.Date(opened))
dim(pi1)
## [1] 6019    7
pi2 <- iss %>%
    dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
    dplyr::filter(as.Date(opened) <= as.Date("2016-09-18")) %>%
    dplyr::mutate(opened = as.Date(opened))
dim(pi2)
## [1] 49  7
prereview_issues <- dplyr::bind_rows(pi1, pi2)
## For PRE-REVIEW issues, add information about the corresponding REVIEW 
## issue number. Withdrawn and rejected submissions are excluded.
isspre <- iss |>
    dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) |>
    dplyr::filter(!grepl("withdrawn", labels)) |>
    dplyr::filter(!grepl("rejected", labels)) |>
    ## Some titles have multiple pre-review issues. In these cases, keep the latest
    dplyr::arrange(desc(number)) |>
    dplyr::filter(!duplicated(title)) |>
    dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) |>
    dplyr::rename_all(~ paste0("prerev_", .))

## Merge review and pre-review issue info into the papers table, convert
## the issue timestamps to Date, and compute the time spent in each stage
papers <- papers %>%
    dplyr::left_join(issrev, by = "alternative.id") %>%
    dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
    dplyr::mutate(dplyr::across(c(prerev_opened, prerev_closed,
                                  review_opened, review_closed),
                                as.Date)) %>%
    dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
                  days_in_rev = review_closed - review_opened,
                  to_review = !is.na(review_opened))
dim(papers)
## [1] 3457   59
dim(dplyr::distinct(papers))
## [1] 3457   59
## Record that the columns added above come from the joss-reviews GitHub repo
new_issue_cols <- setdiff(colnames(papers), names(source_track))
source_track <- c(source_track,
                  structure(rep("joss-github", length(new_issue_cols)),
                            names = new_issue_cols))

Add information from software repositories

## Reorder so that software repositories that were interrogated longest 
## ago are checked first
## match() gives each paper's position in the archive (NA when the paper 
## is new and not in the archive); na.last = FALSE puts the never-checked 
## papers first, followed by those checked longest ago (papers_archive is 
## sorted by repo_info_obtained above)
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
                  na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
## Show repositories associated with more than one paper
software_urls[duplicated(software_urls)]
##  [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
##  [2] "https://gitlab.com/libreumg/dataquier.git"    
##  [3] "https://github.com/QTC-UMD/rydiqule"          
##  [4] "https://github.com/mlpack/mlpack"             
##  [5] "https://github.com/nomad-coe/greenX"          
##  [6] "https://github.com/bcgov/ssdtools"            
##  [7] "https://github.com/barbagroup/pygbe"          
##  [8] "https://github.com/dynamicslab/pysindy"       
##  [9] "https://github.com/landlab/landlab"           
## [10] "https://github.com/idaholab/moose"            
## [11] "https://github.com/idaholab/moose"            
## [12] "https://github.com/arviz-devs/arviz"          
## [13] "https://github.com/landlab/landlab"           
## [14] "https://github.com/symmy596/SurfinPy"         
## [15] "https://github.com/julia-wrobel/registr"      
## [16] "https://github.com/pvlib/pvlib-python"        
## [17] "https://github.com/landlab/landlab"
## Flag GitHub-hosted repositories (presumably to decide which repos can
## be queried via the GitHub API below).
## NOTE(review): grepl matches "github" anywhere in the URL, not just the
## host -- confirm no non-GitHub URL contains that substring.
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 3457
sum(is_github)
## [1] 3277
## List the non-GitHub repositories (GitLab, Bitbucket, etc.)
software_urls[!is_github]
##   [1] "https://gitlab.kuleuven.be/ITSCreaLab/public-toolboxes/dyntapy"                  
##   [2] "https://gitlab.com/morikawa-lab-osakau/vibir-parallel-compute"                   
##   [3] "https://gitlab.com/ENKI-portal/ThermoCodegen"                                    
##   [4] "https://bitbucket.org/orionmhdteam/orion2_release1/src/master/"                  
##   [5] "https://gitlab.com/oali/dxtr"                                                    
##   [6] "https://gitlab.dune-project.org/copasi/dune-copasi"                              
##   [7] "https://gitlab.com/bonsamurais/bonsai/util/ipcc"                                 
##   [8] "https://codebase.helmholtz.cloud/mussel/netlogo-northsea-species.git"            
##   [9] "https://gitlab.com/ffaucher/hawen"                                               
##  [10] "https://gitlab.com/cmbm-ethz/miop"                                               
##  [11] "https://gitlab.com/cosmograil/starred"                                           
##  [12] "https://gitlab.com/emd-dev/emd"                                                  
##  [13] "https://gite.lirmm.fr/doccy/RedOak"                                              
##  [14] "https://bitbucket.org/rram/dvrlib/src/joss/"                                     
##  [15] "https://gitlab.com/mantik-ai/mantik"                                             
##  [16] "https://gitlab.com/djsmithbham/cnearest"                                         
##  [17] "https://gitlab.com/sails-dev/sails"                                              
##  [18] "https://gitlab.kitware.com/LBM/lattice-boltzmann-solver"                         
##  [19] "https://gitlab.com/dsbowen/conditional-inference"                                
##  [20] "https://gitlab.com/soleil-data-treatment/soleil-software-projects/remote-desktop"
##  [21] "https://bitbucket.org/berkeleylab/esdr-pygdh/"                                   
##  [22] "https://code.europa.eu/kada/mafw"                                                
##  [23] "https://gitlab.com/drti/basic-tools"                                             
##  [24] "https://gitlab.com/moorepants/skijumpdesign"                                     
##  [25] "https://bitbucket.org/ocellarisproject/ocellaris"                                
##  [26] "https://git.iws.uni-stuttgart.de/tools/frackit"                                  
##  [27] "https://gitlab.com/cmbm-ethz/pourbaix-diagrams"                                  
##  [28] "https://bitbucket.org/cloopsy/android/"                                          
##  [29] "https://gitlab.com/pythia-uq/pythia"                                             
##  [30] "https://gitlab.com/fduchate/predihood"                                           
##  [31] "https://jugit.fz-juelich.de/compflu/swalbe.jl/"                                  
##  [32] "https://gitlab.dune-project.org/dorie/dorie"                                     
##  [33] "https://gitlab.com/micromorph/ratel"                                             
##  [34] "https://gitlab.com/dmt-development/dmt-core"                                     
##  [35] "https://gitlab.com/wpettersson/kep_solver"                                       
##  [36] "https://gitlab.com/myqueue/myqueue"                                              
##  [37] "https://gitlab.com/dlr-ve/esy/remix/framework"                                   
##  [38] "https://gitlab.com/utopia-project/dantro"                                        
##  [39] "https://gitlab.com/gdetor/genetic_alg"                                           
##  [40] "https://framagit.org/GustaveCoste/off-product-environmental-impact/"             
##  [41] "https://gitlab.com/InspectorCell/inspectorcell"                                  
##  [42] "https://plmlab.math.cnrs.fr/lmrs/statistique/smmR"                               
##  [43] "https://gitlab.com/dlr-dw/ontocode"                                              
##  [44] "https://gitlab.com/dlr-ve/esy/amiris/amiris"                                     
##  [45] "https://bitbucket.org/glotzer/rowan"                                             
##  [46] "https://gitlab.com/fame-framework/fame-io"                                       
##  [47] "https://code.usgs.gov/umesc/quant-ecology/fishstan/"                             
##  [48] "https://gitlab.com/fame-framework/fame-core"                                     
##  [49] "https://gitlab.com/thartwig/asloth"                                              
##  [50] "https://gitlab.com/habermann_lab/phasik"                                         
##  [51] "https://gitlab.com/dlr-ve/autumn/"                                               
##  [52] "https://gitlab.com/ags-data-format-wg/ags-python-library"                        
##  [53] "https://zivgitlab.uni-muenster.de/ag-salinga/fastatomstruct"                     
##  [54] "https://gitlab.com/tue-umphy/software/parmesan"                                  
##  [55] "https://gitlab.com/celliern/scikit-fdiff/"                                       
##  [56] "https://gitlab.com/datafold-dev/datafold/"                                       
##  [57] "https://gitlab.com/tesch1/cppduals"                                              
##  [58] "https://gitlab.com/open-darts/open-darts"                                        
##  [59] "https://gitlab.com/materials-modeling/calorine"                                  
##  [60] "https://gitlab.com/mauricemolli/petitRADTRANS"                                   
##  [61] "https://gitlab.mpikg.mpg.de/curcuraci/bmiptools"                                 
##  [62] "https://gitlab.com/grogra/groimp-plugins/api"                                    
##  [63] "https://gitlab.inria.fr/miet/miet"                                               
##  [64] "https://savannah.nongnu.org/projects/complot/"                                   
##  [65] "http://mutabit.com/repos.fossil/grafoscopio/"                                    
##  [66] "https://gitlab.com/cerfacs/batman"                                               
##  [67] "https://gitlab.com/manchester_qbi/manchester_qbi_public/madym_cxx/"              
##  [68] "https://gitlab.com/akantu/akantu"                                                
##  [69] "https://bitbucket.org/cardosan/brightway2-temporalis"                            
##  [70] "https://gitlab.com/marinvaders/marinvaders"                                      
##  [71] "https://gitlab.kuleuven.be/gelenslab/publications/pycline"                       
##  [72] "https://gitlab.gwdg.de/mpievolbio-it/crbhits"                                    
##  [73] "https://bitbucket.org/bmskinner/nuclear_morphology"                              
##  [74] "https://git.rwth-aachen.de/ants/sensorlab/imea"                                  
##  [75] "https://bitbucket.org/sciencecapsule/sciencecapsule"                             
##  [76] "https://gitlab.com/bioeconomy/forobs/biotrade/"                                  
##  [77] "https://gitlab.com/lheea/CN-AeroModels"                                          
##  [78] "https://gitlab.ruhr-uni-bochum.de/ee/cd2es"                                      
##  [79] "https://www.idpoisson.fr/fullswof/"                                              
##  [80] "https://gitlab.com/uniluxembourg/hpc/research/cadom/serializable-simpy"          
##  [81] "https://codeberg.org/benmagill/deflake.rs"                                       
##  [82] "https://gitlab.ifremer.fr/resourcecode/resourcecode"                             
##  [83] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"                             
##  [84] "https://bitbucket.org/dolfin-adjoint/pyadjoint"                                  
##  [85] "https://gitlab.com/LMSAL_HUB/aia_hub/aiapy"                                      
##  [86] "https://gitlab.eudat.eu/coccon-kit/proffastpylot"                                
##  [87] "https://git.mpib-berlin.mpg.de/castellum/castellum"                              
##  [88] "https://gitlab.com/programgreg/tagginglatencyestimator"                          
##  [89] "https://gitlab.com/gims-developers/gims"                                         
##  [90] "https://gitlab.awi.de/sicopolis/sicopolis"                                       
##  [91] "https://forgemia.inra.fr/migale/easy16s"                                         
##  [92] "https://gitlab.com/tamaas/tamaas"                                                
##  [93] "https://gitlab.com/ampere2/metalwalls"                                           
##  [94] "https://gitlab.pasteur.fr/vlegrand/ROCK"                                         
##  [95] "https://git.ligo.org/asimov/asimov"                                              
##  [96] "https://bitbucket.org/berkeleylab/hardware-control/src/main/"                    
##  [97] "https://gitlab.inria.fr/bcoye/game-engine-scheduling-simulation"                 
##  [98] "https://gitlab.com/petsc/petsc"                                                  
##  [99] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"                          
## [100] "https://gitlab.com/jason-rumengan/pyarma"                                        
## [101] "https://gitlab.com/culturalcartography/text2map"                                 
## [102] "https://bitbucket.org/clhaley/Multitaper.jl"                                     
## [103] "https://gitlab.com/project-dare/dare-platform"                                   
## [104] "https://gitlab.com/sissopp_developers/sissopp"                                   
## [105] "https://bitbucket.org/sbarbot/motorcycle/src/master/"                            
## [106] "https://earth.bsc.es/gitlab/wuruchi/autosubmitreact"                             
## [107] "https://gitlab.com/remram44/taguette"                                            
## [108] "https://forgemia.inra.fr/pherosensor/pherosensor-toolbox"                        
## [109] "https://bitbucket.org/mpi4py/mpi4py-fft"                                         
## [110] "https://gitlab.com/sigcorr/sigcorr"                                              
## [111] "https://git.geomar.de/digital-earth/dasf/dasf-messaging-python"                  
## [112] "https://gitlab.inria.fr/mosaic/bvpy"                                             
## [113] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"                      
## [114] "https://bitbucket.org/dghoshal/frieda"                                           
## [115] "https://gitlab.ruhr-uni-bochum.de/reichp2y/proppy"                               
## [116] "https://gitlab.com/grogra/groimp-plugins/Pointcloud"                             
## [117] "https://gitlab.com/permafrostnet/teaspoon"                                       
## [118] "https://gitlab.com/free-astro/siril"                                             
## [119] "https://doi.org/10.17605/OSF.IO/3DS6A"                                           
## [120] "https://gitlab.com/mauricemolli/petitRADTRANS"                                   
## [121] "https://gitlab.com/costrouc/pysrim"                                              
## [122] "https://gitlab.com/ComputationalScience/idinn"                                   
## [123] "https://gitlab.com/jtagusari/hrisk-noisemodelling"                               
## [124] "https://gitlab.com/moerman1/fhi-cc4s"                                            
## [125] "https://gitlab.com/pyFBS/pyFBS"                                                  
## [126] "https://codeberg.org/cepsInria/ceps"                                             
## [127] "https://gitlab.com/fibreglass/pivc"                                              
## [128] "https://bitbucket.org/manuela_s/hcp/"                                            
## [129] "https://gitlab.com/ProjectRHEA/flowsolverrhea"                                   
## [130] "https://gitlab.com/libreumg/dataquier.git"                                       
## [131] "https://gitlab.ethz.ch/holukas/dyco-dynamic-lag-compensation"                    
## [132] "https://gitlab.inria.fr/melissa/melissa"                                         
## [133] "https://gitlab.com/cosapp/cosapp"                                                
## [134] "https://gitlab.com/dlr-ve/esy/vencopy/vencopy"                                   
## [135] "https://gitlab.com/jesseds/apav"                                                 
## [136] "https://gitlab.com/qc-devs/aqcnes"                                               
## [137] "https://gitlab.com/vibes-developers/vibes"                                       
## [138] "https://gitlab.uliege.be/smart_grids/public/gboml"                               
## [139] "https://gricad-gitlab.univ-grenoble-alpes.fr/deformvis/insarviz"                 
## [140] "https://gitlab.com/eidheim/Simple-Web-Server"                                    
## [141] "https://bitbucket.org/basicsums/basicsums"                                       
## [142] "https://framagit.org/GustaveCoste/eldam"                                         
## [143] "https://gitlab.com/cracklet/cracklet.git"                                        
## [144] "https://gitlab.com/cosmology-ethz/galsbi"                                        
## [145] "https://git.ufz.de/despot/pysewer/"                                              
## [146] "https://codebase.helmholtz.cloud/taimur.khan/DeepTrees"                          
## [147] "https://gitlab.com/materials-modeling/wulffpack"                                 
## [148] "https://gitlab.com/EliseLei/easychem"                                            
## [149] "https://gitlab.eclipse.org/eclipse/comma/comma"                                  
## [150] "https://codeberg.org/JPHackstein/GREOPy"                                         
## [151] "https://gitlab.com/robizzard/libcdict"                                           
## [152] "https://bitbucket.org/robmoss/particle-filter-for-python/"                       
## [153] "https://bitbucket.org/mituq/muq2.git"                                            
## [154] "https://gitlab.com/energyincities/besos/"                                        
## [155] "https://gitlab.com/mmartin-lagarde/exonoodle-exoplanets/-/tree/master/"          
## [156] "https://gitlab.com/utopia-project/utopia"                                        
## [157] "https://gitlab.inria.fr/bramas/tbfmm"                                            
## [158] "https://bitbucket.org/meg/cbcbeat"                                               
## [159] "https://bitbucket.org/hammurabicode/hamx"                                        
## [160] "https://gitlab.com/davidwoodburn/itrm"                                           
## [161] "https://gitlab.com/tum-ciip/elsa"                                                
## [162] "https://gitlab.com/cosmology-ethz/ufig"                                          
## [163] "https://gitlab.com/picos-api/picos"                                              
## [164] "https://gitlab.com/binary_c/binary_c-python/"                                    
## [165] "https://bitbucket.org/cdegroot/wediff"                                           
## [166] "https://gitlab.com/QComms/cqptoolkit"                                            
## [167] "https://gitlab.com/toposens/public/ros-packages"                                 
## [168] "https://gitlab.inria.fr/azais/treex"                                             
## [169] "https://gitlab.com/pvst/asi"                                                     
## [170] "https://gitlab.com/chaver/choco-mining"                                          
## [171] "https://gitlab.com/cosmograil/PyCS3"                                             
## [172] "https://gitlab.com/MartinBeseda/sa-oo-vqe-qiskit.git"                            
## [173] "https://gitlab.com/davidtourigny/dynamic-fba"                                    
## [174] "https://bitbucket.org/likask/mofem-cephas"                                       
## [175] "https://bitbucket.org/cmutel/brightway2"                                         
## [176] "https://gitlab.com/geekysquirrel/bigx"                                           
## [177] "https://gitlab.com/dglaeser/fieldcompare"                                        
## [178] "https://gitlab.com/dlr-ve/esy/sfctools/framework/"                               
## [179] "https://gitlab.com/davidwoodburn/r3f"                                            
## [180] "https://gitlab.com/libreumg/dataquier.git"
## Query the GitHub API for metadata about each unique GitHub software
## repository: creation/update/push dates, stars, languages, topics, license
## and contributor counts. Failed queries (e.g. deleted repositories or rate
## limiting) yield NULL rows and are filled in from the archived data frame
## further below.
df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) {
    ## Normalize the URL: strip trailing "/" and ".git", enforce https, and
    ## truncate at any "/tree/" or "/blob/" path component
    u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
    if (grepl("/tree/", u0)) {
        u0 <- strsplit(u0, "/tree/")[[1]][1]
    }
    if (grepl("/blob/", u0)) {
        u0 <- strsplit(u0, "/blob/")[[1]][1]
    }
    ## Translate the repository URL into its GitHub API path
    ## ("/repos/<owner>/<repo>"). Computed once instead of repeating the
    ## substitution for every endpoint; the dots in "www." and "github.com"
    ## are escaped so that only literal dots match.
    api_path <- sub("^(https://)?(www\\.)?github\\.com/", "/repos/", u0)
    info <- try(gh(api_path))
    languages <- try(gh(paste0(api_path, "/languages"), .limit = 500))
    topics <- try(gh(paste0(api_path, "/topics"),
                     .accept = "application/vnd.github.mercy-preview+json",
                     .limit = 500))
    contribs <- try(gh(paste0(api_path, "/contributors"), .limit = 500))
    if (!inherits(info, "try-error") && length(info) > 1) {
        ## Contributor counts; NA if the contributors query failed or
        ## returned an empty list
        if (inherits(contribs, "try-error") || length(contribs) == 0) {
            repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
        } else {
            repo_nbr_contribs <- length(contribs)
            repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_))
            if (is.na(repo_nbr_contribs_2ormore)) {
                ## Diagnostic output in case any contribution count was NA
                print(contribs)
            }
        }
        
        ## "language1:bytes1,language2:bytes2,..." or "" if unavailable
        repolang <- ""
        if (!inherits(languages, "try-error") && length(languages) > 0) {
            lang_bytes <- unlist(languages)
            repolang <- paste(paste(names(lang_bytes), lang_bytes, sep = ":"),
                              collapse = ",")
        }
        
        ## Comma-separated repository topics, or "" if unavailable
        repotopics <- ""
        if (!inherits(topics, "try-error") && length(topics$names) > 0) {
            repotopics <- paste(unlist(topics$names), collapse = ",")
        }
        
        ## Scalar if/else rather than ifelse(): these are length-1 values,
        ## and ifelse() misbehaves when a branch value is NULL
        data.frame(repo_url = u, 
                   repo_created = info$created_at,
                   repo_updated = info$updated_at,
                   repo_pushed = info$pushed_at,
                   repo_nbr_stars = info$stargazers_count,
                   repo_language = if (is.null(info$language)) NA_character_ else info$language,
                   repo_languages_bytes = repolang,
                   repo_topics = repotopics,
                   repo_license = if (is.null(info$license)) NA_character_ else info$license$key,
                   repo_nbr_contribs = repo_nbr_contribs,
                   repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
        )
    } else {
        NULL
    }
})) %>%
    dplyr::mutate(dplyr::across(c(repo_created, repo_updated, repo_pushed),
                                as.Date)) %>%
    dplyr::distinct() %>%
    dplyr::mutate(repo_info_obtained = lubridate::today())
## Sanity check: each repository URL should occur at most once in df.
## Print some diagnostics before failing if duplicates are found.
if (anyDuplicated(df$repo_url) > 0) {
    print(length(unique(df$repo_url)))
    print(length(df$repo_url))
    print(df$repo_url[duplicated(df$repo_url)])
}
stopifnot(anyDuplicated(df$repo_url) == 0)
dim(df)
## [1] 2864   12
## For papers not represented in df (i.e., where the GitHub API query did
## not return a valid response), fall back to the archived data frame,
## keeping only the most recently obtained record for each repository
dfarchive <- papers_archive %>% 
    dplyr::select(dplyr::any_of(colnames(df))) %>%
    dplyr::filter(!(repo_url %in% df$repo_url)) %>%
    dplyr::arrange(dplyr::desc(repo_info_obtained)) %>%
    dplyr::distinct(repo_url, .keep_all = TRUE)
head(dfarchive)
## # A tibble: 6 × 12
##   repo_url    repo_created repo_updated repo_pushed repo_nbr_stars repo_language
##   <chr>       <date>       <date>       <date>               <int> <chr>        
## 1 https://gi… 2016-09-25   2025-10-07   2024-02-03              21 R            
## 2 https://gi… 2018-12-06   2025-11-02   2023-10-05              10 C#           
## 3 https://gi… 2018-10-05   2025-10-20   2021-11-17               8 JavaScript   
## 4 https://gi… 2024-04-15   2026-01-26   2025-11-19              11 Jupyter Note…
## 5 https://gi… 2020-06-02   2026-02-28   2025-02-10             112 Python       
## 6 https://gi… 2022-10-13   2026-03-03   2026-03-03              73 Python       
## # ℹ 6 more variables: repo_languages_bytes <chr>, repo_topics <chr>,
## #   repo_license <chr>, repo_nbr_contribs <int>,
## #   repo_nbr_contribs_2ormore <int>, repo_info_obtained <date>
dim(dfarchive)
## [1] 576  12
## Combine newly fetched and archived repository records; each repo_url
## must still be unique after the merge
df <- dplyr::bind_rows(df, dfarchive)
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 3440   12
## Attach the repository information to the papers table (papers without
## repository info get NAs for the new columns)
papers <- papers %>% dplyr::left_join(df, by = "repo_url")
dim(papers)
## [1] 3457   70
## Record the provenance ("sw-github") of all columns added in this section
source_track <- local({
    new_cols <- setdiff(colnames(papers), names(source_track))
    c(source_track, setNames(rep("sw-github", length(new_cols)), new_cols))
})

Clean up a bit

## Convert the publication date to Date format, derive the half year of
## publication (H1 = Jan-Jun, H2 = Jul-Dec) as an ordered factor, and
## count the number of authors for each paper
papers <- papers %>%
    dplyr::select(-reference, -license, -link) %>%
    dplyr::mutate(
        published.date = as.Date(published.print),
        halfyear = paste0(year(published.date),
                          ifelse(month(published.date) <= 6, "H1", "H2")),
        halfyear = factor(halfyear,
                          levels = paste0(rep(sort(unique(year(published.date))),
                                              each = 2), c("H1", "H2"))),
        nbr_authors = vapply(author, function(a) nrow(a), NA_integer_)
    )
dim(papers)
## [1] 3457   70
## Show (up to ten) entries that are exact duplicates of another row,
## ordered by their alternative id
dupidx <- which(papers$alternative.id %in%
                    papers$alternative.id[duplicated(papers)])
head(arrange(papers[dupidx, ], alternative.id), n = 10)
## # A tibble: 0 × 70
## # ℹ 70 variables: alternative.id <chr>, container.title <chr>, created <chr>,
## #   deposited <chr>, published.print <chr>, doi <chr>, indexed <chr>,
## #   issn <chr>, issue <chr>, issued <chr>, member <chr>, page <chr>,
## #   prefix <chr>, publisher <chr>, score <chr>, source <chr>,
## #   reference.count <chr>, references.count <chr>,
## #   is.referenced.by.count <chr>, title <chr>, type <chr>, url <chr>,
## #   volume <chr>, short.container.title <chr>, author <list>, …
## Remove fully duplicated rows (all columns identical)
papers <- papers %>% dplyr::distinct()
dim(papers)
## [1] 3457   70
## Record the provenance ("cleanup") of all columns added in this section
source_track <- local({
    new_cols <- setdiff(colnames(papers), names(source_track))
    c(source_track, setNames(rep("cleanup", length(new_cols)), new_cols))
})

Tabulate number of missing values

In some cases, fetching information from external sources (e.g., the GitHub API) fails for a subset of the publications. There are also other reasons for missing values; for example, the earliest submissions do not have an associated pre-review issue. The table below lists the number of missing values for each variable in the data frame.

## Interactive table with the number of missing values per variable,
## together with the processing step in which each variable was added
DT::datatable(
    data.frame(
        variable = colnames(papers),
        nbr_missing = colSums(is.na(papers)),
        source = source_track[colnames(papers)]
    ),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of published papers per month

## Count published papers per calendar month and display as a bar chart
monthly_pubs <- papers %>%
    dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
    dplyr::count(pubmonth, name = "npub")
ggplot(monthly_pubs, aes(x = factor(pubmonth), y = npub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of published papers per month", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Interactive table of the monthly publication counts
DT::datatable(
    dplyr::rename(monthly_pubs,
                  "Month of publication" = "pubmonth",
                  "Number of papers" = "npub"),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of published papers per year

## Count published papers per year and display as a bar chart
yearly_pubs <- papers %>%
    dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
    dplyr::count(pubyear, name = "npub")
ggplot(yearly_pubs, aes(x = factor(pubyear), y = npub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of published papers per year", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Interactive table of the yearly publication counts
DT::datatable(
    dplyr::rename(yearly_pubs,
                  "Year of publication" = "pubyear",
                  "Number of papers" = "npub"),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of submissions per month

We use the number of pre-review issues opened in a given month as a proxy for the number of submissions in that month.

## Count pre-review issues opened per month (a proxy for submissions)
## and display as a bar chart
monthly_subs <- prereview_issues |>
    dplyr::mutate(submonth = lubridate::floor_date(opened, "month")) |>
    dplyr::count(submonth, name = "nsub")
ggplot(monthly_subs, aes(x = factor(submonth), y = nsub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of submissions per month", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))