Introduction

In this report, we extract information about published JOSS papers and generate
graphics as well as a summary table that can be downloaded and used for further analyses.

Load required R packages

suppressPackageStartupMessages({
    library(tibble)
    library(rcrossref)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(lubridate)
    library(gh)
    library(purrr)
    library(jsonlite)
    library(DT)
    library(plotly)
    library(citecorp)
    library(readr)
    library(rworldmap)
    library(gt)
    library(stringr)
    library(openalexR)
})
## Keep track of the source of each column
source_track <- c()

## Determine whether to add a caption with today's date to the (non-interactive) plots
add_date_caption <- TRUE
dcap <- if (add_date_caption) {
    lubridate::today()
} else {
    ""
}
## Get list of countries and populations (2022) from the rworldmap/gt packages
data("countrySynonyms")
country_names <- countrySynonyms %>%
    dplyr::select(-ID) %>%
    tidyr::pivot_longer(cols = -ISO3, names_to = "tmp", values_to = "name") %>%
    dplyr::filter(name != "") %>%
    dplyr::select(-tmp)

## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL),
## distributed via the gt R package
country_populations <- dplyr::filter(countrypops, year == 2022)
## Read archived version of summary data frame, to use for filling in 
## information about software repositories (due to limit on API requests)
## Sort by the date when software repo info was last obtained (papers with
## no recorded date first)
papers_archive <- readRDS(gzcon(url(
    "https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"
))) |>
    dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained)

## Similarly for citation analysis, to avoid having to pull down the 
## same information multiple times. All columns are read as character.
citations_archive <- readr::read_delim(
    url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
    delim = "\t",
    col_names = TRUE,
    col_types = readr::cols(.default = "c")
)

Collect information about papers

Pull down paper info from Crossref and citation information from OpenAlex

We get the information about published JOSS papers from Crossref, using the rcrossref R package. The openalexR R package is used to extract citation counts from OpenAlex.

## First check how many records there are in Crossref
## ISSN of the Journal of Open Source Software
issn <- "2475-9066"
## Journal-level summary metadata (works = FALSE returns journal info,
## including the total number of registered DOIs)
joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>%
    pluck("data")
## Store and print the total number of DOIs (outer parentheses auto-print)
(total_dois <- joss_details$total_dois)
## [1] 3457
## Pull down all records from Crossref
## cursor = "*" enables deep paging; cursor_max is set to twice the number
## of DOIs as a safety margin so that all records are retrieved
papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*",
    cursor_max = joss_details$total_dois * 2) %>%
    pluck("data")

## Only keep articles
papers <- dplyr::filter(papers, type == "journal-article")
dim(papers)
## [1] 3457   28
dim(dplyr::distinct(papers))
## [1] 3457   28
## Check that all papers were pulled down and stop otherwise
if (nrow(dplyr::distinct(papers)) < total_dois) {
    stop("Not all papers were pulled down from Crossref!")
}

## A few papers don't have alternative.ids - generate them from the DOI
missing_altid <- is.na(papers$alternative.id)
papers$alternative.id[missing_altid] <- papers$doi[missing_altid]

## Get citation info from Crossref and merge with paper details
# cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
# papers <- papers %>% dplyr::left_join(
#     cit %>% dplyr::rename(citation_count = count), 
#     by = c("alternative.id" = "doi")
# )

## Remove one duplicated paper
papers <- dplyr::filter(papers, alternative.id != "10.21105/joss.00688")
dim(papers)
## [1] 3456   28
dim(dplyr::distinct(papers))
## [1] 3456   28
papers$alternative.id[duplicated(papers$alternative.id)]
## character(0)
## All columns obtained so far come from Crossref
source_track <- c(source_track,
                  structure(rep("crossref", ncol(papers)),
                            names = colnames(papers)))
## Get info from openalexR and merge with paper details
## Helper function to extract countries from affiliations. Note that this 
## information is not available for all papers.
##
## @param df One paper's 'authorships' entry as returned by
##     openalexR::oa_fetch(); expected to be a data frame with an 'id'
##     column and an 'affiliations' list-column, or a scalar NA/NULL when
##     missing -- TODO confirm against the openalexR return format.
## @param wh If "first", keep only the first affiliation per author;
##     any other value keeps all affiliations.
##
## @return A single string of unique country codes separated by ";",
##     or "" when no country information is available.
.get_countries <- function(df, wh = "first") {
    ## No authorship information (scalar NA) or no affiliations recorded
    if ((length(df) == 1 && is.na(df)) || is.null(df$affiliations)) {
        ""
    } else {
        if (wh == "first") {
            ## Only first affiliation for each author
            ## (!duplicated(id) keeps the first unnested row per author id).
            ## NOTE(review): duplicated() is evaluated together with the NA
            ## filter, so an author whose first affiliation lacks a country
            ## code is dropped entirely rather than falling back to the next
            ## affiliation -- confirm this is intended.
            tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 
                dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |>
                pull(affiliations_country_code)
        } else {
            ## All affiliations
            tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 
                dplyr::filter(!is.na(affiliations_country_code)) |>
                pull(affiliations_country_code)
        }
        ## Collapse the unique country codes into one ";"-separated string
        if (length(tmp) > 0) {
            tmp |>
                unique() |>
                paste(collapse = ";")
        } else {
            ""
        }
    }
}

## Fetch all JOSS works from OpenAlex; "s4210214273" is presumably the
## OpenAlex source id for JOSS -- TODO confirm
oa <- oa_fetch(entity = "works", 
               primary_location.source.id = "s4210214273") |>
    ## Add ";"-separated affiliation country codes per paper: all
    ## affiliations, and first affiliation per author, respectively
    mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"),
           affil_countries_first = vapply(authorships, .get_countries, "", wh = "first"))
dim(oa)
## [1] 3456   45
length(unique(oa$doi))
## [1] 3455
## Merge citation counts and country info into the papers table. OpenAlex
## DOIs are full URLs, so strip the https://doi.org/ prefix to match
## alternative.id.
papers <- papers %>% dplyr::left_join(
    oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>%
        dplyr::select(alternative.id, cited_by_count, id,
                      affil_countries_all, affil_countries_first) %>%
        dplyr::rename(citation_count = cited_by_count, 
                      openalex_id = id),
    by = "alternative.id"
)
dim(papers)
## [1] 3457   32
dim(papers %>% distinct())
## [1] 3457   32
## Columns added in this step come from OpenAlex
source_track <- c(source_track, 
                  structure(rep("OpenAlex", length(setdiff(colnames(papers),
                                                           names(source_track)))), 
                            names = setdiff(colnames(papers), names(source_track))))

Pull down info from JOSS API

For each published paper, we use the JOSS API to get information about pre-review and review issue numbers, the corresponding software repository, and other submission metadata.

## Page through the JOSS API, accumulating the raw records. Paging stops
## when a page is empty, identical to the previous page, or a request
## error occurs (mapped to an empty result by tryCatch).
joss_api <- list()
page <- 1
prev_page <- NULL
current_page <- jsonlite::fromJSON(
    url(paste0("https://joss.theoj.org/papers/published.json?page=", page)),
    simplifyDataFrame = FALSE
)
while (length(current_page) > 0 && !identical(current_page, prev_page)) {
    joss_api <- c(joss_api, current_page)
    page <- page + 1
    prev_page <- current_page
    current_page <- tryCatch(
        jsonlite::fromJSON(
            url(paste0("https://joss.theoj.org/papers/published.json?page=", page)),
            simplifyDataFrame = FALSE
        ),
        error = function(e) numeric(0)
    )
}

## Flatten the per-paper JSON records into a data frame with one row per paper
joss_api <- do.call(dplyr::bind_rows, lapply(joss_api, function(w) {
    data.frame(api_title = w$title, 
               api_state = w$state,
               ## Unique affiliations across all authors, ";"-separated
               author_affiliations = paste(unique(unlist(lapply(w$authors, "[[", "affiliation"))), collapse = ";"),
               editor = paste(w$editor, collapse = ","),
               reviewers = paste(w$reviewers, collapse = ","),
               nbr_reviewers = length(w$reviewers),
               repo_url = w$software_repository,
               ## Keep only the issue number from the review issue URL
               review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/", 
                                     "", w$paper_review),
               doi = w$doi,
               ## if/else instead of ifelse() on a scalar condition: ifelse()
               ## would silently truncate a length > 1 value to its first
               ## element, and if/else is the idiomatic scalar form
               prereview_issue_id = if (!is.null(w$meta_review_issue_id)) {
                   w$meta_review_issue_id
               } else {
                   NA_integer_
               },
               languages = gsub(", ", ",", w$languages),
               archive_doi = w$software_archive)
}))
dim(joss_api)
## [1] 3457   12
dim(joss_api %>% distinct())
## [1] 3457   12
## Check that all papers were pulled down and stop otherwise
if (!(nrow(joss_api %>% distinct()) >= total_dois)) {
    stop("Not all papers were pulled down from the JOSS API!")
}
## Inspect software repositories shared by more than one published paper
joss_api$repo_url[duplicated(joss_api$repo_url)]
##  [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
##  [2] "https://github.com/nomad-coe/greenX"          
##  [3] "https://github.com/idaholab/moose"            
##  [4] "https://gitlab.com/libreumg/dataquier.git"    
##  [5] "https://github.com/idaholab/moose"            
##  [6] "https://github.com/dynamicslab/pysindy"       
##  [7] "https://github.com/landlab/landlab"           
##  [8] "https://github.com/landlab/landlab"           
##  [9] "https://github.com/symmy596/SurfinPy"         
## [10] "https://github.com/arviz-devs/arviz"          
## [11] "https://github.com/bcgov/ssdtools"            
## [12] "https://github.com/landlab/landlab"           
## [13] "https://github.com/pvlib/pvlib-python"        
## [14] "https://github.com/mlpack/mlpack"             
## [15] "https://github.com/julia-wrobel/registr"      
## [16] "https://github.com/barbagroup/pygbe"
## Merge the JOSS API information into the papers table (by DOI)
papers <- dplyr::left_join(papers, joss_api, by = c("alternative.id" = "doi"))
dim(papers)
## [1] 3457   43
dim(dplyr::distinct(papers))
## [1] 3457   43
papers$repo_url[duplicated(papers$repo_url)]
##  [1] "https://github.com/mlpack/mlpack"             
##  [2] "https://github.com/QTC-UMD/rydiqule"          
##  [3] "https://github.com/nomad-coe/greenX"          
##  [4] "https://github.com/bcgov/ssdtools"            
##  [5] "https://github.com/barbagroup/pygbe"          
##  [6] "https://github.com/dynamicslab/pysindy"       
##  [7] "https://github.com/landlab/landlab"           
##  [8] "https://github.com/idaholab/moose"            
##  [9] "https://github.com/idaholab/moose"            
## [10] "https://gitlab.com/mauricemolli/petitRADTRANS"
## [11] "https://github.com/landlab/landlab"           
## [12] "https://github.com/arviz-devs/arviz"          
## [13] "https://github.com/symmy596/SurfinPy"         
## [14] "https://github.com/julia-wrobel/registr"      
## [15] "https://github.com/pvlib/pvlib-python"        
## [16] "https://github.com/landlab/landlab"           
## [17] "https://gitlab.com/libreumg/dataquier.git"
## Record that the columns added above come from the JOSS API
new_cols <- setdiff(colnames(papers), names(source_track))
source_track <- c(source_track,
                  structure(rep("JOSS_API", length(new_cols)),
                            names = new_cols))

Combine with info from GitHub issues

From each pre-review and review issue, we extract information about review times and assigned labels.

## Pull down info on all issues in the joss-reviews repository
issues <- gh("/repos/openjournals/joss-reviews/issues", 
             .limit = 15000, state = "all")
## From each issue, extract required information
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
    data.frame(title = i$title, 
               number = i$number,
               state = i$state,
               opened = i$created_at,
               ## if/else instead of ifelse() on a scalar condition: clearer
               ## and does not truncate a length > 1 value to its first element
               closed = if (!is.null(i$closed_at)) i$closed_at else NA_character_,
               ncomments = i$comments,
               ## All label names, excluding the generic workflow labels
               labels = paste(setdiff(
                   vapply(i$labels, getElement, 
                          name = "name", character(1L)),
                   c("review", "pre-review", "query-scope", "paused")),
                   collapse = ","))
}))

## Split into REVIEW, PRE-REVIEW, and other issues (the latter category 
## is discarded)
issother <- iss %>%
    dplyr::filter(!grepl("\\[PRE REVIEW\\]", title),
                  !grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 198   7
head(issother)
##                                                                                                                        title
## 1                                                             Update wording of collaborative effort reviewer checklist item
## 2                                                                                                             Create Web App
## 3                                             [JOSS] zoom-lod-engine: A zoom-aware level-of-detail resolver using hysteresis
## 4                                                                                                            Review Comments
## 5                                                                                                      Paper Review comments
## 6 Invalid rejection SiA-WD: An R Shiny Application for Systematic Evaluation of Wearables in Behavioural and Stress Research
##   number  state               opened               closed ncomments labels
## 1  10126 closed 2026-02-27T15:51:09Z 2026-03-01T08:26:48Z         2       
## 2  10121 closed 2026-02-26T15:48:21Z 2026-02-26T15:48:23Z         1       
## 3   9931 closed 2026-02-04T10:22:06Z 2026-02-04T10:22:09Z         1       
## 4   9927 closed 2026-02-02T16:49:54Z 2026-02-02T16:49:57Z         1       
## 5   9920 closed 2026-01-30T07:05:08Z 2026-01-30T07:05:10Z         1       
## 6   9911 closed 2026-01-28T12:00:02Z 2026-01-28T12:00:05Z         1
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the run of leading zeros needed to left-pad issue number `s` to
## five digits (e.g. 123 -> "00"). The previous rep(0, 5 - nchar(s)) form
## raised an error ("invalid 'times' argument") once issue numbers exceed
## five digits; clamping the width at zero makes it safe (returns "").
getnbrzeros <- function(s) {
    strrep("0", max(0L, 5L - nchar(s)))
}
## Build the REVIEW-issue table: derive the paper DOI from the zero-padded
## issue number, strip the "[REVIEW]: " prefix from the title, and prefix
## all columns except alternative.id with "review_"
issrev <- iss %>%
    dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
    dplyr::mutate(
        alternative.id = paste0("10.21105/joss.",
                                purrr::map_chr(number, getnbrzeros),
                                number),
        title = gsub("\\[REVIEW\\]: ", "", title)
    ) %>%
    dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .))
## For pre-review and review issues, respectively, get the number of 
## issues closed each month, and the number of those that have the 
## 'rejected' label
## Helper: monthly counts of closed issues (and of those carrying the
## 'rejected' label) for issues whose title matches `pattern`, tagged
## with issue type `issue_type`. Factors out the previously duplicated
## review/pre-review pipelines.
.monthly_rejections <- function(issue_df, pattern, issue_type) {
    issue_df %>% 
        dplyr::filter(grepl(pattern, title)) %>% 
        dplyr::filter(!is.na(closed)) %>%
        dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>%
        dplyr::group_by(closedmonth) %>%
        dplyr::summarize(nbr_issues_closed = length(labels),
                         nbr_rejections = sum(grepl("rejected", labels))) %>%
        dplyr::mutate(itype = issue_type)
}

review_rejected <- .monthly_rejections(iss, "\\[REVIEW\\]", "review")
prereview_rejected <- .monthly_rejections(iss, "\\[PRE REVIEW\\]", "pre-review")

all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected)
## Get only pre-review issues plus review issues opened before 2016-09-18, 
## will use these as a proxy for the number of submissions
pi1 <- iss %>%
    dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>%
    dplyr::mutate(opened = as.Date(opened))
dim(pi1)
## [1] 6019    7
pi2 <- iss %>%
    dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
    dplyr::filter(as.Date(opened) <= as.Date("2016-09-18")) %>%
    dplyr::mutate(opened = as.Date(opened))
dim(pi2)
## [1] 49  7
prereview_issues <- dplyr::bind_rows(pi1, pi2)
## For PRE-REVIEW issues, add information about the corresponding REVIEW 
## issue number. Withdrawn and rejected submissions are excluded.
isspre <- iss |>
    dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) |>
    dplyr::filter(!grepl("withdrawn", labels)) |>
    dplyr::filter(!grepl("rejected", labels)) |>
    ## Some titles have multiple pre-review issues. In these cases, keep the latest
    dplyr::arrange(desc(number)) |>
    dplyr::filter(!duplicated(title)) |>
    dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) |>
    dplyr::rename_all(~ paste0("prerev_", .))

## Merge review and pre-review issue info into the papers table, convert
## the issue timestamps to Date, and compute the time spent in each stage
papers <- papers %>%
    dplyr::left_join(issrev, by = "alternative.id") %>%
    dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
    dplyr::mutate(dplyr::across(c(prerev_opened, prerev_closed,
                                  review_opened, review_closed),
                                as.Date)) %>%
    dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
                  days_in_rev = review_closed - review_opened,
                  to_review = !is.na(review_opened))
dim(papers)
## [1] 3457   59
dim(dplyr::distinct(papers))
## [1] 3457   59
## Record that the columns added above come from the joss-reviews GitHub repo
new_issue_cols <- setdiff(colnames(papers), names(source_track))
source_track <- c(source_track,
                  structure(rep("joss-github", length(new_issue_cols)),
                            names = new_issue_cols))

Add information from software repositories

## Reorder so that software repositories that were interrogated longest 
## ago are checked first
## match() gives each paper's position in the archive (NA when the paper 
## is new and not in the archive); na.last = FALSE puts the never-checked 
## papers first, followed by those checked longest ago (papers_archive is 
## sorted by repo_info_obtained above)
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
                  na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
## Show repositories associated with more than one paper
software_urls[duplicated(software_urls)]
##  [1] "https://gitlab.com/mauricemolli/petitRADTRANS"
##  [2] "https://gitlab.com/libreumg/dataquier.git"    
##  [3] "https://github.com/QTC-UMD/rydiqule"          
##  [4] "https://github.com/mlpack/mlpack"             
##  [5] "https://github.com/nomad-coe/greenX"          
##  [6] "https://github.com/bcgov/ssdtools"            
##  [7] "https://github.com/barbagroup/pygbe"          
##  [8] "https://github.com/dynamicslab/pysindy"       
##  [9] "https://github.com/landlab/landlab"           
## [10] "https://github.com/idaholab/moose"            
## [11] "https://github.com/idaholab/moose"            
## [12] "https://github.com/arviz-devs/arviz"          
## [13] "https://github.com/landlab/landlab"           
## [14] "https://github.com/symmy596/SurfinPy"         
## [15] "https://github.com/julia-wrobel/registr"      
## [16] "https://github.com/pvlib/pvlib-python"        
## [17] "https://github.com/landlab/landlab"
## Flag GitHub-hosted repositories (presumably to decide which repos can
## be queried via the GitHub API below).
## NOTE(review): grepl matches "github" anywhere in the URL, not just the
## host -- confirm no non-GitHub URL contains that substring.
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 3457
sum(is_github)
## [1] 3277
## List the non-GitHub repositories (GitLab, Bitbucket, etc.)
software_urls[!is_github]
##   [1] "https://gitlab.kuleuven.be/ITSCreaLab/public-toolboxes/dyntapy"                  
##   [2] "https://gitlab.com/morikawa-lab-osakau/vibir-parallel-compute"                   
##   [3] "https://gitlab.com/ENKI-portal/ThermoCodegen"                                    
##   [4] "https://bitbucket.org/orionmhdteam/orion2_release1/src/master/"                  
##   [5] "https://gitlab.com/oali/dxtr"                                                    
##   [6] "https://gitlab.dune-project.org/copasi/dune-copasi"                              
##   [7] "https://gitlab.com/bonsamurais/bonsai/util/ipcc"                                 
##   [8] "https://codebase.helmholtz.cloud/mussel/netlogo-northsea-species.git"            
##   [9] "https://gitlab.com/ffaucher/hawen"                                               
##  [10] "https://gitlab.com/cmbm-ethz/miop"                                               
##  [11] "https://gitlab.com/cosmograil/starred"                                           
##  [12] "https://gitlab.com/emd-dev/emd"                                                  
##  [13] "https://gite.lirmm.fr/doccy/RedOak"                                              
##  [14] "https://bitbucket.org/rram/dvrlib/src/joss/"                                     
##  [15] "https://gitlab.com/mantik-ai/mantik"                                             
##  [16] "https://gitlab.com/djsmithbham/cnearest"                                         
##  [17] "https://gitlab.com/sails-dev/sails"                                              
##  [18] "https://gitlab.kitware.com/LBM/lattice-boltzmann-solver"                         
##  [19] "https://gitlab.com/dsbowen/conditional-inference"                                
##  [20] "https://gitlab.com/soleil-data-treatment/soleil-software-projects/remote-desktop"
##  [21] "https://bitbucket.org/berkeleylab/esdr-pygdh/"                                   
##  [22] "https://code.europa.eu/kada/mafw"                                                
##  [23] "https://gitlab.com/drti/basic-tools"                                             
##  [24] "https://gitlab.com/moorepants/skijumpdesign"                                     
##  [25] "https://bitbucket.org/ocellarisproject/ocellaris"                                
##  [26] "https://git.iws.uni-stuttgart.de/tools/frackit"                                  
##  [27] "https://gitlab.com/cmbm-ethz/pourbaix-diagrams"                                  
##  [28] "https://bitbucket.org/cloopsy/android/"                                          
##  [29] "https://gitlab.com/pythia-uq/pythia"                                             
##  [30] "https://gitlab.com/fduchate/predihood"                                           
##  [31] "https://jugit.fz-juelich.de/compflu/swalbe.jl/"                                  
##  [32] "https://gitlab.dune-project.org/dorie/dorie"                                     
##  [33] "https://gitlab.com/micromorph/ratel"                                             
##  [34] "https://gitlab.com/dmt-development/dmt-core"                                     
##  [35] "https://gitlab.com/wpettersson/kep_solver"                                       
##  [36] "https://gitlab.com/myqueue/myqueue"                                              
##  [37] "https://gitlab.com/dlr-ve/esy/remix/framework"                                   
##  [38] "https://gitlab.com/utopia-project/dantro"                                        
##  [39] "https://gitlab.com/gdetor/genetic_alg"                                           
##  [40] "https://framagit.org/GustaveCoste/off-product-environmental-impact/"             
##  [41] "https://gitlab.com/InspectorCell/inspectorcell"                                  
##  [42] "https://plmlab.math.cnrs.fr/lmrs/statistique/smmR"                               
##  [43] "https://gitlab.com/dlr-dw/ontocode"                                              
##  [44] "https://gitlab.com/dlr-ve/esy/amiris/amiris"                                     
##  [45] "https://bitbucket.org/glotzer/rowan"                                             
##  [46] "https://gitlab.com/fame-framework/fame-io"                                       
##  [47] "https://code.usgs.gov/umesc/quant-ecology/fishstan/"                             
##  [48] "https://gitlab.com/fame-framework/fame-core"                                     
##  [49] "https://gitlab.com/thartwig/asloth"                                              
##  [50] "https://gitlab.com/habermann_lab/phasik"                                         
##  [51] "https://gitlab.com/dlr-ve/autumn/"                                               
##  [52] "https://gitlab.com/ags-data-format-wg/ags-python-library"                        
##  [53] "https://zivgitlab.uni-muenster.de/ag-salinga/fastatomstruct"                     
##  [54] "https://gitlab.com/tue-umphy/software/parmesan"                                  
##  [55] "https://gitlab.com/celliern/scikit-fdiff/"                                       
##  [56] "https://gitlab.com/datafold-dev/datafold/"                                       
##  [57] "https://gitlab.com/tesch1/cppduals"                                              
##  [58] "https://gitlab.com/open-darts/open-darts"                                        
##  [59] "https://gitlab.com/materials-modeling/calorine"                                  
##  [60] "https://gitlab.com/mauricemolli/petitRADTRANS"                                   
##  [61] "https://gitlab.mpikg.mpg.de/curcuraci/bmiptools"                                 
##  [62] "https://gitlab.com/grogra/groimp-plugins/api"                                    
##  [63] "https://gitlab.inria.fr/miet/miet"                                               
##  [64] "https://savannah.nongnu.org/projects/complot/"                                   
##  [65] "http://mutabit.com/repos.fossil/grafoscopio/"                                    
##  [66] "https://gitlab.com/cerfacs/batman"                                               
##  [67] "https://gitlab.com/manchester_qbi/manchester_qbi_public/madym_cxx/"              
##  [68] "https://gitlab.com/akantu/akantu"                                                
##  [69] "https://bitbucket.org/cardosan/brightway2-temporalis"                            
##  [70] "https://gitlab.com/marinvaders/marinvaders"                                      
##  [71] "https://gitlab.kuleuven.be/gelenslab/publications/pycline"                       
##  [72] "https://gitlab.gwdg.de/mpievolbio-it/crbhits"                                    
##  [73] "https://bitbucket.org/bmskinner/nuclear_morphology"                              
##  [74] "https://git.rwth-aachen.de/ants/sensorlab/imea"                                  
##  [75] "https://bitbucket.org/sciencecapsule/sciencecapsule"                             
##  [76] "https://gitlab.com/bioeconomy/forobs/biotrade/"                                  
##  [77] "https://gitlab.com/lheea/CN-AeroModels"                                          
##  [78] "https://gitlab.ruhr-uni-bochum.de/ee/cd2es"                                      
##  [79] "https://www.idpoisson.fr/fullswof/"                                              
##  [80] "https://gitlab.com/uniluxembourg/hpc/research/cadom/serializable-simpy"          
##  [81] "https://codeberg.org/benmagill/deflake.rs"                                       
##  [82] "https://gitlab.ifremer.fr/resourcecode/resourcecode"                             
##  [83] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"                             
##  [84] "https://bitbucket.org/dolfin-adjoint/pyadjoint"                                  
##  [85] "https://gitlab.com/LMSAL_HUB/aia_hub/aiapy"                                      
##  [86] "https://gitlab.eudat.eu/coccon-kit/proffastpylot"                                
##  [87] "https://git.mpib-berlin.mpg.de/castellum/castellum"                              
##  [88] "https://gitlab.com/programgreg/tagginglatencyestimator"                          
##  [89] "https://gitlab.com/gims-developers/gims"                                         
##  [90] "https://gitlab.awi.de/sicopolis/sicopolis"                                       
##  [91] "https://forgemia.inra.fr/migale/easy16s"                                         
##  [92] "https://gitlab.com/tamaas/tamaas"                                                
##  [93] "https://gitlab.com/ampere2/metalwalls"                                           
##  [94] "https://gitlab.pasteur.fr/vlegrand/ROCK"                                         
##  [95] "https://git.ligo.org/asimov/asimov"                                              
##  [96] "https://bitbucket.org/berkeleylab/hardware-control/src/main/"                    
##  [97] "https://gitlab.inria.fr/bcoye/game-engine-scheduling-simulation"                 
##  [98] "https://gitlab.com/petsc/petsc"                                                  
##  [99] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"                          
## [100] "https://gitlab.com/jason-rumengan/pyarma"                                        
## [101] "https://gitlab.com/culturalcartography/text2map"                                 
## [102] "https://bitbucket.org/clhaley/Multitaper.jl"                                     
## [103] "https://gitlab.com/project-dare/dare-platform"                                   
## [104] "https://gitlab.com/sissopp_developers/sissopp"                                   
## [105] "https://bitbucket.org/sbarbot/motorcycle/src/master/"                            
## [106] "https://earth.bsc.es/gitlab/wuruchi/autosubmitreact"                             
## [107] "https://gitlab.com/remram44/taguette"                                            
## [108] "https://forgemia.inra.fr/pherosensor/pherosensor-toolbox"                        
## [109] "https://bitbucket.org/mpi4py/mpi4py-fft"                                         
## [110] "https://gitlab.com/sigcorr/sigcorr"                                              
## [111] "https://git.geomar.de/digital-earth/dasf/dasf-messaging-python"                  
## [112] "https://gitlab.inria.fr/mosaic/bvpy"                                             
## [113] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"                      
## [114] "https://bitbucket.org/dghoshal/frieda"                                           
## [115] "https://gitlab.ruhr-uni-bochum.de/reichp2y/proppy"                               
## [116] "https://gitlab.com/grogra/groimp-plugins/Pointcloud"                             
## [117] "https://gitlab.com/permafrostnet/teaspoon"                                       
## [118] "https://gitlab.com/free-astro/siril"                                             
## [119] "https://doi.org/10.17605/OSF.IO/3DS6A"                                           
## [120] "https://gitlab.com/mauricemolli/petitRADTRANS"                                   
## [121] "https://gitlab.com/costrouc/pysrim"                                              
## [122] "https://gitlab.com/ComputationalScience/idinn"                                   
## [123] "https://gitlab.com/jtagusari/hrisk-noisemodelling"                               
## [124] "https://gitlab.com/moerman1/fhi-cc4s"                                            
## [125] "https://gitlab.com/pyFBS/pyFBS"                                                  
## [126] "https://codeberg.org/cepsInria/ceps"                                             
## [127] "https://gitlab.com/fibreglass/pivc"                                              
## [128] "https://bitbucket.org/manuela_s/hcp/"                                            
## [129] "https://gitlab.com/ProjectRHEA/flowsolverrhea"                                   
## [130] "https://gitlab.com/libreumg/dataquier.git"                                       
## [131] "https://gitlab.ethz.ch/holukas/dyco-dynamic-lag-compensation"                    
## [132] "https://gitlab.inria.fr/melissa/melissa"                                         
## [133] "https://gitlab.com/cosapp/cosapp"                                                
## [134] "https://gitlab.com/dlr-ve/esy/vencopy/vencopy"                                   
## [135] "https://gitlab.com/jesseds/apav"                                                 
## [136] "https://gitlab.com/qc-devs/aqcnes"                                               
## [137] "https://gitlab.com/vibes-developers/vibes"                                       
## [138] "https://gitlab.uliege.be/smart_grids/public/gboml"                               
## [139] "https://gricad-gitlab.univ-grenoble-alpes.fr/deformvis/insarviz"                 
## [140] "https://gitlab.com/eidheim/Simple-Web-Server"                                    
## [141] "https://bitbucket.org/basicsums/basicsums"                                       
## [142] "https://framagit.org/GustaveCoste/eldam"                                         
## [143] "https://gitlab.com/cracklet/cracklet.git"                                        
## [144] "https://gitlab.com/cosmology-ethz/galsbi"                                        
## [145] "https://git.ufz.de/despot/pysewer/"                                              
## [146] "https://codebase.helmholtz.cloud/taimur.khan/DeepTrees"                          
## [147] "https://gitlab.com/materials-modeling/wulffpack"                                 
## [148] "https://gitlab.com/EliseLei/easychem"                                            
## [149] "https://gitlab.eclipse.org/eclipse/comma/comma"                                  
## [150] "https://codeberg.org/JPHackstein/GREOPy"                                         
## [151] "https://gitlab.com/robizzard/libcdict"                                           
## [152] "https://bitbucket.org/robmoss/particle-filter-for-python/"                       
## [153] "https://bitbucket.org/mituq/muq2.git"                                            
## [154] "https://gitlab.com/energyincities/besos/"                                        
## [155] "https://gitlab.com/mmartin-lagarde/exonoodle-exoplanets/-/tree/master/"          
## [156] "https://gitlab.com/utopia-project/utopia"                                        
## [157] "https://gitlab.inria.fr/bramas/tbfmm"                                            
## [158] "https://bitbucket.org/meg/cbcbeat"                                               
## [159] "https://bitbucket.org/hammurabicode/hamx"                                        
## [160] "https://gitlab.com/davidwoodburn/itrm"                                           
## [161] "https://gitlab.com/tum-ciip/elsa"                                                
## [162] "https://gitlab.com/cosmology-ethz/ufig"                                          
## [163] "https://gitlab.com/picos-api/picos"                                              
## [164] "https://gitlab.com/binary_c/binary_c-python/"                                    
## [165] "https://bitbucket.org/cdegroot/wediff"                                           
## [166] "https://gitlab.com/QComms/cqptoolkit"                                            
## [167] "https://gitlab.com/toposens/public/ros-packages"                                 
## [168] "https://gitlab.inria.fr/azais/treex"                                             
## [169] "https://gitlab.com/pvst/asi"                                                     
## [170] "https://gitlab.com/chaver/choco-mining"                                          
## [171] "https://gitlab.com/cosmograil/PyCS3"                                             
## [172] "https://gitlab.com/MartinBeseda/sa-oo-vqe-qiskit.git"                            
## [173] "https://gitlab.com/davidtourigny/dynamic-fba"                                    
## [174] "https://bitbucket.org/likask/mofem-cephas"                                       
## [175] "https://bitbucket.org/cmutel/brightway2"                                         
## [176] "https://gitlab.com/geekysquirrel/bigx"                                           
## [177] "https://gitlab.com/dglaeser/fieldcompare"                                        
## [178] "https://gitlab.com/dlr-ve/esy/sfctools/framework/"                               
## [179] "https://gitlab.com/davidwoodburn/r3f"                                            
## [180] "https://gitlab.com/libreumg/dataquier.git"
## Query the GitHub API for metadata about each unique GitHub software
## repository: creation/update/push dates, stars, languages, topics, license
## and contributor counts. Failed queries (e.g. deleted repositories or rate
## limiting) yield NULL rows and are filled in from the archived data frame
## further below.
df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) {
    ## Normalize the URL: strip trailing "/" and ".git", enforce https, and
    ## truncate at any "/tree/" or "/blob/" path component
    u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
    if (grepl("/tree/", u0)) {
        u0 <- strsplit(u0, "/tree/")[[1]][1]
    }
    if (grepl("/blob/", u0)) {
        u0 <- strsplit(u0, "/blob/")[[1]][1]
    }
    ## Translate the repository URL into its GitHub API path
    ## ("/repos/<owner>/<repo>"). Computed once instead of repeating the
    ## substitution for every endpoint; the dots in "www." and "github.com"
    ## are escaped so that only literal dots match.
    api_path <- sub("^(https://)?(www\\.)?github\\.com/", "/repos/", u0)
    info <- try(gh(api_path))
    languages <- try(gh(paste0(api_path, "/languages"), .limit = 500))
    topics <- try(gh(paste0(api_path, "/topics"),
                     .accept = "application/vnd.github.mercy-preview+json",
                     .limit = 500))
    contribs <- try(gh(paste0(api_path, "/contributors"), .limit = 500))
    if (!inherits(info, "try-error") && length(info) > 1) {
        ## Contributor counts; NA if the contributors query failed or
        ## returned an empty list
        if (inherits(contribs, "try-error") || length(contribs) == 0) {
            repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
        } else {
            repo_nbr_contribs <- length(contribs)
            repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_))
            if (is.na(repo_nbr_contribs_2ormore)) {
                ## Diagnostic output in case any contribution count was NA
                print(contribs)
            }
        }
        
        ## "language1:bytes1,language2:bytes2,..." or "" if unavailable
        repolang <- ""
        if (!inherits(languages, "try-error") && length(languages) > 0) {
            lang_bytes <- unlist(languages)
            repolang <- paste(paste(names(lang_bytes), lang_bytes, sep = ":"),
                              collapse = ",")
        }
        
        ## Comma-separated repository topics, or "" if unavailable
        repotopics <- ""
        if (!inherits(topics, "try-error") && length(topics$names) > 0) {
            repotopics <- paste(unlist(topics$names), collapse = ",")
        }
        
        ## Scalar if/else rather than ifelse(): these are length-1 values,
        ## and ifelse() misbehaves when a branch value is NULL
        data.frame(repo_url = u, 
                   repo_created = info$created_at,
                   repo_updated = info$updated_at,
                   repo_pushed = info$pushed_at,
                   repo_nbr_stars = info$stargazers_count,
                   repo_language = if (is.null(info$language)) NA_character_ else info$language,
                   repo_languages_bytes = repolang,
                   repo_topics = repotopics,
                   repo_license = if (is.null(info$license)) NA_character_ else info$license$key,
                   repo_nbr_contribs = repo_nbr_contribs,
                   repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
        )
    } else {
        NULL
    }
})) %>%
    dplyr::mutate(dplyr::across(c(repo_created, repo_updated, repo_pushed),
                                as.Date)) %>%
    dplyr::distinct() %>%
    dplyr::mutate(repo_info_obtained = lubridate::today())
## Sanity check: each repository URL should occur at most once in df.
## Print some diagnostics before failing if duplicates are found.
if (anyDuplicated(df$repo_url) > 0) {
    print(length(unique(df$repo_url)))
    print(length(df$repo_url))
    print(df$repo_url[duplicated(df$repo_url)])
}
stopifnot(anyDuplicated(df$repo_url) == 0)
dim(df)
## [1] 2864   12
## For papers not represented in df (i.e., where the GitHub API query did
## not return a valid response), fall back to the archived data frame,
## keeping only the most recently obtained record for each repository
dfarchive <- papers_archive %>% 
    dplyr::select(dplyr::any_of(colnames(df))) %>%
    dplyr::filter(!(repo_url %in% df$repo_url)) %>%
    dplyr::arrange(dplyr::desc(repo_info_obtained)) %>%
    dplyr::distinct(repo_url, .keep_all = TRUE)
head(dfarchive)
## # A tibble: 6 × 12
##   repo_url    repo_created repo_updated repo_pushed repo_nbr_stars repo_language
##   <chr>       <date>       <date>       <date>               <int> <chr>        
## 1 https://gi… 2016-09-25   2025-10-07   2024-02-03              21 R            
## 2 https://gi… 2018-12-06   2025-11-02   2023-10-05              10 C#           
## 3 https://gi… 2018-10-05   2025-10-20   2021-11-17               8 JavaScript   
## 4 https://gi… 2024-04-15   2026-01-26   2025-11-19              11 Jupyter Note…
## 5 https://gi… 2020-06-02   2026-02-28   2025-02-10             112 Python       
## 6 https://gi… 2022-10-13   2026-03-03   2026-03-03              73 Python       
## # ℹ 6 more variables: repo_languages_bytes <chr>, repo_topics <chr>,
## #   repo_license <chr>, repo_nbr_contribs <int>,
## #   repo_nbr_contribs_2ormore <int>, repo_info_obtained <date>
dim(dfarchive)
## [1] 576  12
## Combine newly fetched and archived repository records; each repo_url
## must still be unique after the merge
df <- dplyr::bind_rows(df, dfarchive)
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)
## [1] 3440   12
## Attach the repository information to the papers table (papers without
## repository info get NAs for the new columns)
papers <- papers %>% dplyr::left_join(df, by = "repo_url")
dim(papers)
## [1] 3457   70
## Record the provenance ("sw-github") of all columns added in this section
source_track <- local({
    new_cols <- setdiff(colnames(papers), names(source_track))
    c(source_track, setNames(rep("sw-github", length(new_cols)), new_cols))
})

Clean up a bit

## Convert the publication date to Date format, derive the half year of
## publication (H1 = Jan-Jun, H2 = Jul-Dec) as an ordered factor, and
## count the number of authors for each paper
papers <- papers %>%
    dplyr::select(-reference, -license, -link) %>%
    dplyr::mutate(
        published.date = as.Date(published.print),
        halfyear = paste0(year(published.date),
                          ifelse(month(published.date) <= 6, "H1", "H2")),
        halfyear = factor(halfyear,
                          levels = paste0(rep(sort(unique(year(published.date))),
                                              each = 2), c("H1", "H2"))),
        nbr_authors = vapply(author, function(a) nrow(a), NA_integer_)
    )
dim(papers)
## [1] 3457   70
## Show (up to ten) entries that are exact duplicates of another row,
## ordered by their alternative id
dupidx <- which(papers$alternative.id %in%
                    papers$alternative.id[duplicated(papers)])
head(arrange(papers[dupidx, ], alternative.id), n = 10)
## # A tibble: 0 × 70
## # ℹ 70 variables: alternative.id <chr>, container.title <chr>, created <chr>,
## #   deposited <chr>, published.print <chr>, doi <chr>, indexed <chr>,
## #   issn <chr>, issue <chr>, issued <chr>, member <chr>, page <chr>,
## #   prefix <chr>, publisher <chr>, score <chr>, source <chr>,
## #   reference.count <chr>, references.count <chr>,
## #   is.referenced.by.count <chr>, title <chr>, type <chr>, url <chr>,
## #   volume <chr>, short.container.title <chr>, author <list>, …
## Remove fully duplicated rows (all columns identical)
papers <- papers %>% dplyr::distinct()
dim(papers)
## [1] 3457   70
## Record the provenance ("cleanup") of all columns added in this section
source_track <- local({
    new_cols <- setdiff(colnames(papers), names(source_track))
    c(source_track, setNames(rep("cleanup", length(new_cols)), new_cols))
})

Tabulate number of missing values

In some cases, fetching information from external sources (e.g., the GitHub API) fails for a subset of the publications. There are also other reasons for missing values; for example, the earliest submissions do not have an associated pre-review issue. The table below lists the number of missing values for each variable in the data frame.

## Interactive table with the number of missing values per variable,
## together with the processing step in which each variable was added
DT::datatable(
    data.frame(
        variable = colnames(papers),
        nbr_missing = colSums(is.na(papers)),
        source = source_track[colnames(papers)]
    ),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of published papers per month

## Count published papers per calendar month and display as a bar chart
monthly_pubs <- papers %>%
    dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
    dplyr::count(pubmonth, name = "npub")
ggplot(monthly_pubs, aes(x = factor(pubmonth), y = npub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of published papers per month", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Interactive table of the monthly publication counts
DT::datatable(
    dplyr::rename(monthly_pubs,
                  "Month of publication" = "pubmonth",
                  "Number of papers" = "npub"),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of published papers per year

## Count published papers per year and display as a bar chart
yearly_pubs <- papers %>%
    dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
    dplyr::count(pubyear, name = "npub")
ggplot(yearly_pubs, aes(x = factor(pubyear), y = npub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of published papers per year", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Interactive table of the yearly publication counts
DT::datatable(
    dplyr::rename(yearly_pubs,
                  "Year of publication" = "pubyear",
                  "Number of papers" = "npub"),
    escape = FALSE, rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
)

Number of submissions per month

We use the number of pre-review issues opened in a given month as a proxy for the number of submissions in that month.

## Count pre-review issues opened per month (a proxy for submissions)
## and display as a bar chart
monthly_subs <- prereview_issues |>
    dplyr::mutate(submonth = lubridate::floor_date(opened, "month")) |>
    dplyr::count(submonth, name = "nsub")
ggplot(monthly_subs, aes(x = factor(submonth), y = nsub)) +
    geom_col() +
    theme_minimal() +
    labs(x = "", y = "Number of submissions per month", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))