Setting up and defining custom functions

Set the working directory, load up R packages and data

Remember to re-set the working directory and install the packages first if needed.

library(dplyr) # data handling 
library(stringr) # string manipulation
library(ggplot2) # plots
library(parallel) # parallel computing
library(lme4) # mixed models
library(lmerTest) # mixed models
library(pander) # nice Markdown tables
library(tidyr) # spread and gather functions
library(gridExtra) # arranging ggplots
library(mgcv) # generalised additive models
library(ggbeeswarm) # beeswarm plots
library(brms) # Bayesian models for meta-analysis
library(Cairo) # for exporting the character 'alpha' via ggsave

# Using a database allows quick access to the whole PubMed dataset without loading the whole thing to memory.
# You can download the pubmed_gender_data.sqlite3 database here: https://osf.io/bt9ya/
pubmed_db <- src_sqlite("~/Desktop/Gender Pubmed mining project/data for analysis/pubmed_gender_data.sqlite3", create = F)
pubmed_sqlite <- tbl(pubmed_db, "papers") # information about each individual paper
journals_sqlite <- tbl(pubmed_db, "journals") # information about each journal

Function to retrieve data from the SQLite database

Importantly, this function discards all papers where we do not know the gender of every single author. This is conservative but greatly simplifies the test: we’d have to be careful how we incorporated information from papers where some authors’ genders were unknown. Our failures to identify author gender are almost certainly randomly distributed across papers with respect to gender homophily, and so this omission makes our tests less sensitive but should not bias their outcomes.

The function returns all the data that we need to run the test on every journal for which we found at least minimum.n suitable papers published within the specified time window.

# This function converts dates into a different format, namely time before 'the present' in years. 
# 'The present' is defined as 20/08/2016 which is the date that Luke downloaded a local copy of the Medline database.
convert.dates <- function(dates) {
  dates <- as.numeric(as.Date(dates, "%d_%m_%Y")) - 
    as.numeric(as.Date("20_08_2016","%d_%m_%Y")) # Convert the date to days before the present
  return(dates / 365.25)  # Convert from days to years
}
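
# Quick illustrative check of convert.dates() (not part of the analysis): the download date maps to zero,
# and a paper dated exactly one year earlier maps to roughly -1 'years before present'
convert.dates(c("20_08_2016", "20_08_2015"))  # approximately c(0, -1.002)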

retrieve.authorship.data <- function(journals, 
                                     minimum.n, 
                                     start.of.time.period, 
                                     end.of.time.period, 
                                     restrict.by.n.authors = NULL,
                                     file.name, 
                                     over.write = FALSE){
  
  # Only do the test if over.write = TRUE or there is no datafile in the working directory
  if(over.write | !file.exists(file.name)){ 
    
    # Declare variables to hold the output
    journal.names <- rep(NA, 10^5)
    count.data <- list()
    counter <- 0
    
    # Loop over all the journals, try to get the data, and if 
    # there is enough, store the data we need as a list entry in 'count.data'
    for(i in 1:length(journals)){
      
      # Get the relevant data for the focal journal out of the database using dplyr
      # the gender.95 column contains author genders in a format like "MFU" - 
      # MFU means the first author was male, second female, and third was of unknown gender. 
      # Genders were only recorded as M or F if the given name is associated with one gender >=95% of the time.
      
      focal <- pubmed_sqlite %>% 
        filter(journal == journals[i]) %>% # restrict to focal journal
        select(gender.95, date) %>%        # get these columns
        collect(n = Inf) %>%               # get the data into memory
        filter(nchar(gender.95) > 1)       # only keep papers with >1 author
      
      # Potentially further restrict to 2-author papers, 3-author papers... or 5 or more author papers
      if(!is.null(restrict.by.n.authors)) {
        if(restrict.by.n.authors != 5) focal <- focal %>% filter(nchar(gender.95) == restrict.by.n.authors)
        else if(restrict.by.n.authors == 5) focal <- focal %>% filter(nchar(gender.95) >= 5)
      }
      
      # Convert the date column into 'years before present', to the nearest day
      focal$date <- convert.dates(focal$date)
      
      # Restrict the analysis to a specified period of time. My logic is that if the gender ratio 
      # changes over time, it will give the false impression of homophily, as explained in 
      # Bergstrom et al.'s comment here: http://www.michaeleisen.org/blog/?p=1931
      # For example for papers from the last one year in the dataset, set start.of.time.period = 1 and end.of.time.period = 0
      # For papers from between 5 and 6 years ago, set start.of.time.period = 6 and end.of.time.period = 5
      focal <- focal %>% 
        filter(date > (start.of.time.period * -1), 
               date < (end.of.time.period * -1)) %>% 
        select(-date) 
      
      # Proceed if the sample size is big enough
      if(nrow(focal) >= minimum.n){ 
        
        focal <- (focal %>% as.data.frame())[,1] # Get the gender column as a character vector
        focal <- focal[!str_detect(focal, "U")] # to keep it simple, let's only include papers where we know all the authors' genders
        
        # And if the sample size is STILL big enough...
        if(length(focal) >= minimum.n){ 
          
          # Make a summary of the number of M and F authors on each paper, and the gender of the first and last author
          n.char <- nchar(focal)
          counts <- data.frame(nMales = str_count(focal, "M"), 
                               nFemales = str_count(focal, "F"),
                               firstF = substr(focal, 1, 1) == "F",
                               lastF = substr(focal, n.char, n.char) == "F")
          rm(focal) # Explicitly discard 'focal', to reduce memory footprint
          
          counter <- counter + 1 # Save the data and increment the counter
          count.data[[counter]] <- counts
          journal.names[counter] <- journals[i]
        } 
        
        else rm(focal) # Explicitly discard 'focal' anyway
      } 
    } 
    names(count.data) <- journal.names[!is.na(journal.names)] # Name each entry in the list with the title of the journal the data are from
    
    saveRDS(count.data, file = file.name)
  }
}

Define a function to calculate Bergstrom et al.’s “coefficient of homophily”, \(\alpha\)

These two functions estimate \(\alpha\) for a set of papers’ author lists. The version find.alpha.CIs() performs bootstrap resampling of the dataset to obtain the 95% confidence limits on \(\alpha\). find.alpha() calculates \(\alpha\) without bootstrapping, making it much faster (we use it later when calculating the null distribution of \(\alpha\)).

\(\alpha\) is defined as p - q, and is termed “the coefficient of homophily”. p is the probability that a randomly-chosen co-author of a male author is a man, and q is the probability that a randomly-chosen co-author of a female author is a man. Thus, positive \(\alpha\) means that men coauthor with men, and women coauthor with women, more often than expected under the null model that coauthorship is random with respect to gender. Negative \(\alpha\) means the opposite: men and women coauthor together more often than expected.

For more information about \(\alpha\), see this working paper by Bergstrom et al.: http://eigenfactor.org/gender/assortativity/measuring_homophily.pdf. Also, see the comments section of this blog post: http://www.michaeleisen.org/blog/?p=1931. Bergstrom et al. point out that \(\alpha\) is equivalent to the Pearson correlation coefficient between the genders of authors on a paper.
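
For example, consider three two-author papers whose author genders are MM, MF and FF. Two of the three male authorships have a male co-author, so \(p = 2/3\); only one of the three female authorships has a male co-author, so \(q = 1/3\), giving \(\alpha = p - q = 1/3\) (i.e. homophily).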

# The objects nMales and nFemales should give the number of male and female authors on a set of papers
# The function is written assuming that all papers have two or more authors of known gender. 
find.alpha <- function(nMales, nFemales){
  
  # Number of authors per paper, minus one
  nAuthors <- nMales + nFemales - 1
  
  # If you were to pick a SECOND random author from each paper, 
  # after picking a MALE the first time, what is the probability that the author is male?
  p.second.pick.is.male.M <- (nMales - 1) / nAuthors
  
  # If you were to pick a SECOND random author from each paper, 
  # after picking a FEMALE the first time, what is the probability that the author is male?
  p.second.pick.is.male.F <- nMales / nAuthors
  
  # If the probability of picking a male was zero (or one) the first time, 
  # make sure it's still zero (or one) for the second pick
  p.second.pick.is.male.M[nMales == 0] <- 0 
  p.second.pick.is.male.F[nMales == 0] <- 0 
  p.second.pick.is.male.M[nFemales == 0] <- 1 
  p.second.pick.is.male.F[nFemales == 0] <- 1
  
  # Find average proportion of male authors across all the men (p),
  # and average proportion of male authors across all the women (q). Subtract to get alpha
  return(sum(nMales * p.second.pick.is.male.M / sum(nMales)) - 
         sum(nFemales * p.second.pick.is.male.F / sum(nFemales)))
}
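
# Illustrative check (toy data, not part of the analysis): three two-author papers with author 
# genders MM, MF and FF give p = 2/3 and q = 1/3, matching the worked example above
find.alpha(nMales = c(2, 1, 0), nFemales = c(0, 1, 2))  # 1/3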


# This version uses bootstrapping to also estimate the 95% confidence limits on alpha
# Note that papers with lots of authors are more likely to be picked, ensuring that 
# each individual author is equally likely to end up in the sample. So if
# 2-author vs 10-author papers have different amounts of homophily, or 
# different gender ratios, the CIs should not be biased
find.alpha.CIs <- function(nMales, nFemales, nBoots, correction = NULL){
  
  counts <- data.frame(nMales = nMales, nFemales = nFemales)
  num.papers <- nrow(counts)
  num.authors <- rowSums(counts)
  
  # Resample 'num.papers' papers with replacement, 'nBoots' times. 
  # Papers with lots of authors are more likely to be picked
  boot.alpha <- counts %>% 
    sample_n(size = num.papers * nBoots,   
             replace = TRUE,
             weight = num.authors) %>%
    mutate(replicate = rep(1:nBoots, each = num.papers)) %>%
    group_by(replicate) %>%     # Calculate alpha on each random sample
    summarise(alpha = find.alpha(nMales, nFemales)) %>% .$alpha
  
  # Optionally, apply a correction to get alpha' (e.g. if alpha is expected to 
  # be -0.05 under the null, add 0.05 to get alpha')
  if(!is.null(correction)){
    boot.alpha <- boot.alpha - correction
    alpha <- find.alpha(nMales, nFemales) - correction
  }
  
  else alpha <- find.alpha(nMales, nFemales)
  
  # If any re-samples are entirely male or entirely female (usually very rare), they generate NaNs. Remove them
  boot.alpha <- boot.alpha[!is.nan(boot.alpha)] 

  c(alpha, as.numeric(quantile(boot.alpha, probs = c(0.025, 0.975)))) 
}
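
# Illustrative usage of find.alpha.CIs() on simulated toy data (assumed parameters, not drawn from 
# the real dataset): genders are assigned at random here, so alpha should be close to zero.
# The function returns c(alpha, lower 95% CI, upper 95% CI)
set.seed(1)
toy.n <- sample(2:4, 200, replace = TRUE)        # authors per paper
toy.m <- rbinom(200, size = toy.n, prob = 0.6)   # male authors per paper, assigned at random
find.alpha.CIs(nMales = toy.m, nFemales = toy.n - toy.m, nBoots = 1000)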


# This version calculates the difference in prop. male co-authors between male and female first (or last) authors
# The is.female column is boolean, and identifies whether each paper's first (or last) author is female
find.alpha.specific.author <- function(nMales, nFemales, is.female){
  
  # Find the proportion of male coauthors for each first (or last) author
  nMales[!is.female] <- nMales[!is.female] - 1 # If first/last author is male, don't count him among coauthors
  nFemales[is.female] <- nFemales[is.female] - 1 # If first/last author is female, don't count her among coauthors
  prop.male <- nMales / (nMales + nFemales) # proportion of male coauthors
  
  return(sum(prop.male[!is.female]) / length(prop.male[!is.female]) - 
         sum(prop.male[is.female])  / length(prop.male[is.female]))
}
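
# Illustrative check (toy data): two 2-author papers with genders "MF" and "FM", treating the first 
# author as the focal author. The male first author's only coauthor is a woman, and the female first 
# author's only coauthor is a man, so the statistic is 0 - 1 = -1 (maximal heterophily)
find.alpha.specific.author(nMales = c(1, 1), nFemales = c(1, 1), is.female = c(FALSE, TRUE))  # -1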


# Same, but it also gets the 95% CIs by bootstrapping
# Here, the first authors (or last authors) are the unit of replication, and there is always 1 per paper.
# So, no need to sample papers with lots of authors more frequently
find.alpha.specific.author.CIs <- function(nMales, nFemales, 
                                           nBoots, is.female, 
                                           correction = NULL){
  
  counts <- data.frame(nMales = nMales, 
                       nFemales = nFemales, 
                       is.female = is.female)
  num.papers <- nrow(counts)
  
  boot.alpha <- counts %>% 
    sample_n(size = num.papers * nBoots,   
             replace = TRUE) %>%
    mutate(replicate = rep(1:nBoots, each = num.papers)) %>%
    group_by(replicate) %>%  # Calculate alpha on each random sample
    summarise(alpha = find.alpha.specific.author(nMales, nFemales, is.female)) %>% .$alpha
  
  # Optionally, apply a correction to get alpha'
  if(!is.null(correction)){
    boot.alpha <- boot.alpha - correction
    alpha <- find.alpha.specific.author(nMales, nFemales, is.female) - correction
  }
  
  else alpha <- find.alpha.specific.author(nMales, nFemales, is.female)
  
  # If any re-samples chanced to be entirely male/female, they generate NaNs. Remove them
  boot.alpha <- boot.alpha[!is.nan(boot.alpha)] 
  
  c(alpha, as.numeric(quantile(boot.alpha, probs = c(0.025, 0.975)))) 
}

Define a function to calculate \(\alpha\) for a given journal

For each journal, the function calculates \(\alpha\), and its 95% confidence limits. It also generates a two-tailed p-value for the observed \(\alpha\), under the null hypothesis that authors assort randomly with respect to gender. Small p-values mean that the observed set of papers shows significantly more, or less, gender homophily than expected under the null.

There is a good reason to test whether \(\alpha\) is significantly greater (or lower) than expected under the null, rather than simply checking whether the 95% CIs on \(\alpha\) overlap zero. The reason is that the null expectation for \(\alpha\) is not centred on zero for small datasets. This issue was summed up clearly by Prof. Carl Bergstrom in a comment on a blog post (http://www.michaeleisen.org/blog/?p=1931). Prof. Bergstrom wrote:

“Consider a tiny field with four authors, two men and two women, and one paper in each of the six two-author combinations. Now if you pick a man author at random, you are twice as likely to pick the man in one of the four man-woman papers as you are to pick a man in the man-man paper. Therefore p = 1/3. But if you pick a randomly chosen woman, again you’re more likely to pick a man-woman combination than the one woman-woman combination, and q = 2/3. As a result, \(\alpha\) = -1/3 even though the authorships seem to be distributed without gender bias.”

So, \(\alpha\) is sensitive to the size of the dataset being tested (at least for very small datasets). Our solution is to simulate many fake datasets, where authors are distributed randomly with respect to gender, but the datasets are otherwise identical (in terms of numbers of authorships by men and women, the number of papers, and the distribution of authors per paper). This is what this function does, when calculating whether \(\alpha\) is greater/lower than expected under random assortment by gender.

The function also returns a ‘corrected’ version of \(\alpha\), which is simply \(\alpha\) minus the null expected value of \(\alpha\) for this particular dataset (calculated by our simulation). Because the null expected \(\alpha\) is generally close to zero, the corrected version of \(\alpha\) is usually almost identical. However, the corrected version can deviate a little (as much as 0.05) for the smallest datasets.
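
To check this, we can reproduce Bergstrom’s toy example with the find.alpha() function defined above, encoding the six two-author papers directly as counts of male and female authors:

# Bergstrom's toy example: one paper for each pair of two men and two women (MM, MF, MF, MF, MF, FF)
find.alpha(nMales = c(2, 1, 1, 1, 1, 0), nFemales = c(0, 1, 1, 1, 1, 2))  # returns -1/3, as expected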

# - if a vector is entered for is.female, then the test calculates alpha for a specific authorship position (i.e. first or last); otherwise, it calculates alpha across all authorship positions
# - the time window was already applied by retrieve.authorship.data(); we focused on papers published within 1 year of the present, because if the gender ratio of authors changes over time (which it does), it will generate spurious coauthorship gender homophily 
# - the minimum sample size was also applied by retrieve.authorship.data(): journals were only included if they had enough papers fitting the criteria (i.e. published within the time window, with at least 2 authors, where we know all authors' genders with at least 95% confidence)
gender.grouping.test.one.journal <- function(nMales, nFemales, is.female = NULL, journal, nBoots, nSimulations){
  print(journal)
  
  # First count number (and distribution) of authors that we will swap around
  total.males <- sum(nMales)
  total.females <- sum(nFemales)
  authors.per.paper <- nMales + nFemales
  
  # If we are looking at first or last authors, we will not be swapping them, so subtract accordingly
  if(!is.null(is.female)){ 
    num.male.focal.authors <- sum(!is.female) 
    num.female.focal.authors <- sum(is.female)
    # Quit unless there is a mixture of male and female first/last authors
    if(num.male.focal.authors == 0 | num.female.focal.authors == 0) return(NULL) 
    total.males <- total.males - num.male.focal.authors
    total.females <- total.females - num.female.focal.authors
    authors.per.paper <- authors.per.paper - 1
  }
  
  # Now, let's simulate the expected alpha under the null hypothesis that authors 
  # publish together randomly with respect to gender.
  # Simulate 'nSimulations' fake datasets, which have the same number of papers, 
  # and the same distribution of author list lengths per papers, as the real data,
  # but male and female authors are randomly grouped across papers.
  # The code here is ugly, but is vectorised for speed. Essentially I generate 
  # num.authors * nSimulations random numbers between zero and one, convert them 
  # to ranks, and use these ranks as index positions in 'pasted.gender'. This 
  # allows all the random numbers to be generated in one call, saving time. 
  # Then, it's just a matter of prettifying tapply's output using unname and unlist
  pasted.gender <- c(rep(1, total.males), rep(0, total.females))
  num.papers <- length(nMales)
  num.authors <- total.males + total.females
  simulated.data <- data.frame(
    gender = pasted.gender[unlist(unname(tapply(runif(num.authors * nSimulations), 
                                                rep(1:nSimulations, each = num.authors), rank)))],
    paperID = rep(unlist(mapply(rep, 1:num.papers, each = authors.per.paper)), nSimulations),
    replicate = rep(1:nSimulations, each = num.authors))
  rm(pasted.gender)
  
  # Calculate number of males and females on each simulated paper, in each replicate
  simulated.data <- simulated.data %>% 
    group_by(replicate, paperID) %>% 
    summarise(nFemales = sum(gender == 0), 
              nMales = sum(gender == 1)) %>% 
    select(-paperID)
  
  # Find alpha for each simulation replicate
  
  # If we are doing ALL the author positions, use find.alpha()
  if(is.null(is.female)){
    simulated.alpha.under.null <- simulated.data %>%
      group_by(replicate) %>%
      summarise(alpha = find.alpha(nMales, nFemales)) %>% .$alpha
    
    median.alpha.under.null <- median(simulated.alpha.under.null)
    
    # Calculate the observed alpha and its 95% CIs, both with and 
    # without adjusting for the fact that the null expected alpha can be non-zero
    observed.alpha <- find.alpha.CIs(nMales, nFemales, nBoots = nBoots)
    observed.alpha.adjusted <- find.alpha.CIs(nMales, 
                                              nFemales, nBoots = nBoots, 
                                              correction = median.alpha.under.null)
  }
  
  else{ # Otherwise, use find.alpha.specific.author()
    simulated.data$is.female <- rep(is.female, nSimulations) # Add the first/last authors back in
    simulated.data$nMales[!simulated.data$is.female] <- 
      simulated.data$nMales[!simulated.data$is.female] + 1
    simulated.data$nFemales[simulated.data$is.female] <- 
      simulated.data$nFemales[simulated.data$is.female] + 1
    
    simulated.alpha.under.null <- simulated.data %>%
      group_by(replicate) %>%
      summarise(alpha = find.alpha.specific.author(nMales, nFemales, is.female)) %>% .$alpha
    
    median.alpha.under.null <- median(simulated.alpha.under.null)
    
    observed.alpha <- find.alpha.specific.author.CIs(
      nMales, nFemales, is.female, nBoots = nBoots)
    observed.alpha.adjusted <- find.alpha.specific.author.CIs(
      nMales, nFemales, is.female, nBoots = nBoots, correction = median.alpha.under.null)
  }
  
  # The 2-tailed p-value is the proportion of null-simulated alpha values with 
  # a value at least as far from zero as the observed alpha
  # Thus, a significant p-value means that the observed alpha is larger OR smaller than expected
  two.tail.p.value <- sum(abs(simulated.alpha.under.null) >= 
                            abs(observed.alpha[1])) / length(simulated.alpha.under.null)
  
  # Return the pertinent results
  data.frame(journal = journal,
             n.useable.papers = num.papers,
             n.authors = num.authors,
             gender.ratio.of.sample = 100 * total.females /
               (total.females + total.males),
             observed.alpha = observed.alpha[1],
             lower.95.CI = observed.alpha[2],
             upper.95.CI = observed.alpha[3],
             alpha.adjusted = observed.alpha.adjusted[1],
             lower.95.CI2 = observed.alpha.adjusted[2],
             upper.95.CI2 = observed.alpha.adjusted[3],
             two.tail.p.value = two.tail.p.value)
}
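
# For intuition, a single replicate of the null simulation above (for the all-author case, i.e. 
# is.female = NULL) can be written less efficiently but more readably as the sketch below. This is 
# illustrative only and is not used in the analysis: it shuffles the pooled author genders across 
# papers while keeping each paper's author count fixed, then recomputes alpha
one.null.alpha <- function(nMales, nFemales){
  authors.per.paper <- nMales + nFemales
  shuffled <- sample(c(rep(1, sum(nMales)), rep(0, sum(nFemales))))  # permute genders across papers
  paperID <- rep(seq_along(authors.per.paper), times = authors.per.paper)
  find.alpha(nMales = tapply(shuffled, paperID, sum),
             nFemales = tapply(1 - shuffled, paperID, sum))
}
# e.g. one.null.alpha(nMales = c(2, 1, 0), nFemales = c(0, 1, 2))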

Define a function for parallelisation

Runs gender.grouping.test.one.journal() on all the journals in ‘journals’, in parallel across multiple CPUs.

gender.grouping.test.many.journals <- function(count.data, authors, output.file, nBoots, nSimulations, number.of.cores, over.write = FALSE){
  
  # Only do the test if over.write = TRUE or there is no datafile in the working directory
  if(over.write | !file.exists(output.file)){ 
    journals <- names(count.data)
    nJournals <- length(count.data)
    # Set up a cluster
    cl <- makePSOCKcluster(number.of.cores)
    setDefaultCluster(cl)
    clusterExport(NULL, c("find.alpha",
                          "find.alpha.CIs", 
                          "find.alpha.specific.author",
                          "find.alpha.specific.author.CIs",
                          "gender.grouping.test.one.journal",
                          "pubmed_sqlite"))
    clusterEvalQ(NULL, library(dplyr))
    
    # Run the desired test on each journal, in parallel
    if(authors == "all"){
      results <- parLapply(NULL, 1:nJournals, function(i) 
        gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
                                         count.data[[journals[i]]]$nFemales,
                                         NULL,
                                         journal = journals[i],
                                         nBoots = nBoots,
                                         nSimulations = nSimulations))
    }
    
    else if(authors == "first"){
      results <- parLapply(NULL, 1:nJournals, function(i) 
        gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
                                         count.data[[journals[i]]]$nFemales,
                                         count.data[[journals[i]]]$firstF,
                                         journal = journals[i],
                                         nBoots = nBoots,
                                         nSimulations = nSimulations))
    }
    
    else if(authors == "last"){
      results <- parLapply(NULL, 1:nJournals, function(i) 
        gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
                                         count.data[[journals[i]]]$nFemales,
                                         count.data[[journals[i]]]$lastF,
                                         journal = journals[i],
                                         nBoots = nBoots,
                                         nSimulations = nSimulations))
    }
    
    setDefaultCluster(NULL) # close cluster and write the results to disk
    stopCluster(cl = cl)
    write.csv(do.call("rbind", results), file = output.file, row.names = FALSE)
  }
}

Sample size information

Table S1: Sample sizes for the two datasets, which comprise papers published in the timeframes August 2005 - August 2006, and August 2015 - August 2016.

minimum.n <- 50   # Only include journals with at least 50 papers in the specified range of time
unique.journals <- pubmed_sqlite %>% 
  select(journal) %>% distinct() %>% 
  collect(n = Inf) %>% .$journal # Get a list of all the journals in the database

# Get all the data we need out of the database, and save as an RDS file - for papers in the last 12 months
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 1, 
                         end.of.time.period = 0, 
                         file.name = "data/last.year.rds", 
                         over.write = FALSE) 

# Get all the data we need out of the database, and save as an RDS file - for papers that are between 10 and 11 years old
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 11, 
                         end.of.time.period = 10, 
                         file.name = "data/older.papers.rds", 
                         over.write = FALSE) 

# Get all the data we need out of the database, and save as an RDS file - for just the 2-author papers in the last 12 months
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 1, 
                         end.of.time.period = 0, 
                         restrict.by.n.authors = 2,
                         file.name = "data/last.year_2author.rds", 
                         over.write = FALSE) 

# Get all the data we need out of the database, and save as an RDS file - for just the 3-author papers in the last 12 months
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 1, 
                         end.of.time.period = 0, 
                         restrict.by.n.authors = 3,
                         file.name = "data/last.year_3author.rds", 
                         over.write = FALSE) 

# Get all the data we need out of the database, and save as an RDS file - for just the 4-author papers in the last 12 months
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 1, 
                         end.of.time.period = 0, 
                         restrict.by.n.authors = 4,
                         file.name = "data/last.year_4author.rds", 
                         over.write = FALSE) 

# Get all the data we need out of the database, and save as an RDS file - for just the 5-plus-author papers in the last 12 months
retrieve.authorship.data(unique.journals, 
                         minimum.n = minimum.n, 
                         start.of.time.period = 1, 
                         end.of.time.period = 0, 
                         restrict.by.n.authors = 5, # NB this actually means "5 or more"
                         file.name = "data/last.year_5author.rds", 
                         over.write = FALSE) 

# Get metadata about each journal (research discipline, ISI impact factor, and the country that publishes it)
# see Holman et al. in prep, for information on how journals were assigned to disciplines
journal.data <- journals_sqlite %>% collect(n=Inf) %>% 
  rename(journal = short.title, 
         discipline = Discipline, 
         country = journal.country) %>% 
  select(journal, discipline, country, IF) %>% 
  as.data.frame %>% 
  distinct(journal, .keep_all = T)


make.sample.size.table <- function(filename){
  
  count.data <- readRDS(paste("data/", filename, ".rds", sep = "")) 
  data.frame(Quantity = c("Number of disciplines", 
                          "Number of journals", 
                          "Number of papers",
                          "Number of authors",
                          "Median number of papers per journal", 
                          "Median number of authors per journal",
                          "Median number of authors per paper"),
             Value = c(journal.data %>% filter(journal %in% names(count.data)) %>% 
                         select(discipline) %>% 
                         distinct() %>% nrow(),
                       length(count.data),
                       nrow(do.call("rbind", count.data)[,1:2]),
                       sum(do.call("rbind", count.data)[,1:2]),
                       median(sapply(count.data, nrow)),
                       median(sapply(count.data, function(x) sum(x[,1]) + sum(x[,2]))),
                       median(rowSums(do.call("rbind", count.data)[,1:2])))) 
}

if(!file.exists("manuscript/sample.size.table.csv")){
  sample.size.table <- make.sample.size.table("older.papers") %>% 
    rename(`n (2005-2006)` = Value) %>% 
    left_join(make.sample.size.table("last.year") %>% 
                rename(`n (2015-2016)` = Value), by = "Quantity")
  write.csv(sample.size.table, file = "manuscript/sample.size.table.csv", row.names = FALSE)
}

read.csv("manuscript/sample.size.table.csv") %>% pander(split.cell = 40, split.table = Inf)

Quantity                                n (2005-2006)   n (2015-2016)
------------------------------------    -------------   -------------
Number of disciplines                   101             107
Number of journals                      1192            2116
Number of papers                        151652          276879
Number of authors                       647634          1311213
Median number of papers per journal     87              87
Median number of authors per journal    371             413
Median number of authors per paper      4               4

Calculating \(\alpha'\)

Run the test on every journal for which we have sufficient data

Here, ‘sufficient data’ means that there are at least 50 papers published in the last year, with 2+ authors, for which we know all authors’ genders. We calculate \(\alpha\) for three types of authors:

  1. All authors; this alpha statistic is positive if the average man author has a co-author gender ratio that is more male-biased than that of the average woman author.
  2. First authors; this alpha statistic is positive if the average man first author has a co-author gender ratio that is more male-biased than that of the average woman first author.
  3. Last authors; this alpha statistic is positive if the average man last author has a co-author gender ratio that is more male-biased than that of the average woman last author.

# Calculate alpha across all authorship positions for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
                                   "all",
                                   "data/homophily.results.all.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha for first authors for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
                                   "first",
                                   "data/homophily.results.first.authors.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha for last authors for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
                                   "last",
                                   "data/homophily.results.last.authors.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha across all authorship positions for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
                                   "all",
                                   "data/older.homophily.results.all.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha for first authors for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
                                   "first",
                                   "data/older.homophily.results.first.authors.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha for last authors for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
                                   "last",
                                   "data/older.homophily.results.last.authors.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha across all authorship positions for all suitable NEW papers with TWO authors
gender.grouping.test.many.journals(readRDS("data/last.year_2author.rds"),
                                   "all",
                                   "data/homophily.results.2author.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha across all authorship positions for all suitable NEW papers with THREE authors
gender.grouping.test.many.journals(readRDS("data/last.year_3author.rds"),
                                   "all",
                                   "data/homophily.results.3author.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha across all authorship positions for all suitable NEW papers with FOUR authors
gender.grouping.test.many.journals(readRDS("data/last.year_4author.rds"),
                                   "all",
                                   "data/homophily.results.4author.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

# Calculate alpha across all authorship positions for all suitable NEW papers with FIVE authors
gender.grouping.test.many.journals(readRDS("data/last.year_5author.rds"),
                                   "all",
                                   "data/homophily.results.5author.csv",
                                   nBoots = 1000, 
                                   nSimulations = 1000, 
                                   number.of.cores = 8,
                                   over.write = FALSE)

Add additional metadata to the \(\alpha\) results

Here, we assign extra information to each journal: research discipline (see Holman et al. 2018), ISI impact factor (where available), and the country that publishes the journal (where known; information taken from PubMed).

# Calculate the impact factor of each journal relative to other journals in its discipline (using residuals from a mixed model)
journal.data$logIF <- log10(journal.data$IF) # Take the log10 of the impact factor
journal.data$standardised.IF <- NA
# Calculate residual log10 IF for each journal (i.e. IF relative to the field that it is in)
journal.data$standardised.IF[!is.na(journal.data$logIF)] <- 
  resid(lmer(logIF ~ (1|discipline), data = journal.data[!is.na(journal.data$logIF), ])) 

# Function to neatly merge the journal-specific data with the gender homophily results
# Also add another column of p-values, which are False Discovery Rate-corrected using B-H method
merge.and.clean <- function(homophily.filepath, journal.data){
  left_join(read.csv(homophily.filepath, stringsAsFactors = FALSE), 
            journal.data, 
            by = "journal") %>% 
    select(discipline, 
           journal, 
           standardised.IF, 
           country,
           n.useable.papers, 
           n.authors, 
           gender.ratio.of.sample, 
           observed.alpha, 
           lower.95.CI, 
           upper.95.CI, 
           alpha.adjusted, 
           lower.95.CI2, 
           upper.95.CI2, 
           two.tail.p.value) %>% 
    arrange(-n.useable.papers) %>% # Sort by sample size
    mutate(adjusted.p.value = p.adjust(two.tail.p.value, method = "BH")) 
}

# Load up all the homophily results, and merge them with the journal-specific information
gender.homophily.all <- merge.and.clean("data/homophily.results.all.csv", journal.data)
gender.homophily.first <- merge.and.clean("data/homophily.results.first.authors.csv", journal.data)
gender.homophily.last <- merge.and.clean("data/homophily.results.last.authors.csv", journal.data)

gender.homophily.all.old <- merge.and.clean("data/older.homophily.results.all.csv", journal.data)
gender.homophily.first.old <- merge.and.clean("data/older.homophily.results.first.authors.csv", journal.data)
gender.homophily.last.old <- merge.and.clean("data/older.homophily.results.last.authors.csv", journal.data)

# Bind all 6 datasets into a single one, with columns 'Subset' and 'Age' to identify them
master.dataset <- rbind(gender.homophily.all,
                        gender.homophily.first,
                        gender.homophily.last,
                        gender.homophily.all.old,
                        gender.homophily.first.old,
                        gender.homophily.last.old) %>%
  mutate(Subset = unlist(mapply(rep, c("All authors", "First authors", "Last authors"), 
                                each = c(nrow(gender.homophily.all), 
                                         nrow(gender.homophily.first),
                                         nrow(gender.homophily.last),
                                         nrow(gender.homophily.all.old),
                                         nrow(gender.homophily.first.old),
                                         nrow(gender.homophily.last.old)))),
         Age = unlist(mapply(rep, c("New", "Old"), 
                                each = c(sum(nrow(gender.homophily.all), 
                                             nrow(gender.homophily.first), 
                                             nrow(gender.homophily.last)),
                                         sum(nrow(gender.homophily.all.old), 
                                             nrow(gender.homophily.first.old), 
                                             nrow(gender.homophily.last.old)))))) %>%
  select(discipline, journal, Subset, Age, standardised.IF, n.useable.papers, n.authors, 
         gender.ratio.of.sample, observed.alpha, lower.95.CI, upper.95.CI, alpha.adjusted,
         lower.95.CI2, upper.95.CI2, two.tail.p.value, adjusted.p.value) %>%
  rename(unadjusted.p.value = two.tail.p.value)

Calculating gender homophily for each combination of author country and journal

One reason that we might see ‘spurious’ homophily is that the gender ratio of researchers varies across countries. For example, in our dataset, most Serbian authors are women and most Japanese authors are men. Because there are relatively few collaborations between people from these countries (relative to within-country collaborations), we will tend to see more mostly-male and mostly-female papers than expected, even if people assort randomly within countries. So, if we split the data by country of author affiliation and still see homophily, this suggests that variation across countries cannot fully explain our results.
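
To illustrate this point, here is a minimal simulation sketch with made-up parameters (not taken from our data): two hypothetical countries with different author gender ratios, coauthorship random with respect to gender within each country, and yet the pooled data show clear homophily.

# Illustrative only: two hypothetical countries with different gender ratios and purely random
# within-country coauthorship still produce a positive pooled alpha
set.seed(1)
n.papers <- 5000
country <- rep(c("A", "B"), each = n.papers / 2)
prob.male <- ifelse(country == "A", 0.8, 0.2)        # country-specific author gender ratios
n.authors <- sample(2:5, n.papers, replace = TRUE)   # authors per (single-country) paper
sim.males <- rbinom(n.papers, size = n.authors, prob = prob.male)
find.alpha(nMales = sim.males, nFemales = n.authors - sim.males)  # clearly positive, despite random coauthorship within each country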

The following code restricts our dataset to only include papers where we know the country of affiliation for all authors on the paper, and all authors are from the same country (this is much simpler than trying to account for multi-country papers). Then, for each journal-country combination where we have at least 50 papers, we calculate the coefficient of homophily just as before.

gender.grouping.test.by.journal.and.country <- function(){
  
  country.data <- pubmed_sqlite %>%            # Go to the big database
    select(pmid, date, gender.95, country) %>%   # Get the pubmed ID, date, gender, and author country columns
    filter(!is.na(country) & country != "NA") %>% # Throw out papers with no country
    collect(n=Inf) %>%                        # Bring the data into memory
    filter(!grepl("NA", country),             # Throw out papers with no country (again)
           !grepl("U", gender.95),            # Throw out papers where we don't know all author genders
           nchar(gender.95) > 1) %>%          # Throw out papers with only one author
    mutate(date = convert.dates(date)) %>%    # Convert the dates to years-before-present
    filter(date > -1) %>% select(-date)       # Only keep papers from August 2015 - August 2016
  unique.split.countries <- lapply(strsplit(country.data$country, 
                                            split = "_"), unique)
  to.include <- sapply(unique.split.countries, length) == 1
  country.data <- data.frame(pmid = country.data$pmid[to.include],
                             gender.95 = country.data$gender.95[to.include],
                             country = unlist(unique.split.countries[which(to.include)]))
  rm(list = c("unique.split.countries", "to.include")) # free up memory
  
  # Merge in information on the journal and discipline of all these suitable papers
  country.data <- country.data %>%          
    left_join(pubmed_sqlite %>% select(pmid, journal, discipline) %>%
                collect(n=Inf), by = "pmid") %>% 
    select(-pmid)
  
  # Restrict the dataset to only journal-country combinations 
  # for which we have at least 'minimum.n' papers
  country.data <- country.data %>% 
    group_by(journal, country) %>% 
    summarise(n.papers = n()) %>% 
    filter(n.papers >= minimum.n) %>% select(-n.papers) %>% 
    left_join(country.data, by = c("journal", "country")) %>%
    mutate(gender.95 = as.character(gender.95))
  
  country.data$nMales <- str_count(country.data$gender.95, "M")
  country.data$nFemales <- str_count(country.data$gender.95, "F")
  
  # First find all the combinations of country and journal
  combinations <- (country.data %>% group_by(journal, country) %>% 
                     summarise(n=n()))[,1:2] %>% as.data.frame()
  # Now loop over them, using multiple cores
  output <- do.call("rbind", mclapply(1:nrow(combinations), function(i){
    focal <- country.data %>% filter(country == combinations$country[i], 
                                     journal == combinations$journal[i])
    data.frame(country = combinations$country[i],
               journal = combinations$journal[i],
               gender.grouping.test.one.journal(focal$nMales, 
                                                focal$nFemales, 
                                                is.female = NULL, 
                                                "Doing one", 
                                                nBoots=1000, 
                                                nSimulations=1000)[,-1]
    )}))
  
  # Function to fix capitalisation of country names
  capitalise.countries <- function(countries){ 
    countries <- as.character(
      sapply(as.character(countries), function(x) {
               s <- strsplit(x, " ")[[1]]
               paste(toupper(substring(s, 1, 1)), substring(s, 2), sep="", collapse=" ")
             })) 
    str_replace(str_replace(countries, " And ", " and "), "Usa", "USA")
  }
  
  output$country <- capitalise.countries(output$country)
  output
}


if(!file.exists("data/country.data.csv")){ # Write the data, assuming it's not already been done
  country.data <- gender.grouping.test.by.journal.and.country()
  write.csv(country.data, file = "data/country.data.csv", row.names = FALSE)
}

country.data <- read.csv("data/country.data.csv", stringsAsFactors = FALSE) %>%
  mutate(p.adjusted = p.adjust(two.tail.p.value, method = "BH")) %>%
  left_join(journal.data %>% select(journal, discipline), by = "journal") 

Checking that adjusted and un-adjusted \(\alpha\) are very similar

fig_s7 <- master.dataset %>%
  ggplot(aes(x = observed.alpha, y = alpha.adjusted)) + 
  geom_abline(intercept = 0, slope = 1, colour = "darkgrey") + 
    geom_point(size=0.6, alpha=0.4) + 
  ylab("Adjusted measure of homophily (\u03B1')") + 
  xlab("Unadjusted measure of homophily (\u03B1)") + 
  theme_minimal()
ggsave(fig_s7, file = "figures/S7 Fig.pdf", height = 5, width = 5, device = cairo_pdf)
fig_s7


Figure S7: There is a very strong correlation between the values of \(\alpha\) and \(\alpha'\) calculated for each journal, though in a handful of cases the difference is considerable. The deviation between \(\alpha\) and \(\alpha'\) is greatest for journals for which there is a small sample size (see S8 Fig).

Un-adjusted \(\alpha\) is downwardly biased for small datasets

As expected, unadjusted \(\alpha\) is slightly downwardly biased for small datasets, because the expected value of \(\alpha\) under the null is less than zero. Thus, our adjusted \(\alpha'\) statistic, which is the observed \(\alpha\) minus the value of \(\alpha\) expected under the null, tends to be substantially higher than \(\alpha\) for small datasets. Our results do not differ qualitatively if we use the unadjusted \(\alpha\), but we have elected to present the results for \(\alpha'\) in light of the issues with unadjusted \(\alpha\) for small datasets.

fig_s8 <- master.dataset %>%
  filter(Age == "New") %>%
  ggplot(aes(x = log10(n.useable.papers), 
             y = alpha.adjusted - observed.alpha)) + 
  geom_point(alpha = 0.3, size = 0.5) + 
  facet_wrap(~Subset) +
  ylab("\u03B1' - \u03B1") + 
  xlab("Log10 sample size (number of papers)") + 
  theme_minimal()
ggsave(fig_s8, file = "figures/S8 Fig.pdf", height = 5, width = 5, device = cairo_pdf)


Figure S8: For journals for which we recovered a small number of papers (<100), the unadjusted metric \(\alpha\) was downwardly biased. This fits our expectations: because authors cannot be their own co-authors, small datasets will tend to produce negative estimates of \(\alpha\) even if authors assort randomly with respect to gender (see main text). This suggests that \(\alpha'\) is a more useful measure of homophily and heterophily, especially for small samples.

Results

Create S1 Data, which holds all the \(\alpha'\) estimates

S1 Data: This file gives the \(\alpha'\) values calculated for each journal, in the 2005 and 2015 samples, and for each type of author (all authors, first authors, and last authors). The table gives the impact factor of each journal, the sample size, \(\alpha\) and \(\alpha'\) and their 95% CIs, and the p-value from a 2-tailed test evaluating the null hypothesis that \(\alpha\) is zero (both raw and FDR-corrected p-values are shown).

# Save and export a neat and tidy version for the supplementary material
neat <- master.dataset
names(neat) <- gsub("[.]", " ", names(neat))
names(neat) <- gsub("Subset", "Author position", names(neat))
names(neat) <- paste(toupper(substring(names(neat), 1, 1)), substring(names(neat), 2), sep="")
names(neat) <- gsub("N", "n", names(neat))
names(neat) <- gsub("2", "", names(neat))
for(i in which(sapply(neat, is.numeric))) neat[,i] <- round(neat[,i], 3)
write.csv(neat, file = "supplement/S1 data.csv", row.names = FALSE)
rm(neat)

Create S2 Data, showing the number of papers with \(n\) authors per journal

S2 Data: This file gives the number and percentage of papers that have 1, 2, 3, 4, or \({\ge}5\) authors for each journal in the dataset of Holman et al. (2018) PLoS Biology. Note that the sample sizes include papers for which the gender of one or more authors was not determined by Holman et al.

if(!file.exists("supplement/S2 data.csv")){
  freq_multiauthor_by_journal <- pubmed_sqlite %>% 
  select(journal, gender.95) %>% 
  collect(n=Inf) %>% 
  group_by(journal) %>% 
  summarise(n1 = sum(nchar(gender.95) == 1), # Count the number of n author papers
            n2 = sum(nchar(gender.95) == 2), 
            n3 = sum(nchar(gender.95) == 3), 
            n4 = sum(nchar(gender.95) == 4), 
            `n5+` = sum(nchar(gender.95) >= 5)) %>%
  gather(author_number, count, n1, n2, n3, n4, `n5+`) %>%
  mutate(author_number = gsub("n", "", author_number)) %>%
  arrange(journal, author_number) %>% 
  left_join(journals_sqlite %>% 
              select(short.title, Discipline) %>% 
              collect(), by = c("journal" = "short.title")) %>%
  distinct() %>% 
  split(.$journal) %>% # Add a % column
  purrr::map(function(x) {
    x$percent <- 100 * x$count/sum(x$count)
    x}) %>% 
  do.call("rbind", .) %>%
  mutate(percent = round(percent, 1))
write.csv(freq_multiauthor_by_journal, file = "supplement/S2 data.csv", row.names = FALSE)
}

Create S3 Data, showing the number of papers with \(n\) authors per discipline

S3 Data: This file gives the number and percentage of papers that have 1, 2, 3, 4, or \({\ge}5\) authors for each discipline in the dataset of Holman et al. (2018) PLoS Biology. Note that the sample sizes include papers for which the gender of one or more authors was not determined by Holman et al.

freq_multiauthor_by_discipline <- read.csv("supplement/S2 data.csv", stringsAsFactors = FALSE) %>%
  group_by(Discipline, author_number) %>%
  summarise(count = sum(count)) %>% 
  split(.$Discipline) %>% 
  purrr::map(function(x) {
    x$percent <- 100 * x$count/sum(x$count)
    x}) %>% 
  do.call("rbind", .) %>%
  mutate(percent = round(percent, 1))
write.csv(freq_multiauthor_by_discipline, file = "supplement/S3 data.csv", row.names = FALSE)

Multi-author papers are common in most disciplines

This figure illustrates that single-author papers make up a small minority of the total for the majority of disciplines we examined. Indeed, for many disciplines, the majority of papers have at least five authors.

big.plot <- freq_multiauthor_by_discipline %>%
  ggplot(aes(author_number, percent, group = 1)) + 
  geom_line() + 
  scale_y_continuous(breaks = c(0, 40, 80)) +
  facet_wrap(~Discipline, ncol = 8, labeller = label_wrap_gen(width = 25)) +
  theme_bw() + 
  theme(strip.text = element_text(size = 7.2), panel.grid.major.x = element_blank()) + 
  xlab("Number of authors") + 
  ylab("% of papers")

ggsave(big.plot, file = "figures/S1 Fig.pdf", height = 11, width = 10)
big.plot


Figure S1: Plot showing the percentage of papers that have 1, 2, 3, 4, or \({\ge}5\) authors for each discipline in the dataset of Holman et al. (2018). This information can also be found in S3 Data.

Distribution of gender homophily and heterophily across journals

Most journals show gender homophily (i.e. \(\alpha' > 0\)) rather than heterophily, meaning that people co-publish with colleagues of the same gender more often than expected under random assortment.

figure2A <- master.dataset %>%
  filter(Age == "New") %>% 
  mutate(Significant = 1 * (adjusted.p.value < 0.05),
         Significant = replace(Significant, Significant == 0, "No"),
         Significant = replace(Significant, Significant == 1, "Yes")) 
n.sig.homo <- with(figure2A, sum(Significant[Subset == "All authors"] == "Yes" & 
                                   alpha.adjusted[Subset == "All authors"] > 0))
n.sig.hetero <- with(figure2A, sum(Significant[Subset == "All authors"] == "Yes" & 
                                     alpha.adjusted[Subset == "All authors"] < 0))
n.journals <- sum(figure2A$Subset == "All authors")

figure2A <- figure2A %>%
  ggplot(aes(x = alpha.adjusted, fill = Significant)) + 
  geom_vline(xintercept = 0, linetype = 2) +
  geom_area(binwidth = 0.01, colour = "black", stat = "bin") + 
  scale_fill_manual(values = c("skyblue", "white")) +
  xlab("Coefficient of homophily (\u03B1')") + 
  ylab("Number of journals") +
  facet_wrap(~Subset, scales = "free_y", ncol=1) + 
  theme_minimal(12) + 
  theme(legend.position = c(0.9, 0.96))
figure2A