
# API key option for the PLoS search service (presumably read by rplos);
# replace the placeholder before running.
options(PlosApiKey = "<insert your API key here!>")

# install_github("rplos", "ropensci")

library("rplos")
library("ggplot2")
# library() rather than require(): a missing package stops the script here
# instead of returning FALSE and failing later at the first dplyr call.
library("dplyr")



# Convert author strings to counts.
#
# `cell` is a single string in which author names are separated by ";".
# Returns the number of pieces obtained by splitting the string on ";".
countAuths <- function(cell) {
  length(unlist(strsplit(cell, ";")))
}

# Vectorised wrapper: takes a vector of author strings and returns one
# count per element.
countAuths <- Vectorize(countAuths)



# Query the PLoS API for up to `lim` papers per journal per year,
# count the number of authors and return a data.frame.
#
# Arguments:
#   j          journal key, matched against cross_published_journal_key
#   lim        maximum number of records requested per year
#   start.year first publication year to query
#   end.year   last publication year to query (defaults to 2013, the value
#              that used to be hard-coded)
#
# Returns the stacked per-year results with the added columns `counts`
# (authors per paper) and `journal`.
#
# NOTE(review): assumes searchplos() returns a data.frame with an `author`
# column, since results$author is read below -- confirm against the
# installed rplos version.
getAuths <- function(j, lim = 1000, start.year = 2006, end.year = 2013) {
  cat("Getting results for journal: ", j, "\n")
  # seem to be in reverse order by year?
  results <- lapply(start.year:end.year, function(i) {
    # Date range covering the whole calendar year `i`.
    date.range <- paste0("publication_date:[", i, "-01-01T00:00:00Z TO ",
                         i, "-12-31T23:59:59Z]")
    data.frame(
      year = i,
      auths = searchplos(
        q = date.range,
        fl = "author",
        fq = list("doc_type:full",
                  paste0("cross_published_journal_key:", j)),
        start = 0, limit = lim, sleep = 6
      )
    )
  })
  results <- do.call(rbind, results)
  results$counts <- countAuths(results$author)
  results$journal <- j
  results
}



# Fetch author counts for a subset of the PLoS journals and stack the
# per-journal results into a single data.frame.
journals <- journalnamekey()
# Elements 1-5 and 7 of the journal key vector are used; element 6 is skipped.
plos.all <- sapply(journals[c(1:5, 7)], getAuths, simplify = FALSE)
plos <- do.call(rbind, plos.all)



# Fig. 1: Bean plot showing distribution of author counts
# per journal overall.
# Violins (scaled to equal width) are overlaid with narrow notched boxplots
# and a white point at the median; the count axis is log10-scaled and the
# plot is flipped so journals run down the vertical axis.
ggplot(plos, aes(x = journal, y = counts, fill = journal)) +
  geom_violin(scale = "width") +
  geom_boxplot(width = .12, fill = I("black"), notch = TRUE,
               outlier.size = NA, col = "grey40") +
  stat_summary(fun.y = "median", geom = "point", shape = 20, col = "white") +
  scale_y_log10(breaks = c(1:5, seq(10, 50, by = 10), 100, 200, 300)) +
  coord_flip() +
  labs(x = "", y = "Number of authors per paper") +
  theme_classic() +
  theme(legend.position = "none") +
  scale_fill_brewer()



# Fig 2. ECDFs of the author count distributions
# 5in x 5in
# One empirical CDF curve per journal on a log10 x axis; coord_cartesian()
# limits the visible range to 1-300 authors.
ggplot(plos, aes(x = counts, col = journal)) +
  stat_ecdf(geom = "smooth", se = FALSE, size = 1.2) +
  theme_bw() +
  scale_x_log10(breaks = c(1:5, seq(10, 50, by = 10), 100, 200, 300)) +
  theme(legend.position = c(.75, .33)) +
  labs(x = "Number of authors per paper", y = "ECDF", col = "") +
  coord_cartesian(xlim = c(1, 300)) +
  scale_color_brewer(type = "qual", palette = 6)



# Fig 3. Trends in author counts over time with
# confidence limits on the means
# 7 x 7
# The ribbon is drawn from the "mean_cl_boot" summary of counts per
# year and journal; the line is the yearly mean.
ggplot(plos, aes(x = year, y = counts, col = journal, fill = journal)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "ribbon",
               width = .2, alpha = I(.5)) +
  stat_summary(fun.y = "mean", geom = "line") +
  # Labels are passed as named arguments rather than wrapped in a list().
  labs(x = "Year", y = "Mean number of authors per paper") +
  theme_bw() +
  theme(legend.position = c(.2, .85)) +
  scale_fill_brewer(type = "qual", palette = 2,
                    guide = guide_legend(direction = "vertical",
                                         label.position = "bottom",
                                         title = NULL, ncol = 2,
                                         label.hjust = 0.5)) +
  scale_color_brewer(type = "qual", palette = 2, guide = "none")





# from http://stackoverflow.com/a/17024184/1274516
# show regression equation on each graph facet
#
# Fits lm(counts ~ year) on `df` and returns a single character string
# holding a plotmath expression with the slope, the intercept and R^2,
# suitable for geom_text(..., parse = TRUE).
# `df` must contain the columns `counts` and `year`.
lm_eqn <- function(df) {
  m <- summary(lm(counts ~ year, df))
  eq <- substitute(
    ~~y ~ "=" ~ beta * x + i ~ (R^2 == r2),
    list(
      # Row 2 of the coefficient table is the slope for `year`,
      # row 1 is the intercept.
      beta = format(m$coefficients[2, "Estimate"], digits = 3),
      i = format(m$coefficients[1, "Estimate"], digits = 3),
      r2 = format(m$r.squared, digits = 2)
    )
  )
  as.character(as.expression(eq))
}



# Mean author count per journal and year. Plain nested calls are used
# instead of a chaining operator so this works on any dplyr version.
means <- summarise(group_by(plos, journal, year), counts = mean(counts))

# One regression label (see lm_eqn) per journal, collected into a
# data.frame used to annotate the facets of Fig 4.
b <- by(means, means$journal, lm_eqn)
df <- data.frame(beta = unclass(b), journal = names(b))
summary(lm(counts ~ year + journal, data = means))

# Highest yearly mean per journal. `means` is deliberately overwritten
# with this per-journal table because later code reads means$journal.
means <- summarise(group_by(means, journal), m = max(counts))
# Vertical position of each label: 20% above the journal's highest yearly
# mean, matched by journal name rather than by row position.
df$top <- means$m[match(df$journal, means$journal)] * 1.2



# Fig 4. Facetted linear regression of author inflation per journal
# 6 x 8.5
# Each facet shows the yearly mean with "mean_cl_boot" error bars, a
# linear fit from geom_smooth(), and the equation label stored in `df`
# (columns `beta`, `top`, `journal`).
ggplot(plos, aes(x = year, y = counts, col = journal, fill = journal)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "errorbar",
               width = .2, alpha = I(.5)) +
  stat_summary(fun.y = "mean", geom = "point") +
  #stat_summary(fun.y="median", geom="point", shape=4) +
  facet_wrap(~journal, scales = "free_y") +
  geom_smooth(method = "lm") +
  scale_x_continuous(breaks = 2006:2013) +
  # Labels are passed as named arguments rather than wrapped in a list().
  labs(x = "", y = "Mean number of authors per paper") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(type = "qual", palette = 2, guide = "none") +
  scale_color_brewer(type = "qual", palette = 2, guide = "none") +
  geom_text(data = df, aes(x = 2009.5, y = top, label = beta), size = 3,
            parse = TRUE)



# Overall estimate of author inflation?
# .21 extra authors per paper per year, on average
# Linear model of per-paper author counts on year with journal as a
# covariate; the summary is stored in `s` for inspection.
s <- summary(lm(counts ~ year + journal, data = plos))





# Summary barchart data: one row per journal with a hard-coded trend
# estimate, its standard error and an impact factor (IF).
# NOTE(review): the values are paired with journals purely by position in
# unique(means$journal); confirm the order still matches if the data is
# re-fetched or the journal selection changes.
bc <- data.frame(
  journal = unique(means$journal),
  trend = c(0.2490979, 0.1211823, 0.5201688,
            0.4088536, 0.05894102, 0.1828939),
  std.err = c(0.08224567, 0.02213142, 0.1493662,
              0.06361849, 0.03891493, 0.03798822),
  IF = c(12.690, 4.867, 8.517, 15.253, 3.730, 8.136)
)

# Order the journal factor by increasing trend so plots sort accordingly.
bc$journal <- factor(bc$journal, levels = bc$journal[order(bc$trend)])



# Fig 5. Barchart of author inflation estimate per journal.
# 7 x 5
# Bars show `trend`; error bars span trend -/+ one standard error.
ggplot(bc, aes(x = journal, y = trend, fill = journal,
               ymin = trend - std.err, ymax = trend + std.err)) +
  geom_bar(stat = "identity") +
  geom_errorbar(width = .2) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_classic() +
  labs(x = "",
       y = "Estimate of annual author inflation (additional mean authors per paper)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Blues", guide = "none")



# Correlation (cor() default method) between the trend estimates and the
# impact factors; shown as an annotation in Fig 6.
pcc <- cor(bc$trend, bc$IF)

# Fig 6. Correlation of author inflation and journal impact factors.
# 5 x 5
ggplot(bc, aes(x = trend, y = IF, col = journal)) +
  geom_text(aes(label = journal)) +
  xlim(0, .6) +
  labs(x = "Author inflation estimate",
       y = "Journal impact factor (2012)") +
  scale_color_brewer(type = "qual", palette = 2, guide = "none") +
  annotate("text", x = .05, y = 15,
           label = paste0("rho == ", format(pcc, digits = 2)), parse = TRUE)

# N.S. (p = 0.18)
cor.test(bc$trend, bc$IF)