Ring et al. 2017 AER plotting

Caroline Ring

2018-01-23

This vignette contains the code necessary to create the AER (and OED, and exposure) heatmaps contained in the paper.

First, let’s load some useful packages.

library('data.table')
library('gplots')
library('ggplot2')
library('httk')

The vignette about model evaluations for subpopulations produced data files for each subpopulation, containing Css percentiles for each chemical in the HTTK data set. As described in the paper, for each chemical, an oral equivalent dose (OED) can be computed using the 95th percentile Css and a ToxCast AC50. The OED is an estimate of the dose that would induce bioactivity. Then, this OED can be compared to an estimate of exposure for the same chemical. The ratio of OED to exposure is called the activity-exposure ratio, or AER. if the AER is 1 or less, then exposure to this chemical may be high enough to induce bioactivity. if the AER is much more than 1, then there probably isn’t enough exposure to this chemical to cause bioactivity. The AER is thus an estimate of risk.

Computing OEDs

The first step is to read in the Css percentile data. We’ll go ahead and do this for all 10 subpopulations.

#Set some basic parameters for which data set to use
poormetab <- TRUE
fup.censor <- TRUE
model <- '3compartmentss'
#List all the subpopulations
ExpoCast.groups <- c('Total',
                     'Age.6.11',
                     'Age.12.19',
                     'Age.20.65',
                     'Age.GT65',
                     'BMIgt30',
                     'BMIle30',
                     'Males',
                     'Females',
                     'ReproAgeFemale')
#Read in data from each subpop and bind it all together
#Use the direct-resampling data
dat <- rbindlist(lapply(ExpoCast.groups,
                        function(x) {
                          tmp <- readRDS(paste0('data/',
                                                paste('allchems',
                                                      x,
                                                      'dr',
                                                      'poormetab',
                                                      poormetab,
                                                      'fup.censor',
                                                      fup.censor,
                                                      model,
                                                      "FuptoFub",
                                                      sep='_'),
                                                '.Rdata'))
                          tmp[, ExpoCast.group:=x]
                          return(tmp)
                          }))

Get the compound names that correspond to each CAS.

chem.dt <- as.data.table(httk::get_cheminfo(info=c('CAS', 'Compound'),
                                            exclude.fub.zero=FALSE))
setnames(chem.dt, 'CAS', 'chemcas')
dat <- merge(dat, chem.dt, by='chemcas')

Next, read in the ToxCast AC50 values.

#Column names are Assay Endpoint, CASRN, Activity Call, Q, AC50, Emax, Log AC50,
#B, T, W, Data Type, Chemical Name.
#Replace names containing spaces with names without spaces.
setnames(tc.dt, 
         c('Assay Endpoint', 'AC 50', 'Chemical Name','Activity Call'),
         c('Assay.Endpoint', 'AC50', 'Chemical.Name', 'Activity.Call'))

Keep only the rows of ToxCast data where the chemical was judged “Active” in an assay. Note: this will remove some chemicals entirely (if they were “Inactive” in all assays).

#Keep only the rows with "Active" calls.
tc.dt.sub <- tc.dt[Activity.Call=="Active", 
                   .(Chemical.Name, CASRN, Assay.Endpoint, Activity.Call, AC50)]

We summarize the distribution of AC50 values across all assays for each chemical by taking several percentiles.

ac50pct <- tc.dt.sub[, 
                     as.list(quantile(AC50, 
                                      probs=c(0,0.05,0.1,
                                              0.25,0.5,0.75,
                                              0.9,0.95,1), 
                                      na.rm=TRUE)),
                     keyby=CASRN]

By default, the columns of ac50pct have names like “5%”, “10%”, etc. The “%” sign interferes with data.table’s syntax for referring to columns. So we replace names like “5%” with names like “AC50p5”, to indicate that the column contains AC50 values, percentile 5. To do this, use regular expressions.

pctnames <- grep(names(ac50pct), pattern='%',value=TRUE)
setnames(ac50pct,
         pctnames,
         gsub(x=pctnames, 
              pattern='(\\d{1,3})\\%', 
              replacement='AC50p\\1', 
              perl=TRUE) 
         )
#While we're at it, let's change the name of the CAS column to comport with its
#name in the Css data
setnames(ac50pct, 'CASRN', 'chemcas')

Now we are ready to compute OEDs. Let’s compute one OED corresponding to each ToxCast AC50, using the 95th percentile Css for all of them.

#merge in the AC50 percentile data
m.tmp <- merge(dat,ac50pct,by='chemcas')
#find column names beginning with "AC50"
#to use for naming the OED columns
ac50names <- grep(x=names(m.tmp),
                  pattern='^AC50', 
                  value=TRUE,
                  perl=TRUE)
#Compute OEDs
m.tmp[, (paste('oed', ac50names, sep='.')):=lapply(.SD,
                                                   function(x) x/m.tmp[, css95]),
      .SDcols=ac50names]

For another view of the data, let’s compute one OED corresponding to each Css percentile, using the 10th percentile AC50 value for all of them.

#merge in the AC50 percentile data
m.css <- merge(dat,ac50pct,by='chemcas')
#find column names beginning with "css"
#to use for naming the OED columns
cssnames <- grep(x=names(m.css),
                  pattern='^css', 
                  value=TRUE,
                  perl=TRUE)
#Compute OEDs
m.css[, (paste('oed', cssnames, sep='.')):=lapply(.SD,
                                                   function(x) m.css[, AC50p10]/x),
      .SDcols=cssnames]

Exposure data

Bringing in the NHANES exposure inference data, onlyp, let’s compute the “worst-case” AER: using oed.css95 (the OED based on the 95th percentile Css value and 10th percentile AC50) and exposure.median.95CI.upper (the upper bound of the 95% confidence interval on the median exposure estimate).

#Merge in the exposure data
m.tmp <- merge(m.tmp,onlyp,
               by=c("chemcas", "ExpoCast.group"))
m.css <- merge(m.css,onlyp,
               by=c("chemcas", "ExpoCast.group"))
#Compute AER
m.tmp[, aer:=oed.AC50p10/exposure.median.95CI.upper]
m.css[, aer:=oed.css95/exposure.median.95CI.upper]

Plotting OED vs. exposure boxplots

First, take care of some housekeeping. Replace a couple of very long chemical names with shorter names, for better display.

#Shorten a couple of compound names for display
m.tmp[Compound=="O-ethyl o-(p-nitrophenyl) phenylphosphonothioate",
      Compound:="Phosphonothioic acid"]
m.tmp[Compound=="4-(1,1,3,3-tetramethylbutyl)phenol",
      Compound:="p-tert-Octylphenol"]
m.css[Compound=="O-ethyl o-(p-nitrophenyl) phenylphosphonothioate",
      Compound:="Phosphonothioic acid"]
m.css[Compound=="4-(1,1,3,3-tetramethylbutyl)phenol",
      Compound:="p-tert-Octylphenol"]

For this plot, show only the total population.

#Plot only for the total population
m.Total <- m.tmp[ExpoCast.group=='Total',]
m.css.Total <- m.css[ExpoCast.group=='Total',]

By default, ggplot2 will plot the chemicals in alphabetical order. Instead, let’s order them from smallest to largest AER. To do this, we need to sort the data table, then create a chemical names factor variable, with levels given by the AER-sorted order.

#Order the chemicals by AER
setorder(m.Total, aer)
#Get a list of ordered chemical names so that ggplot2 will plot them in the 
#right order (as opposed to its default alphabetical order)
cpdlevels <- m.Total[, Compound]
m.Total[, Compound.factor:=factor(Compound, levels=cpdlevels)]
m.tmp[, Compound.factor:=factor(Compound, levels=cpdlevels)]

#Order the chemicals by AER
setorder(m.css.Total, aer)
#Get a list of ordered chemical names so that ggplot2 will plot them in the 
#right order (as opposed to its default alphabetical order)
cpdlevels <- m.css.Total[, Compound]
m.css.Total[, Compound.factor:=factor(Compound, levels=cpdlevels)]
m.css[, Compound.factor:=factor(Compound, levels=cpdlevels)]

Now, we’re ready to start the plot. First, make the OED boxes, ranging from 25th percentile AC50 to 75th percentile AC50, with a crossbar at median AC50.

#Start the plot: first, make the OED boxes, ranging between 25th and 75th 
#percentile AC50, with a crossbar at median AC50.
p <- ggplot(data=m.Total) +
  geom_crossbar(aes(x=Compound.factor,y=oed.AC50p50,
                    ymin=oed.AC50p25,ymax=oed.AC50p75))

Add the upper whisker (to OED for 90th percentile AC50) and the lower whisker (to OED for 10th percentile OED).

p <- p +
  geom_linerange(aes(x=Compound.factor,
                     ymin=oed.AC50p75,
                     ymax=oed.AC50p90))+
  geom_linerange(aes(x=Compound.factor,
                     ymin=oed.AC50p10,
                     ymax=oed.AC50p25))  

And add points above and below the box-and-whisker plots, to represent OEDs for 95th percentile AC50 values, and for 5th percentile AC50 values.

p <- p +
  geom_point(aes(x=Compound.factor, y=oed.AC50p95)) + 
  geom_point(aes(x=Compound.factor, y=oed.AC50p5))

Finally, add the exposure box plots, ranging between the upper and lower bounds on the 95% confidence interval for the median, with a crossbar at the median.

p <- p +
  geom_crossbar(aes(x=Compound.factor, y=exposure.median,
                    ymin=exposure.median.95CI.lower,
                    ymax=exposure.median.95CI.upper),
                color='#FC8D62') 

Set the OED/exposure axis (vertical axis) to a log scale, and do some tweaking of the plot labels to make things more readable.

p <- p +
  scale_y_log10()+
  theme_bw()+
  theme(axis.text.x = element_text(size = 12, angle = 60, 
                                   hjust = 1, colour = "grey50"),
        axis.ticks.x = element_line(size=0.01, color = 'grey50'),
        legend.title = element_text(size=rel(1.2)),
        legend.text = element_text(size=rel(1.2))) +
  labs(x='Compound',
       y='OED or Inferred Exposure, \n mg/kg/day')

Then save the plot.

ggsave(plot=p, filename=paste0('pdf_figures/',
                               paste('oed_exposure_plot',model,
                                     'poormetab', poormetab,
                                     'fupcensor', fup.censor,
                                     "FuptoFub",
                                     'Total', 'OEDdistoverAC50', 
                                     sep='_'),'.pdf'),
       width=14,height=8.5)
print(p)

Then let’s do it again for the OEDs computed for different Css percentiles. Remember that the OEDs are inversely proportional to Css.

p <- ggplot(data=m.css.Total) +
  geom_crossbar(aes(x=Compound.factor,y=oed.css50,
                    ymin=oed.css75,
                    ymax=oed.css25))+
  geom_linerange(aes(x=Compound.factor,
                     ymin=oed.css25,
                     ymax=oed.css10))+
  geom_linerange(aes(x=Compound.factor,
                     ymin=oed.css90,
                     ymax=oed.css75)) +
  geom_point(aes(x=Compound.factor, y=oed.css5)) + 
  geom_point(aes(x=Compound.factor, y=oed.css95))+
  geom_crossbar(aes(x=Compound.factor, y=exposure.median,
                    ymin=exposure.median.95CI.lower,
                    ymax=exposure.median.95CI.upper),
                color='#FC8D62')+
  scale_y_log10()+
  theme_bw()+
  theme(axis.text.x = element_text(size = 12, angle = 60, 
                                   hjust = 1, colour = "grey50"),
        axis.ticks.x = element_line(size=0.01, color = 'grey50'),
        legend.title = element_text(size=rel(1.2)),
        legend.text = element_text(size=rel(1.2))) +
  labs(x='Compound',
       y='OED or Inferred Exposure, \n mg/kg/day')
ggsave(plot=p, filename=paste0('pdf_figures/',
                               paste('oed_exposure_plot',model,
                                     'poormetab', poormetab,
                                     'fupcensor', fup.censor,
                                     "FuptoFub",
                                     'Total', 'OEDdistoverCss', 
                                     sep='_'),'.pdf'),
       width=14,height=8.5)
print(p)

Plotting heatmaps

Heatmaps will let us visualize the magnitude of the difference in AER, OED, and exposure, between each subpopulation and the total population, for each chemical.

Plotting AER heatmaps

To visualize the difference in AER from the total population for each group, it’s probably best to look at the order-of-magnitude differences. That is, if AER for the total population for a given chemical is 1e6, and AER for the same chemical for the Age.6.11 population is 1, then AER for Age.6.11 is 6 orders of magnitude less than AER for the total population.

To compute this numerically, compute the difference in the log10 AERs between each subpopulation and the total population. In the example above, log10 AER for Age.6.11 is log10(1) = 0, and log10 AER for the total population is log10(1e6) = 6. So the difference is 0 - 6 = -6.

#Compute difference in log AERs:
#log AER group - log AER Total
m.tmp[, logAER.diff:=log10(aer)-
        m.tmp[ExpoCast.group=='Total', log10(aer)],
      by=ExpoCast.group]
#if logAER.diff is positive, then log AER group > log AER Total
#if logAER.diff is negative, then log AER group < log AER Total

To create the heatmap, first cast the data table into matrix form.

#Cast into matrix
m.mat <- reshape2::acast(m.tmp[,
                               .(Compound.factor, ExpoCast.group, logAER.diff)],
                         Compound.factor~ExpoCast.group,
                         value.var='logAER.diff')

To visualize differences above and below some mean value, it’s usually best to use a diverging colormap, like those in ColorBrewer 2. Unfortunately, heatmap.2 — the function we’ll use to create the heatmap plots — doesn’t have a built-in way to properly interpolate the diverging color palettes provided by, e.g., RColorBrewer. So we have to set up a function to do the interpolation. This function is a minor adaptation of some code provided in a Stack Overflow answer by Josh O’Brien.

#Use a diverging color palette
#Based on http://stackoverflow.com/a/10986203 by Josh O'Brien
diverge.color <- function(data,pal_choice="RdBu",Thresh=0){
  #use 100 colors total, and divide the data into 100 bins
  #meaning that the "halfway" color is bin 50
  nHalf<-50 
  Min <- min(data,na.rm=TRUE)
  Max <- max(data,na.rm=TRUE)
  pal<-RColorBrewer::brewer.pal(n=11,pal_choice)
  #interpolate gradient between the first two colors
  rc1<-colorRampPalette(colors=c(pal[1],pal[2]), space="Lab")(10) 
  #interpolate gradient between each succeeding pair of colors
  for(i in 2:10){
    tmp<-colorRampPalette(colors=c(pal[i],pal[i+1]), space="Lab")(10) 
    rc1<-c(rc1,tmp)
    }
  #calculate the data breaks
  rb1 <- seq(Min, Thresh, length.out=nHalf+1)
  rb2 <- seq(Thresh, Max, length.out=nHalf+1)[-1]
  rampbreaks <- c(rb1, rb2)
  cuts <- classInt::classIntervals(data, style="fixed", fixedBreaks=rampbreaks)
  return(list(cuts=cuts,rc1=rc1))
  }

Then we use the diverge.color function to generate the appropriate colors and breaks for the AER data.

brks <- diverge.color(data=c(as.vector(m.mat), -0.8,0.8),pal_choice='RdBu')
breaksv <- brks$cuts[['brks']]

Then we generate a matrix of character strings so that we can label the single cell with missing data for the AER and exposure heatmaps.

notemat <- matrix(rep("", length(m.mat)), nrow=nrow(m.mat), ncol=ncol(m.mat))
notemat[which(is.na(m.mat), arr.ind=T)] <- 'NA'

For ease of use later, define a list of plotting options to use for all the heatmap.2 calls.

hm_args <- list(Rowv=FALSE, #don't reorder rows
                 dendrogram = 'column',
                 col=brks$rc1,
                 breaks=breaksv,
                 na.col = '#FFFF33',
                 trace='none',
                 margin=c(7, 8),
                 key.xlab=expression(paste(Delta, 'log(AER), Group - Total')),
                 key.ylab="Count",
                 key.title=NA,
                 srtCol=40,
                 adjCol=c(1,1),
                 denscol='black',
                 cellnote = notemat,
                 notecex=0.7,
                 notecol='black',
                 cexRow=0.8)

Finally, we actually plot the heatmap!

pdf(paste0('pdf_figures/',
           paste('deltaAERheatmap',
                 'model',model,
                 'poormetab',poormetab,
                 'fupcensor',fup.censor, 
                 "FuptoFub",
                 'test',
                 sep='_'), '.pdf'), pointsize=10)
hm1 <- do.call(what=heatmap.2,
               args=c(list(x=m.mat),
                      hm_args))
dev.off()
hm1 <- do.call(what=heatmap.2,
               args=c(list(x=m.mat),
                      hm_args))

We also want to plot a version with a colorbar to the left of the heatmap, illustrating the orders of magnitude of the AERs in the Total population.

m.Total[, aer.ordermag:=findInterval(log10(aer), 0:8)]
cols <- RColorBrewer::brewer.pal(name='Set3', n=9)
colvect <- cols[m.Total[,aer.ordermag+1]]
pdf(paste0('pdf_figures/',
           paste('deltaAERheatmap',
                 'model',model,
                 'poormetab',poormetab,
                 'fupcensor',fup.censor, 
                 "FuptoFub",
                 'bigger', 'annotated',
                 sep='_'), '.pdf'),
    pointsize = 10)
hm1.ann <- do.call(what=heatmap.2,
                   args=c(list(x=m.mat,
                               RowSideColors=colvect),
                          hm_args))
dev.off()
hm1.ann <- do.call(what=heatmap.2,
                   args=c(list(x=m.mat,
                               RowSideColors=colvect),
                          hm_args))

Plotting OED heatmap

We follow a similar procedure for the OED heatmap. First, compute differences in the log10 OEDs for each subpopulation from the total population.

#Likewise, compute differences in log OEDs
m.tmp[, logOED.diff:=log10(oed.AC50p10)-
        m.tmp[ExpoCast.group=='Total', log10(oed.AC50p10)],
      by=ExpoCast.group]

Then construct the matrix in the same way.

m.mat2 <- reshape2::acast(m.tmp[,
                                .(Compound.factor, ExpoCast.group, logOED.diff)],
                          Compound.factor~ExpoCast.group,
                          value.var='logOED.diff')

Generate the diverging colormap for the OED data.

brks <- diverge.color(data=c(as.vector(m.mat2), -0.8,0.8), pal_choice='RdBu')
breaksv <- brks$cuts[['brks']]

Put the subpopulations in the same order as for the AER heatmap.

m.mat2 <- m.mat2[, hm1$colInd]

And finally, plot the heatmap.

hm_args_oed <- hm_args
hm_args_oed$col <- brks$rc1
hm_args_oed$breaks <- breaksv
hm_args_oed$key.xlab<-expression(paste(Delta, 'log(OED), Group - Total'))
hm_args_oed$Colv <- FALSE
hm_args_oed$cellnote <- NULL
pdf(paste0('pdf_figures/',
           paste('deltaOEDheatmap',
                 'model',model,
                 'poormetab',poormetab,
                 'fupcensor',fup.censor,
                 "FuptoFub", 
                 sep='_'),
           '.pdf'), pointsize=10)
do.call(what=heatmap.2,
        args=c(list(x=m.mat2),
               hm_args_oed))
dev.off()
do.call(what=heatmap.2,
        args=c(list(x=m.mat2),
               hm_args_oed))

Exposure heatmap

The procedure is the same for the exposure data.

#And likewise, compute differences in log exposures
m.tmp[, logexposure.diff:=log10(exposure.median.95CI.upper)-
        m.tmp[ExpoCast.group=='Total', log10(exposure.median.95CI.upper)],
      by=ExpoCast.group]
#Cast into matrix
m.mat3 <- reshape2::acast(m.tmp[,
                      .(Compound.factor, ExpoCast.group, logexposure.diff)],
                Compound.factor~ExpoCast.group,
                value.var='logexposure.diff')
#Put columns in same order as for AER heatmap
m.mat3 <- m.mat3[,hm1$colInd]
#Make cell labeling matrix
notemat <- matrix(rep("", length(m.mat3)), nrow=nrow(m.mat3), ncol=ncol(m.mat3))
notemat[which(is.na(m.mat3), arr.ind=T)] <- 'NA'
#Set up diverging colormap
brks <- diverge.color(data=c(as.vector(m.mat3),-0.8,0.8), pal_choice='RdBu')
breaksv <- brks$cuts[['brks']]

hm_args_exp <- hm_args
hm_args_exp$col <- rev(brks$rc1)
hm_args_exp$breaks <- breaksv
hm_args_exp$key.xlab<-expression(paste(Delta, 'log(exposure), Group - Total'))
hm_args_exp$Colv <- FALSE
hm_args_exp$cellnote <- notemat
#And make the plot
pdf(paste0('pdf_figures/',
           paste('deltaexposureheatmap',
                 'model', model,
                 'poormetab', poormetab,
                 'fupcensor', fup.censor, 
                 "FuptoFub", 
                 sep='_'),
           '.pdf'), pointsize=10)
do.call(what=heatmap.2,
        args=c(list(x=m.mat3),
               hm_args_exp))
dev.off()
do.call(what=heatmap.2,
        args=c(list(x=m.mat3),
               hm_args_exp))

Finally, write the OED, exposure, and AER data in tabular form.

setorder(m.tmp, ExpoCast.group, aer)
write.table(m.tmp[, .(Compound, ExpoCast.group, css95, oed.AC50p10, exposure.median.95CI.upper, aer)], file="data/aer_data_FuptoFub.txt", row.names=FALSE)