\documentclass[9pt,lineno %, onehalfspacing ]{elife} \usepackage[T1]{fontenc} \usepackage[utf8]{inputenc} \usepackage[english]{babel} \usepackage[dvipsnames]{xcolor} \usepackage{doi} \usepackage{tikz} % to draw schematics \usetikzlibrary{decorations.pathreplacing,calligraphy} % for tikz curly braces \usepackage{todonotes} \definecolor{darkblue2}{HTML}{273B81} \definecolor{darkred2}{HTML}{D92102} % \documentclass[a4paper, 11pt]{article} % \usepackage[T1]{fontenc} % \usepackage[utf8]{inputenc} % \usepackage[english]{babel} % \usepackage{graphics} % \usepackage[dvipsnames]{xcolor} % \usepackage{amsmath, amssymb} % \usepackage{doi} % automatic doi-links % \usepackage[round]{natbib} % bibliography % \usepackage{booktabs} % nicer tables % \usepackage[title]{appendix} % better appendices % \usepackage[onehalfspacing]{setspace} % more space % \usepackage[labelfont=bf,font=small]{caption} % smaller captions % \usepackage{tikz} % to draw schematics % \usetikzlibrary{decorations.pathreplacing,calligraphy} % for tikz curly braces % \usepackage{todonotes} % %% margins % \usepackage{geometry} % \geometry{ % a4paper, % total={170mm,257mm}, % left=25mm, % right=25mm, % top=30mm, % bottom=25mm, % } % \title{\vspace{-4em} % \textbf{Meta-research:\\ % Replication of ``null results'' -- Absence of evidence or evidence of absence?}} % \author{{\bf Samuel Pawel\textsuperscript{*}, % Rachel Heyard\textsuperscript{*}, % Charlotte Micheloud, % Leonhard Held} \\ % * contributed equally \\ % Epidemiology, Biostatistics and Prevention Institute \\ % Center for Reproducible Science \\ % University of Zurich} % \date{\today} %don't forget to hard-code date when submitting to arXiv! 
% %% hyperref options
% \usepackage{hyperref}
% \hypersetup{
% unicode=true,
% bookmarksopen=true,
% breaklinks=true,
% colorlinks=true,
% linkcolor=blue,
% anchorcolor=black,
% citecolor=blue,
% urlcolor=black,
% }

\title{Meta-Research: Replication of ``null results'' -- Absence of evidence or evidence of absence?}

\author[1*\authfn{1}]{Samuel Pawel}
\author[1\authfn{1}]{Rachel Heyard}
\author[1]{Charlotte Micheloud}
\author[1]{Leonhard Held}
\affil[1]{Epidemiology, Biostatistics and Prevention Institute, Center for Reproducible Science, University of Zurich, Switzerland}
\corr{samuel.pawel@uzh.ch}{SP}
\contrib[\authfn{1}]{Contributed equally}

%% custom commands
\input{defs.tex}

\begin{document}
\maketitle

% %% Disclaimer that a preprint
% \vspace{-3em}
% \begin{center}
% {\color{red}This is a preprint which has not yet been peer reviewed.}
% \end{center}

<< "setup", include = FALSE >>=
## knitr options
library(knitr)
opts_chunk$set(fig.height = 4,
               echo = FALSE,
               warning = FALSE,
               message = FALSE,
               cache = FALSE,
               eval = TRUE)

## should sessionInfo be printed at the end?
Reproducibility <- TRUE

## packages
library(ggplot2) # plotting
library(dplyr) # data manipulation
library(reporttools) # reporting of p-values

## not show scientific notation for small numbers
options("scipen" = 10)

## the replication Bayes factor under normality
##   to, tr: original and replication effect estimates
##   so, sr: standard errors of the original and replication effect estimates
## Returns the density of the replication estimate tr under H0 (mean 0,
## standard deviation sr) divided by its density under H1 (mean to, standard
## deviation sqrt(so^2 + sr^2)). The numerator has to use the replication
## standard error sr because it is the density of tr, not of to.
BFr <- function(to, tr, so, sr) {
    bf <- dnorm(x = tr, mean = 0, sd = sr) /
        dnorm(x = tr, mean = to, sd = sqrt(so^2 + sr^2))
    return(bf)
}

## function to format Bayes factors
##   BF: a single Bayes factor (use the vectorized formatBF for vectors)
## Returns NA for NA input, "> 1000" / "< 1/1000" for Bayes factors beyond
## these bounds, the value rounded to two significant digits for BF > 1, and
## the ratio "1/x" (x rounded to two significant digits) otherwise; "1/1" is
## printed as "1".
formatBF. <- function(BF) {
    if (is.na(BF)) {
        BFform <- NA
    } else if (BF > 1) {
        if (BF > 1000) {
            BFform <- "> 1000"
        } else {
            BFform <- as.character(signif(BF, 2))
        }
    } else {
        if (BF < 1/1000) {
            BFform <- "< 1/1000"
        } else {
            BFform <- paste0("1/", signif(1/BF, 2))
        }
    }
    if (!is.na(BFform) && BFform == "1/1") {
        return("1")
    } else {
        return(BFform)
    }
}
## vectorized version of formatBF. (applied element-wise)
formatBF <- Vectorize(FUN = formatBF.)

## Bayes factor under normality with unit-information prior under alternative
##   estimate, se: effect estimate and its standard error
##   null: effect value under H0, also used as the prior mean under H1
##   unitvar: prior variance under H1
## Returns the density of the estimate under H0 divided by its density under H1
## (same mean, variance inflated by unitvar).
BF01 <- function(estimate, se, null = 0, unitvar = 4) {
    bf <- dnorm(x = estimate, mean = null, sd = se) /
        dnorm(x = estimate, mean = null, sd = sqrt(se^2 + unitvar))
    return(bf)
}
@

% %% Abstract
% %% -----------------------------------------------------------------------------
% \begin{center}
% \begin{minipage}{13cm} {\small
% \rule{\textwidth}{0.5pt} \\
% {\centering \textbf{Abstract} \\
% % \textit{Absence of evidence is not evidence of absence} -- the title of
% % the 1995 paper by Douglas Altman and Martin Bland has since become a
% % mantra in the statistical and medical literature. Yet the
% % misinterpretation of statistically non-significant results as evidence
% % for the absence of an effect is still common and further complicated in
% % the context of replication studies.
% In several large-scale replication
% projects, non-significant results in both the original and the
% replication study have been interpreted as a ``replication success''.
% Here we discuss the logical problems with this approach.
% Non-significance in both studies does not ensure that the studies
% provide evidence for the absence of an effect and
% % Because the null hypothesis of the statistical tests in both studies
% % is misaligned,
% ``replication success'' can virtually always be achieved if the sample
% sizes of the studies are small enough. In addition, the relevant error
% rates are not controlled. We show how methods, such as equivalence
% testing and Bayes factors, can be used to adequately quantify the
% evidence for the absence of an effect and how they can be applied in the
% replication setting. Using data from the Reproducibility Project: Cancer
% Biology we illustrate that most original and replication studies with
% ``null results'' are in fact inconclusive.
We conclude that it is % important to also replicate studies with statistically non-significant % results, but that they should be designed, analyzed, and interpreted % appropriately. % } \\ % \rule{\textwidth}{0.5pt} \emph{Keywords}: Bayesian hypothesis testing, % equivalence testing, meta-research, null hypothesis, replication success} % \end{minipage} % \end{center} \begin{abstract} In several large-scale replication projects, statistically non-significant results in both the original and the replication study have been interpreted as a ``replication success''. Here we discuss the logical problems with this approach. Non-significance in both studies does not ensure that the studies provide evidence for the absence of an effect and ``replication success'' can virtually always be achieved if the sample sizes of the studies are small enough. In addition, the relevant error rates are not controlled. We show how methods, such as equivalence testing and Bayes factors, can be used to adequately quantify the evidence for the absence of an effect and how they can be applied in the replication setting. Using data from the Reproducibility Project: Cancer Biology we illustrate that most original and replication studies with ``null results'' are in fact inconclusive. We conclude that it is important to also replicate studies with statistically non-significant results, but that they should be designed, analyzed, and interpreted appropriately. \end{abstract} % definition from RPCP: null effects - the original authors interpreted their % data as not showing evidence for a meaningful relationship or impact of an % intervention. \section{Introduction} \textit{Absence of evidence is not evidence of absence} -- the title of the 1995 paper by Douglas Altman and Martin Bland has since become a mantra in the statistical and medical literature \citep{Altman1995}. 
Yet, the misconception that a statistically non-significant result indicates evidence for the absence of an effect is unfortunately still widespread \citep{Makin2019}. Such a ``null result'' -- typically characterized by a $p$-value of $p > 0.05$ for the null hypothesis of an absent effect -- may also occur if an effect is actually present. For example, if the sample size of a study is chosen to detect an assumed effect with a power of 80\%, null results will incorrectly occur 20\% of the time when the assumed effect is actually present. Conversely, if the power of the study is lower, null results will occur more often. In general, the lower the power of a study, the greater the ambiguity of a null result. To put a null result in context, it is therefore critical to know whether the study was adequately powered and under what assumed effect the power was calculated \citep{Hoenig2001, Greenland2012}. However, if the goal of a study is to explicitly quantify the evidence for the absence of an effect, more appropriate methods designed for this task, such as equivalence testing \citep{Wellek2010} or Bayes factors \citep{Kass1995}, should be used from the outset. % two systematic reviews that I found which show that animal studies are very % much underpowered on average \citep{Jennions2003,Carneiro2018} The contextualization of null results becomes even more complicated in the setting of replication studies. In a replication study, researchers attempt to repeat an original study as closely as possible in order to assess whether similar results can be obtained with new data \citep{NSF2019}. There have been various large-scale replication projects in the biomedical and social sciences in the last decade \citep[among others]{Prinz2011,Begley2012,Klein2014,Opensc2015,Camerer2016,Camerer2018,Klein2018,Cova2018,Errington2021}. Most of these projects reported alarmingly low replicability rates across a broad spectrum of criteria for quantifying replicability. 
While most of these projects restricted their focus to original studies with statistically significant results (``positive results''), the \emph{Reproducibility Project: Psychology} \citep[RPP,][]{Opensc2015}, the \emph{Reproducibility Project: Experimental Philosophy} \citep[RPEP,][]{Cova2018}, and the \emph{Reproducibility Project: Cancer Biology} \citep[RPCB,][]{Errington2021} also attempted to replicate some original studies with null results. The RPP excluded the original null results from its overall assessment of replication success, but the RPCB and the RPEP explicitly defined null results in both the original and the replication study as a criterion for ``replication success''. There are several logical problems with this ``non-significance'' criterion. First, if the original study had low statistical power, a non-significant result is highly inconclusive and does not provide evidence for the absence of an effect. It is then unclear what exactly the goal of the replication should be -- to replicate the inconclusiveness of the original result? On the other hand, if the original study was adequately powered, a non-significant result may indeed provide some evidence for the absence of an effect when analyzed with appropriate methods, so that the goal of the replication is clearer. However, the criterion does not distinguish between these two cases. Second, with this criterion researchers can virtually always achieve replication success by conducting two studies with very small sample sizes, such that the $p$-values are non-significant and the results are inconclusive. This is because the null hypothesis under which the $p$-values are computed is misaligned with the goal of inference, which is to quantify the evidence for the absence of an effect. We will discuss methods that are better aligned with this inferential goal. % in Section~\ref{sec:methods}.
Third, the criterion does not control the error of falsely claiming the absence of an effect at some predetermined rate. This is in contrast to the standard replication success criterion of requiring significance from both studies \citep[also known as the two-trials rule, see chapter 12.2.8 in][]{Senn2008}, which ensures that the error of falsely claiming the presence of an effect is controlled at a rate equal to the squared significance level (for example, $5\% \times 5\% = 0.25\%$ for a $5\%$ significance level). The non-significance criterion may be intended to complement the two-trials rule for null results, but it fails to do so in this respect, which may be important to regulators, funders, and researchers. We will now demonstrate these issues and potential solutions using the null results from the RPCB.

\section{Null results from the Reproducibility Project: Cancer Biology}
\label{sec:rpcb}

<< "data" >>=
## data
rpcbRaw <- read.csv(file = "../data/prepped_outcome_level_data.csv")
rpcb <- rpcbRaw %>%
    select(
        osf = OSF.project.link,
        paper = pID,
        experiment = eID,
        effect = oID,
        internalReplication = internalID,
        effectType = Effect.size.type,
        ## effect sizes, standard errors, p-values on original scale
        ESo = Original.effect.size,
        seESo = Original.standard.error,
        lowerESo = Original.lower.CI,
        upperESo = Original.upper.CI,
        po = origPval,
        ESr = Replication.effect.size,
        seESr = Replication.standard.error,
        lowerESr = Replication.lower.CI,
        upperESr = Replication.upper.CI,
        pr = repPval,
        ## effect sizes and standard errors on SMD scale
        smdo = origES3,
        so = origSE3,
        lowero = origESLo3,
        uppero = origESHi3,
        smdr = repES3,
        sr = repSE3,
        ## Original and replication sample size
        ## (not consistent whether group or full sample size)
        no = origN,
        nr = repN) %>%
    mutate(
        ## define identifier for effect
        id = paste0("(", paper, ", ", experiment, ", ", effect, ", ", internalReplication, ")"),
        ## recompute one-sided p-values based on normality
        ## (in direction of original effect estimate)
        zo = smdo/so,
        zr = smdr/sr,
        po1 = pnorm(q = abs(zo), lower.tail = FALSE),
        ## flip the replication z-value by the sign of the original one so that
        ## a replication effect in the opposite direction yields a large
        ## p-value; pnorm() does not vectorize over lower.tail, so the tail
        ## cannot be selected row by row through that argument
        pr1 = pnorm(q = sign(zo)*zr, lower.tail = FALSE),
        ## compute some other quantities
        c = so^2/sr^2, # variance ratio
        d = smdr/smdo, # relative effect size
        po2 = 2*(1 - pnorm(q = abs(zo))), # two-sided original p-value
        pr2 = 2*(1 - pnorm(q = abs(zr))), # two-sided replication p-value
        sm = 1/sqrt(1/so^2 + 1/sr^2), # standard error of fixed effect estimate
        smdm = (smdo/so^2 + smdr/sr^2)*sm^2, # fixed effect estimate
        pm2 = 2*(1 - pnorm(q = abs(smdm/sm))), # two-sided fixed effect p-value
        Q = (smdo - smdr)^2/(so^2 + sr^2), # Q-statistic
        pQ = pchisq(q = Q, df = 1, lower.tail = FALSE), # p-value from Q-test
        BFr = BFr(to = smdo, tr = smdr, so = so, sr = sr), # replication BF
        BFrformat = formatBF(BF = BFr),
        BForig = BF01(estimate = smdo, se = so), # unit-information BF for original
        BForigformat = formatBF(BF = BForig),
        BFrep = BF01(estimate = smdr, se = sr), # unit-information BF for replication
        BFrepformat = formatBF(BF = BFrep)
    )

# TODO identify correct "null" findings as in paper
rpcbNull <- rpcb %>%
    ## filter(po1 > 0.025) #?
    filter(po > 0.05) #?

## ## check whether 10/20 = 50% of the original "null" results were also "null" in
## ## the replication (table 1 in Errington, 2021)
## rpcbNull %>%
##     mutate(success = sign(smdo) == sign(smdr) & pr >= 0.05) %>%
##     summarise(sum(success))
## ### noooo :)

## check the sample sizes
## paper 5 (https://osf.io/q96yj) - 1 Cohen's d - sample size correspond to forest plot
## paper 9 (https://osf.io/yhq4n) - 3 Cohen's w- sample size do not correspond at all
## paper 15 (https://osf.io/ytrx5) - 1 r - sample size correspond to forest plot
## paper 19 (https://osf.io/465r3) - 2 Cohen's dz - sample size correspond to forest plot
## paper 20 (https://osf.io/acg8s) - 1 r and 1 Cliff's delta - sample size correspond to forest plot
## paper 21 (https://osf.io/ycq5g) - 1 Cohen's d - sample size correspond to forest plot
## paper 24 (https://osf.io/pcuhs) - 2 Cohen's d - sample size correspond to forest plot
## paper 28 (https://osf.io/gb7sr/) - 3 Cohen's d - sample size correspond to forest plot
## paper 29 (https://osf.io/8acw4) - 1 Cohen's d - sample size do not correspond, seem to be double
## paper 41 (https://osf.io/qnpxv) - 1 Hazard ratio - sample size correspond to forest plot
## paper 47 (https://osf.io/jhp8z) - 2 r - sample size correspond to forest plot
## paper 48 (https://osf.io/zewrd) - 1 r - sample size do not correspond to forest plot for original study
@

Figure~\ref{fig:2examples} shows standardized mean difference effect estimates with confidence intervals from two RPCB study pairs. Both are ``null results'' and meet the non-significance criterion for replication success (the two-sided $p$-values are greater than 0.05 in both the original and the replication study), but intuition would suggest that these two pairs are very much different.
\begin{figure}[ht]
<< "2-example-studies", fig.height = 3.25 >>=
## some evidence for absence of effect https://doi.org/10.7554/eLife.45120 I
## can't find the replication effect like reported in the data set :( let's take
## it at face value we are not data detectives
## https://iiif.elifesciences.org/lax/45120%2Felife-45120-fig4-v1.tif/full/1500,/0/default.jpg
study1 <- "(20, 1, 1, 1)"
## absence of evidence
study2 <- "(29, 2, 2, 1)"
## https://iiif.elifesciences.org/lax/25306%2Felife-25306-fig5-v2.tif/full/1500,/0/default.jpg
plotDF1 <- rpcbNull %>%
    filter(id %in% c(study1, study2)) %>%
    mutate(label = ifelse(id == study1,
                          "Goetz et al. (2011)\nEvidence of absence",
                          "Dawson et al. (2011)\nAbsence of evidence"))
## ## RH: this data is really a mess. turns out for Dawson n represents the group
## ## size (n = 6 in https://osf.io/8acw4) while in Goetz it is the sample size of
## ## the whole experiment (n = 34 and 61 in https://osf.io/acg8s). in study 2 the
## ## so multiply by 2 to have the total sample size, see Figure 5A
## ## https://doi.org/10.7554/eLife.25306.012
## plotDF1$no[plotDF1$id == study2] <- plotDF1$no[plotDF1$id == study2]*2
## plotDF1$nr[plotDF1$id == study2] <- plotDF1$nr[plotDF1$id == study2]*2

## create plot showing two example study pairs with null results
conflevel <- 0.95 # also printed in the figure caption via \Sexpr
## standard normal quantile for the confidence intervals, computed once and
## reused for all interval limits below
za <- qnorm(p = (1 + conflevel)/2)
ggplot(data = plotDF1) +
    facet_wrap(~ label) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.3) +
    ## original and replication effect estimates with confidence intervals
    geom_pointrange(aes(x = "Original", y = smdo,
                        ymin = smdo - za*so,
                        ymax = smdo + za*so), fatten = 3) +
    geom_pointrange(aes(x = "Replication", y = smdr,
                        ymin = smdr - za*sr,
                        ymax = smdr + za*sr), fatten = 3) +
    ## sample sizes
    geom_text(aes(x = 1.05, y = 2.5,
                  label = paste("italic(n) ==", no)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 2.5,
                  label = paste("italic(n) ==", nr)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    ## two-sided p-values
    geom_text(aes(x = 1.05, y = 3,
                  label = paste("italic(p) ==", formatPval(po))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 3,
                  label = paste("italic(p) ==", formatPval(pr))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major.x = element_blank(),
          strip.text = element_text(size = 12, margin = margin(4), vjust = 1.5),
          strip.background = element_rect(fill = alpha("tan", 0.4)),
          axis.text = element_text(size = 12))
@
\caption{\label{fig:2examples} Two examples of original and replication study pairs which meet the non-significance replication success criterion from the Reproducibility Project: Cancer Biology \citep{Errington2021}. Shown are standardized mean difference effect estimates with \Sexpr{round(conflevel*100, 2)}\% confidence intervals, sample sizes, and two-sided $p$-values for the null hypothesis that the standardized mean difference is zero.}
\end{figure}

The original study from \citet{Dawson2011} and its replication both show large effect estimates in magnitude, but due to the small sample sizes, the uncertainty of these estimates is very large, too. If the sample sizes of the studies were larger and the point estimates remained the same, intuitively both studies would provide evidence for a non-zero effect. However, with the sample sizes that were actually used, the results seem inconclusive. In contrast, the effect estimates from \citet{Goetz2011} and its replication are much smaller in magnitude and their uncertainty is also smaller because the studies used larger sample sizes. Intuitively, these studies seem to provide some evidence for a zero (or negligibly small) effect. While these two examples show the qualitative difference between absence of evidence and evidence of absence, we will now discuss how the two can be quantitatively distinguished.
\section{Methods for assessing replicability of null results}
\label{sec:methods}
There are both frequentist and Bayesian methods that can be used for assessing evidence for the absence of an effect. \citet{Anderson2016} provide an excellent summary of both approaches in the context of replication studies in psychology. We now briefly discuss two possible approaches -- frequentist equivalence testing and Bayesian hypothesis testing -- and their application to the RPCB data.
\subsection{Equivalence testing}
Equivalence testing was developed in the context of clinical trials to assess whether a new treatment -- typically cheaper or with fewer side effects than the established treatment -- is practically equivalent to the established treatment \citep{Westlake1972,Schuirmann1987}. The method can also be used to assess whether an effect is practically equivalent to the value of an absent effect, usually zero. Using equivalence testing as a remedy for non-significant results has been suggested by several authors \citep{Hauck1986, Campbell2018}. The main challenge is to specify the margin $\Delta > 0$ that defines an equivalence range $[-\Delta, +\Delta]$ in which an effect is considered as absent for practical purposes. The goal is then to reject the
% composite %% maybe too technical?
null hypothesis that the true effect is outside the equivalence range. This is in contrast to the usual null hypothesis of a superiority test which states that the effect is zero or smaller than zero, see Figure~\ref{fig:hypotheses} for an illustration.
\begin{figure} \begin{center} \begin{tikzpicture}[ultra thick] \draw[stealth-stealth] (0,0) -- (6,0); \node[text width=4.5cm, align=center] at (3,-1) {Effect size}; \draw (2,0.2) -- (2,-0.2) node[below]{$-\Delta$}; \draw (3,0.2) -- (3,-0.2) node[below]{$0$}; \draw (4,0.2) -- (4,-0.2) node[below]{$+\Delta$}; \node[text width=5cm, align=left] at (0,1.5) {\textbf{Equivalence}}; \draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt}] (2.05,1) -- (3.95,1) node[midway,yshift=1.5em]{\textcolor{darkred2}{$H_1$}}; \draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}] (0,1) -- (1.95,1) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}}; \draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}] (4.05,1) -- (6,1) node[pos=0.4,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}}; \node[text width=5cm, align=left] at (0,3.5) {\textbf{Superiority \\(two-sided)}}; \draw [decorate,decoration={brace,amplitude=5pt}] (3,3) -- (3,3) node[midway,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}}; \draw[darkblue2] (3,2.9) -- (3,3.2); \draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}] (0,3) -- (2.95,3) node[pos=0.6,yshift=1.5em]{\textcolor{darkred2}{$H_1$}}; \draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}] (3.05,3) -- (6,3) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}}; \node[text width=5cm, align=left] at (0,5.5) {\textbf{Superiority \\ (one-sided)}}; \draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}] (3.05,5) -- (6,5) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}}; \draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}] (0,5) -- (3,5) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}}; \draw [dashed] (2,0) -- (2,1); \draw [dashed] (4,0) -- (4,1); \draw [dashed] (3,0) -- (3,1); \draw [dashed] (3,1.9) -- (3,2.8); \draw [dashed] (3,3.9) -- (3,5); \end{tikzpicture} \end{center} \caption{Null hypothesis ($H_0$) and 
alternative hypothesis ($H_1$) for different study designs with equivalence margin $\Delta$.}
\label{fig:hypotheses}
\end{figure}

To ensure that the null hypothesis is falsely rejected at most $\alpha \times 100\%$ of the time, one either rejects it if the $(1-2\alpha)\times 100\%$ confidence interval for the effect is contained within the equivalence range (for example, a 90\% confidence interval for $\alpha = 5\%$), or if two one-sided tests (TOST) for the effect being smaller/greater than $+\Delta$ and $-\Delta$ are significant at level $\alpha$, respectively. A quantitative measure of evidence for the absence of an effect is then given by the maximum of the two one-sided $p$-values (the TOST $p$-value).

\begin{figure}
\begin{fullwidth}
<< "plot-null-findings-rpcb", fig.height = 8.25, out.width = "0.95\\linewidth" >>=
## NOTE: the LaTeX length is passed via out.width (width of the included
## graphic); fig.width is the numeric device width in inches and cannot take a
## LaTeX length

## compute TOST p-values
## (margin and conflevel are also printed in the caption and text via \Sexpr)
margin <- 1
conflevel <- 0.9
## TOST p-value = maximum of the one-sided p-values against +margin and -margin
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = margin, sd = so,
                                             lower.tail = TRUE),
                                       pnorm(q = smdo, mean = -margin, sd = so,
                                             lower.tail = FALSE)))
rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = margin, sd = sr,
                                             lower.tail = TRUE),
                                       pnorm(q = smdr, mean = -margin, sd = sr,
                                             lower.tail = FALSE)))
## highlight the studies from Goetz and Dawson
rpcbNull$id <- ifelse(rpcbNull$id == "(20, 1, 1, 1)",
                      "(20, 1, 1, 1) - Goetz et al. (2011)", rpcbNull$id)
rpcbNull$id <- ifelse(rpcbNull$id == "(29, 2, 2, 1)",
                      "(29, 2, 2, 1) - Dawson et al. (2011)", rpcbNull$id)

## create plots of all study pairs with null results in original study
ggplot(data = rpcbNull) +
    facet_wrap(~ id, scales = "free", ncol = 4) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.25) +
    ## equivalence margin
    geom_hline(yintercept = c(-margin, margin), lty = 3, col = 2, alpha = 0.9) +
    geom_pointrange(aes(x = "Original", y = smdo,
                        ymin = smdo - qnorm(p = (1 + conflevel)/2)*so,
                        ymax = smdo + qnorm(p = (1 + conflevel)/2)*so),
                    size = 0.25, fatten = 2) +
    geom_pointrange(aes(x = "Replication", y = smdr,
                        ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr,
                        ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr),
                    size = 0.25, fatten = 2) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    ## TOST p-values; the "==" is dropped when the formatted p-value already
    ## carries a "<" so that the label remains a valid plotmath expression
    geom_text(aes(x = 0.46, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                  label = paste("italic(p)['TOST']",
                                ifelse(ptosto < 0.0001, "", "=="),
                                formatPval(ptosto))),
              col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
              vjust = 0.5) +
    geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                  label = paste("italic(p)['TOST']",
                                ifelse(ptostr < 0.0001, "", "=="),
                                formatPval(ptostr))),
              col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
              vjust = 0.5) +
    ## Bayes factors; formatBF() returns "> 1000" or "< 1/1000" beyond these
    ## bounds, so the "==" is dropped in exactly those two cases
    geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                  label = paste("BF['01']",
                                ifelse(BForig > 1000 | BForig < 1/1000, "", "=="),
                                BForigformat)),
              col = "darkblue", parse = TRUE, size = 2.3, vjust = 1.7,
              hjust = 0) +
    geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                  label = paste("BF['01']",
                                ifelse(BFrep > 1000 | BFrep < 1/1000, "", "=="),
                                BFrepformat)),
              col = "darkblue", parse = TRUE, size = 2.3, vjust = 1.7,
              hjust = 0) +
    ## sample sizes
    geom_text(aes(x = 1.05, y = pmin(-0.9*margin, smdr - 1.6*sr, smdo - 1.6*so),
                  label = paste("italic(n) ==", no)), col = "darkblue",
              parse = TRUE, size = 2.3, hjust = 0) +
    geom_text(aes(x = 2.05, y = pmin(-0.9*margin, smdr - 1.6*sr, smdo - 1.6*so),
                  label = paste("italic(n) ==", nr)), col = "darkblue",
              parse = TRUE, size = 2.3, hjust = 0) +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          strip.text = element_text(size = 6.4, margin = margin(3), vjust = 2),
          strip.background = element_rect(fill = alpha("tan", 0.4)),
          axis.text = element_text(size = 8))
@
\caption{Standardized mean difference (SMD) effect estimates with \Sexpr{round(conflevel*100, 2)}\% confidence intervals for the ``null results'' (those with original two-sided $p$-value $p > 0.05$) and their replication studies from the Reproducibility Project: Cancer Biology \citep{Errington2021}. The identifier above each plot indicates (Original paper number, Experiment number, Effect number, Internal replication number). The two examples from Figure~\ref{fig:2examples} are indicated in the plot titles. The dashed grey line depicts the value of no effect ($\text{SMD} = 0$) whereas the dotted red lines depict the equivalence range with margin $\Delta = \Sexpr{margin}$. The $p$-values $p_{\text{TOST}}$ are the maximum of the two one-sided $p$-values for the effect being smaller or greater than $+\Delta$ or $-\Delta$, respectively. The Bayes factors $\BF_{01}$ quantify evidence for the null hypothesis $H_{0} \colon \text{SMD} = 0$ against the alternative $H_{1} \colon \text{SMD} \neq 0$ with normal unit-information prior assigned to the SMD under $H_{1}$.}
\label{fig:nullfindings}
\end{fullwidth}
\end{figure}

Returning to the RPCB data, Figure~\ref{fig:nullfindings} shows the standardized mean difference effect estimates with \Sexpr{round(conflevel*100, 2)}\% confidence intervals for the 20 study pairs with quantitative null results in the original study ($p > 0.05$). The dotted red lines represent an equivalence range for the margin $\Delta = \Sexpr{margin}$, for which the shown TOST $p$-values are computed.
This margin is rather lax compared to the margins typically used in clinical research; we chose it primarily for illustrative purposes and because effect sizes in preclinical research are typically much larger than in clinical research. In practice, the margin should be determined on a case-by-case basis by researchers who are familiar with the subject matter. However, even with this generous margin, only four of the twenty study pairs -- one of them being the previously discussed example from \citet{Goetz2011} -- are able to establish equivalence at the 5\% level in the sense that both the original and the replication 90\% confidence interval fall within the equivalence range (or equivalently that their TOST $p$-values are smaller than $0.05$). For the remaining 16 studies -- for instance, the previously discussed example from \citet{Dawson2011} -- the situation remains inconclusive and there is neither evidence for the absence nor the presence of the effect. \subsection{Bayesian hypothesis testing} The distinction between absence of evidence and evidence of absence is naturally built into the Bayesian approach to hypothesis testing. A central measure of evidence is the Bayes factor \citep{Kass1995}, which is the updating factor of the prior odds to the posterior odds of the null hypothesis $H_{0}$ versus the alternative hypothesis $H_{1}$ \begin{align*} \underbrace{\frac{\Pr(H_{0} \given \mathrm{data})}{\Pr(H_{1} \given \mathrm{data})}}_{\mathrm{Posterior~odds}} = \underbrace{\frac{\Pr(H_{0})}{\Pr(H_{1})}}_{\mathrm{Prior~odds}} \times \underbrace{\frac{p(\mathrm{data} \given H_{0})}{p(\mathrm{data} \given H_{1})}}_{\mathrm{Bayes~factor}~\BF_{01}}. \end{align*} The Bayes factor quantifies how much the observed data have increased or decreased the probability of the null hypothesis $H_{0}$ relative to the alternative $H_{1}$. 
If the null hypothesis states the absence of an effect, a Bayes factor greater than one (\mbox{$\BF_{01} > 1$}) indicates evidence for the absence of the effect and a Bayes factor smaller than one indicates evidence for the presence of the effect (\mbox{$\BF_{01} < 1$}), whereas a Bayes factor not much different from one indicates absence of evidence for either hypothesis (\mbox{$\BF_{01} \approx 1$}). When the observed data are dichotomized into positive (\mbox{$p < 0.05$}) or null results (\mbox{$p > 0.05$}), the Bayes factor based on a null result is the probability of observing \mbox{$p > 0.05$} when the effect is indeed absent (which is $95\%$) divided by the probability of observing $p > 0.05$ when the effect is indeed present (which is one minus the power of the study). For example, if the power is 90\%, we have \mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten times more evidence for the absence of the effect than for its presence. On the other hand, if the power is only 50\%, we have \mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only slightly more evidence for the absence of the effect. This example also highlights the main challenge with Bayes factors -- the specification of the alternative hypothesis $H_{1}$. The assumed effect under $H_{1}$ is directly related to the power of the study, and researchers who assume different effects under $H_{1}$ will end up with different Bayes factors. Instead of specifying a single effect, one therefore typically specifies a ``prior distribution'' of plausible effects. Importantly, the prior distribution, like the equivalence margin, should be determined by researchers with subject knowledge and before the data are observed. In practice, the observed data should not be dichotomized into positive or null results, as this leads to a loss of information. 
Therefore, to compute the Bayes factors for the RPCB null results, we used the observed effect estimates as the data and assumed a normal sampling distribution for them, as in a meta-analysis. The Bayes factors $\BF_{01}$ shown in Figure~\ref{fig:nullfindings} then quantify the evidence for the null hypothesis of no effect ($H_{0} \colon \text{SMD} = 0$) against the alternative hypothesis that there is an effect ($H_{1} \colon \text{SMD} \neq 0$) using a normal ``unit-information'' prior distribution \citep{Kass1995b} for the effect size under the alternative $H_{1}$. There are several more advanced prior distributions that could be used here, and they should ideally be specified for each effect individually based on domain knowledge. The normal unit-information prior (with a standard deviation of 2 for SMDs) is only a reasonable default choice, as it implies that small to large effects are plausible under the alternative. We see that in most cases there is no substantial evidence for either the absence or the presence of an effect, as with the equivalence tests. The Bayes factors for the two previously discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent with our intuitions -- there is indeed some evidence for the absence of an effect in \citet{Goetz2011}, while there is even slightly more evidence for the presence of an effect in \citet{Dawson2011}, though the Bayes factor is very close to one due to the small sample sizes. With a lenient Bayes factor threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect, only one of the twenty study pairs meets this criterion in both the original and replication study.
<< >>=
## Pull out the study pair with identifier (48, 2, 4, 1); its original and
## replication sample sizes are reported inline via \Sexpr{} in the text below.
studyInteresting <- rpcbNull %>%
    filter(id == "(48, 2, 4, 1)")
noInteresting <- studyInteresting[["no"]]
nrInteresting <- studyInteresting[["nr"]]
## Write the null-result data to a CSV file in the working directory.
write.csv(x = rpcbNull, file = "rpcb-Null.csv", row.names = FALSE)
@
Among the twenty RPCB null results, there is one interesting case (the rightmost plot in the fourth row (48, 2, 4, 1)) where the Bayes factor is qualitatively different from the equivalence test, revealing a fundamental difference between the two approaches.
The Bayes factor is concerned with testing whether the effect is \emph{exactly zero}, whereas the equivalence test is concerned with whether the effect is within an \emph{interval around zero}.
Due to the very large sample size in the original study ($n = \Sexpr{noInteresting}$) and the replication ($n = \Sexpr{nrInteresting}$), the data are incompatible with an exactly zero effect, but compatible with effects within the equivalence range.
Apart from this example, however, the approaches lead to the same qualitative conclusion -- most RPCB null results are highly ambiguous.
\section{Conclusions}
We showed that in most of the RPCB studies with ``null results'' (those with $p > 0.05$), neither the original nor the replication study provided conclusive evidence for the presence or absence of an effect.
It seems logically questionable to declare an inconclusive replication of an inconclusive original study as a replication success.
While it is important to replicate original studies with null results, our analysis highlights that they should be analyzed and interpreted appropriately.
For both the equivalence testing and the Bayes factor approach, it is critical that the parameters of the procedure (the equivalence margin and the prior distribution) are specified independently of the data, ideally before the studies are conducted.
Typically, however, the original studies were designed to find evidence for the presence of an effect, and the goal of replicating the ``null result'' was formulated only after failure to do so. \citet{Campbell2021} discuss various approaches to post-hoc specification of equivalence margins, such as motivating it using data from previous studies or using field conventions. \citet{Hauck1986} propose a sensitivity analysis approach in the form of plotting the TOST $p$-value against a range of possible margins (``equivalence curves''). Post-hoc specification of a prior distribution for a Bayes factor may likewise be based on historical data, field conventions, or assessed visually with sensitivity analyses. % As error rate control may no longer be ensured in this case, the TOST % $p$-values should not be used as dichotomous decision tools, but rather as % descriptive measures of compatibility between the data and effects outside the % equivalence region \citep{Amrhein2019, Rafi2020, Greenland2023}. While the equivalence test and the Bayes factor are two principled methods for analyzing original and replication studies with null results, they are not the only possible methods for doing so. For instance, the reverse-Bayes approach from \citet{Micheloud2022} specifically tailored to equivalence testing in the replication setting may lead to more appropriate inferences as it also takes into account the compatibility of the effect estimates from original and replication studies. In addition, there are various other Bayesian methods which could potentially improve upon the considered Bayes factor approach. For example, Bayes factors based on non-local priors \citep{Johnson2010} or based on interval null hypotheses \citep{Morey2011, Liao2020}, methods for equivalence testing based on effect size posterior distributions \citep{Kruschke2018}, or Bayesian procedures that involve utilities of decisions \citep{Lindley1998}. 
Finally, the design of replication studies should align with the planned analysis \citep{Anderson2017, Anderson2022, Micheloud2020, Pawel2022c}. % The RPCB determined the sample size of their replication studies to achieve at % least 80\% power for detecting the original effect size which does not seem to % be aligned with their goal If the goal of the study is to find evidence for the absence of an effect, the replication sample size should also be determined so that the study has adequate power to make conclusive inferences regarding the absence of the effect. \section*{Acknowledgements} We thank the contributors of the RPCB for their tremendous efforts and for making their data publicly available. We thank Maya Mathur for helpful advice with the data preparation. This work was supported by the Swiss National Science Foundation (grant \href{https://data.snf.ch/grants/grant/189295}{\#189295}). \section*{Conflict of interest} We declare no conflict of interest. \section*{Data and software} The data from the RPCB were obtained by downloading the files from \url{https://github.com/mayamathur/rpcb} (commit a1e0c63) and executing the R script \texttt{Code/data\_prep.R} with the line 632 commented out so that also original studies with null results are included. This then produced the file \texttt{prepped\_outcome\_level\_data.csv} which was used for the subsequent analyses. The effect estimates and standard errors on SMD scale provided in this data set differ in some cases from those in the data set available at \url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in \citet{Errington2021}. We used this particular version of the data set because it was recommended to us by the RPCB statistician (Maya Mathur) upon request. 
% For the \citet{Dawson2011} example study and its replication \citep{Shan2017},
% the sample sizes $n = 3$ in the data set seem to correspond to the group sample
% sizes, see Figure 5A in the replication study
% (\url{https://doi.org/10.7554/eLife.25306.012}), which is why we report the
% total sample sizes of $n = 6$ in Figure~\ref{fig:2examples}.
The code and data to reproduce our analyses are openly available at \url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}.
% NOTE(review): the Zenodo DOI below looks like a placeholder (XXXXXX) -- confirm before submission
A snapshot of the repository at the time of writing is available at \url{https://doi.org/10.5281/zenodo.XXXXXX}.
We used the statistical programming language R version \Sexpr{paste(version$major, version$minor, sep = ".")} \citep{R} for analyses.
The R packages \texttt{ggplot2} \citep{Wickham2016}, \texttt{dplyr} \citep{Wickham2022}, \texttt{knitr} \citep{Xie2022}, and \texttt{reporttools} \citep{Rufibach2009} were used for plotting, data preparation, dynamic reporting, and formatting, respectively.
% \bibliographystyle{apalikedoiurl}
\bibliography{bibliography}
<< >>=
## see differences between Maya Mathur's and the "official" data set?
## (switch that controls evaluation of the four comparison chunks below)
showdifferences <- FALSE
@
<< eval = showdifferences, results = "asis" >>=
## start a new page and emit the section heading for the plot that is based
## on Maya Mathur's data set
cat("\\newpage \\section*{Maya Mathur's data set}")
@
<< "plot-null-findings-rpcb2", fig.height = 8.25, eval = showdifferences >>=
## One panel per effect (id + effect type) showing original and replication
## SMD estimates with normal-approximation confidence intervals at level
## `conflevel`, annotated with sample sizes and Bayes factors.
margin <- 1
conflevel <- 0.9
ggplot(data = rpcbNull) +
    facet_wrap(~ id + effectType, scales = "free", ncol = 4) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.3) +
    ## equivalence range: dotted lines at -margin and +margin
    geom_hline(yintercept = c(-margin, margin), lty = 3, col = 2, alpha = 0.9) +
    geom_pointrange(aes(x = "Original", y = smdo,
                        ymin = smdo - qnorm(p = (1 + conflevel)/2)*so,
                        ymax = smdo + qnorm(p = (1 + conflevel)/2)*so),
                    size = .25, fatten = 2) +
    geom_pointrange(aes(x = "Replication", y = smdr,
                        ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr,
                        ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr),
                    size = .25, fatten = 2) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    ## sample size labels, placed slightly to the right of each estimate
    geom_text(aes(x = 1.01, y = smdo + so,
                  label = paste("italic(n[o]) ==", no)), col = "darkblue",
              parse = TRUE, size = 2.5, hjust = 0) +
    geom_text(aes(x = 2.01, y = smdr + sr,
                  label = paste("italic(n[r]) ==", nr)), col = "darkblue",
              parse = TRUE, size = 2.5, hjust = 0) +
    ## Bayes factor labels below the intervals; formatBF() already returns
    ## "< 1/1000" or "> 1000" for extreme values, so "==" is only inserted
    ## when the formatted string does not carry its own relation symbol
    geom_text(aes(x = 1, y = pmin(smdo - 2.5*so, smdr - 2.5*sr, -margin),
                  label = paste("BF['01']",
                                ifelse(BForig < 1/1000 | BForig > 1000, "", "=="),
                                BForigformat)), col = "darkblue",
              parse = TRUE, size = 2.5) +
    geom_text(aes(x = 2, y = pmin(smdo - 2.5*so, smdr - 2.5*sr, -margin),
                  label = paste("BF['01']",
                                ifelse(BFrep < 1/1000 | BFrep > 1000, "", "=="),
                                BFrepformat)), col = "darkblue",
              parse = TRUE, size = 2.5) +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major.x = element_blank(),
          ## margin(4) is the ggplot2 function; the numeric `margin` defined
          ## above does not mask it because R skips non-functions in a call
          strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5),
          # panel.margin = unit(-1, "lines"),
          strip.background = element_rect(fill = alpha("tan", 0.4)),
          axis.text = element_text(size = 8))
@
<< eval = showdifferences, results = "asis" >>=
## start a new page and emit the section heading for the plot that is based
## on the "official" data set
cat("\\newpage \\section*{Official data set}")
@
<< "plot-null-findings-rpcb3", fig.height = 8.25, eval = showdifferences >>=
## create same plot with "official" data set
rpcbRaw2 <- read.csv(file = "../data/RP_CB Final Analysis - Effect level data.csv")
rpcb2 <- rpcbRaw2 %>%
    select(paper = Paper..,
           experiment = Experiment..,
           effect = Effect..,
           internalReplication = Internal.replication..,
           effectType = Effect.size.type,
           ## effect sizes, standard errors, p-values on original scale
           ESo = Original.effect.size,
           seESo = Original.standard.error,
           lowerESo = Original.lower.CI,
           upperESo = Original.upper.CI,
           po = Original.p.value,
           ESr = Replication.effect.size,
           seESr = Replication.standard.error,
           lowerESr = Replication.lower.CI,
           upperESr = Replication.upper.CI,
           pr = Replication.p.value,
           ## effect sizes, standard errors, p-values on SMD scale
           smdo = Original.effect.size..SMD.,
           so = Original.standard.error..SMD.,
           no = Original.sample.size,
           smdr = Replication.effect.size..SMD.,
           sr = Replication.standard.error..SMD.,
           nr = Replication.sample.size
           ) %>%
    mutate(
        ## define identifier for effect
        id = paste0("(", paper, ", ", experiment, ", ", effect, ", ",
                    internalReplication, ")"),
        ## recompute one-sided p-values based on normality
        ## (in direction of original effect estimate)
        zo = smdo/so,
        zr = smdr/sr,
        po1 = pnorm(q = abs(zo), lower.tail = FALSE),
        ## pnorm() uses only the first element of `lower.tail`, so the
        ## direction cannot be passed as a vector there; instead the sign of
        ## zr is flipped for negative original estimates and the upper tail
        ## is taken for every row
        pr1 = pnorm(q = ifelse(zo < 0, -zr, zr), lower.tail = FALSE),
        ## compute some other quantities
        c = so^2/sr^2, # variance ratio
        d = smdr/smdo, # relative effect size
        po2 = 2*(1 - pnorm(q = abs(zo))), # two-sided original p-value
        pr2 = 2*(1 - pnorm(q = abs(zr))), # two-sided replication p-value
        sm = 1/sqrt(1/so^2 + 1/sr^2), # standard error of fixed effect estimate
        smdm = (smdo/so^2 + smdr/sr^2)*sm^2, # fixed effect estimate
        pm2 = 2*(1 - pnorm(q = abs(smdm/sm))), # two-sided fixed effect p-value
        Q = (smdo - smdr)^2/(so^2 + sr^2), # Q-statistic
        pQ = pchisq(q = Q, df = 1, lower.tail = FALSE), # p-value from Q-test
        BFr = BFr(to = smdo, tr = smdr, so = so, sr = sr), # replication BF
        BFrformat = formatBF(BF = BFr),
        BForig = BF01(estimate = smdo, se = so), # unit-information BF for original
        BForigformat = formatBF(BF = BForig),
        BFrep = BF01(estimate = smdr, se = sr), # unit-information BF for replication
        BFrepformat = formatBF(BF = BFrep)
    )

## keep the effects whose reported original p-value exceeds 0.05
rpcbNull2 <- rpcb2 %>%
    ## filter(po1 > 0.025) #?
    filter(po > 0.05) #?

margin <- 1
conflevel <- 0.9
ggplot(data = rpcbNull2) +
    facet_wrap(~ id + effectType, scales = "free", ncol = 4) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.3) +
    ## equivalence range: dotted lines at -margin and +margin
    geom_hline(yintercept = c(-margin, margin), lty = 3, col = 2, alpha = 0.9) +
    geom_pointrange(aes(x = "Original", y = smdo,
                        ymin = smdo - qnorm(p = (1 + conflevel)/2)*so,
                        ymax = smdo + qnorm(p = (1 + conflevel)/2)*so),
                    size = .25, fatten = 2) +
    geom_pointrange(aes(x = "Replication", y = smdr,
                        ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr,
                        ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr),
                    size = .25, fatten = 2) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    ## sample size labels, placed slightly to the right of each estimate
    geom_text(aes(x = 1.01, y = smdo + so,
                  label = paste("italic(n[o]) ==", no)), col = "darkblue",
              parse = TRUE, size = 2.5, hjust = 0) +
    geom_text(aes(x = 2.01, y = smdr + sr,
                  label = paste("italic(n[r]) ==", nr)), col = "darkblue",
              parse = TRUE, size = 2.5, hjust = 0) +
    ## Bayes factor labels below the intervals; formatBF() already returns
    ## "< 1/1000" or "> 1000" for extreme values, so "==" is only inserted
    ## when the formatted string does not carry its own relation symbol
    geom_text(aes(x = 1, y = pmin(smdo - 2.5*so, smdr - 2.5*sr, -margin),
                  label = paste("BF['01']",
                                ifelse(BForig < 1/1000 | BForig > 1000, "", "=="),
                                BForigformat)), col = "darkblue",
              parse = TRUE, size = 2.5) +
    geom_text(aes(x = 2, y = pmin(smdo - 2.5*so, smdr - 2.5*sr, -margin),
                  label = paste("BF['01']",
                                ifelse(BFrep < 1/1000 | BFrep > 1000, "", "=="),
                                BFrepformat)), col = "darkblue",
              parse = TRUE, size = 2.5) +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major.x = element_blank(),
          ## margin(4) is the ggplot2 function; the numeric `margin` defined
          ## above does not mask it because R skips non-functions in a call
          strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5),
          # panel.margin = unit(-1, "lines"),
          strip.background = element_rect(fill = alpha("tan", 0.4)),
          axis.text = element_text(size = 8))

## ok I checked the differences
## the studies which are Cohen's d, Cohen's dz, r, Cliff's delta ES type are fine
## the studies with Glass' delta, Hazard ratio, Cohen's w ES type are different
## (do not appear in both data sets with po > 0.05 or they have different estimates or standard errors)
## UPDATE: actually the data sets differ in all standard errors!! even for the Cohen's d :(
@
% \appendix
% \section{Note on $p$-values}
% \todo[inline]{SP: I have used the original $p$-values as reported in the data
% set to select the studies in the figure. I think in this way we have the data
% correctly identified as the RPCB paper reports that there are 20 null results
% in the ``All outcomes'' category. I wonder how they go from the all outcomes
% category to the ``effects'' category (15 null results), perhaps pool the
% internal replications by meta-analysis? I think it would be better to stay in
% the all outcomes category, but of course it needs to be discussed. Also some
% of the $p$-values were probably computed in a different way than under
% normality (e.g., the $p$-value from (47, 1, 6, 1) under normality is clearly
% significant).}
% \begin{figure}[!htb]
<< "plot-p-values", fig.height = 3.5, eval = FALSE >>=
library(ggrepel) # to highlight data points with non-overlapping labels
## check discrepancy between reported and recomputed p-values for null results
## (reported po on the x-axis, po2 recomputed under normality on the y-axis;
## effects with po2 < 0.05 are labelled with their id)
pbreaks <- c(0.005, 0.02, 0.05, 0.15, 0.4)
ggplot(data = rpcbNull, aes(x = po, y = po2)) +
    geom_abline(intercept = 0, slope = 1, alpha = 0.2) +
    geom_vline(xintercept = 0.05, alpha = 0.2, lty = 2) +
    geom_hline(yintercept = 0.05, alpha = 0.2, lty = 2) +
    geom_point(alpha = 0.8, shape = 21, fill = "darkgrey") +
    geom_label_repel(data = filter(rpcbNull, po2 < 0.05),
                     aes(x = po, y = po2, label = id), alpha = 0.8, size = 3,
                     min.segment.length = 0, box.padding = 0.7) +
    labs(x = bquote(italic(p["o"]) ~ "(reported)"),
         y = bquote(italic(p["o"]) ~ "(recomputed under normality)")) +
    ## `labels` spelled out in full on both axes (no partial argument matching)
    scale_x_log10(breaks = pbreaks, labels = scales::percent) +
    scale_y_log10(breaks = pbreaks, labels = scales::percent) +
    coord_fixed(xlim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1),
                ylim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1)) +
    theme_bw() +
    theme(panel.grid.minor = element_blank())
@
% \caption{Reported versus recomputed under normality two-sided $p$-values from
% original studies declared as ``null results'' ($p_{o} > 0.05$) in
% Reproducibility Project: Cancer Biology \citep{Errington2021}.}
% \end{figure}
<< "sessionInfo1", eval = Reproducibility, results = "asis" >>=
## print R sessionInfo to see system information and package versions
## used to compile the manuscript (set Reproducibility = FALSE, to not do that)
cat("\\newpage \\section*{Computational details}")
@
<< "sessionInfo2", echo = Reproducibility, results = Reproducibility >>=
## time stamp, time zone and session information of the compilation; both the
## code echo and the printed results are switched by `Reproducibility`
cat(paste(Sys.time(), Sys.timezone(), "\n"))
sessionInfo()
@
\end{document}