diff --git a/bibliography.bib b/bibliography.bib
index 8e9d3bced12fe69a59d80968316aff578c81227b..d0173215c5ac3687e123dd440c67dbc7384a2a09 100644
--- a/bibliography.bib
+++ b/bibliography.bib
@@ -96,6 +96,20 @@
 journal = {BMJ}
 }
 
+@article{Goodman2008,
+  doi = {10.1053/j.seminhematol.2008.04.003},
+  url = {https://doi.org/10.1053/j.seminhematol.2008.04.003},
+  year = {2008},
+  month = jul,
+  publisher = {Elsevier {BV}},
+  volume = {45},
+  number = {3},
+  pages = {135--140},
+  author = {Steven Goodman},
+  title = {A Dirty Dozen: Twelve P-Value Misconceptions},
+  journal = {Seminars in Hematology}
+}
+
 @Article{Bayarri2003,
   doi = {10.1016/s0378-3758(02)00282-3},
   year = {2003},
@@ -830,6 +844,15 @@ url = {www.fda.gov/regulatory-information/search-fda-guidance-documents/providi
 title = {New preprint server for medical research},
 journal = {{BMJ}}
 }
+@book{NSF2019,
+  doi = {10.17226/25303},
+  url = {https://doi.org/10.17226/25303},
+  year = {2019},
+  month = sep,
+  publisher = {National Academies Press},
+  author = {{National Academies of Sciences, Engineering, and Medicine}},
+  title = {Reproducibility and Replicability in Science}
+}
 
 @Manual{Gehlenborg2019,
   title = {UpSetR: A More Scalable Alternative to Venn and Euler Diagrams for
diff --git a/rsAbsence.Rnw b/rsAbsence.Rnw
index 13422f1aed611e2ac85c0030ecc42a9e832f2bdb..f3562ea84adee6d3453fb092765f456236469984 100755
--- a/rsAbsence.Rnw
+++ b/rsAbsence.Rnw
@@ -24,7 +24,7 @@ bottom=25mm,
 }
 
-\title{\bf Meta-research: Replication studies and absence of evidence}
+\title{\bf Meta-research: Replication studies and the ``absence of evidence''}
 \author{{\bf Rachel Heyard, Charlotte Micheloud, Samuel Pawel, Leonhard Held} \\
   Epidemiology, Biostatistics and Prevention Institute \\
   Center for Reproducible Science \\
@@ -122,9 +122,9 @@ formatBF <- Vectorize(FUN = formatBF.)
   replicating or even proving a null-effect. Methods to adequately summarize
   the evidence for the null have been proposed. With this paper we want to
   highlight the consequences of the ``absence of evidence'' fallacy in the
-  replication setting and want to guide the readers and hopefully future
-  authors of replication studies to the correct methods to design and
-  analyse their replication attempts.
+  replication setting and want to guide readers and future authors of
+  replication studies to existing methods for appropriately designing and
+  analysing replication attempts of non-significant findings.
 } \\
 \rule{\textwidth}{0.5pt}
 \emph{Keywords}: Bayesian hypothesis testing, equivalence test, non-inferiority test, null hypothesis, replication
@@ -142,25 +142,41 @@ participants, n) is used to achieve an 80-90\% power of correctly rejecting the
 null hypothesis. This leaves us with a 10-20\% chance of a false negative.
 Somehow this fact from ``Hypothesis Testing 101'' is all too often forgotten
 and studies showing an effect with a p-value larger than the conventionally used
-significance level of $\alpha = 0.05$ is doomed to be a ``negative study'' or showing a
+significance level of $\alpha = 0.05$ are doomed to be ``negative studies'' or to show a
 ``null effect''. Some have even called to abolish the term ``negative
-study'' altogether, as every well-designed and conducted study is a ``positive
+study'' altogether, as every well-designed and well-conducted study is a ``positive
 contribution to knowledge'', regardless it’s results \citep{Chalmers1002}.
 Others suggest to shift away from significance testing because of the many misconceptions
-of $p$-values and significance \citep{Berner2022}.
-
-More specifically, turning to the replication context, ``the absence of evidence'' fallacy
-appeared in the definitions of replication success in some of the large-scale
-replication projects. The Replication Project Cancer Biology \citep[RPCB]{Errington2021}
-and the RP in Experimental Philosophy \citep[RPEP]{Cova2018} explicitly define a
-replication of a non-significant original effect as successful if the effect in the
-replication study is also non-significant. While the authors of the RPEP warn
-the reader that the use of p-values as criterion for success is problematic when
-applied to replications of original non-significant findings, the authors of the
-RPCB do not. The RP in Psychological Science \citep{Opensc2015}, on the other hand,
-excluded the ``original nulls'' when deciding replication success based on significance and
-the Social Science RP \citep{Camerer2018} as well as the RP in Experimental Economics
-\cite{Camerer2016} did not include original studies without a significant finding.
+of $p$-values and significance \citep{Goodman2008, Berner2022}.
+
+Turning to the replication context, replicability has been defined as
+``obtaining consistent results across studies aimed at answering the same
+scientific question, each of which has obtained its own data'' \citep{NSF2019}.
+Hence, a replication of an original finding attempts to find consistent results
+by applying the same methods and protocol as published in the original study to
+newly collected data. In the past decade, large collaborations of researchers
+and research groups conducted large-scale replication projects (RPs) to
+estimate the replicability of their respective research fields. In these
+projects, a set of influential, high-impact original studies was selected to be
+replicated, following the original methodology as closely as possible. The
+results and conclusions of the RPs showed alarmingly low levels of
+replicability in most fields. The Replication Project Cancer Biology
+\citep[RPCB]{Errington2021}, the RP in Experimental Philosophy
+\citep[RPEP]{Cova2018} and the RP in Psychological Science
+\citep[RPP]{Opensc2015} also attempted to replicate original studies with
+non-significant effects. The authors of these RPs unfortunately fell into the
+trap of the ``absence of evidence'' fallacy when defining successful
+replications. More specifically, the RPCB and the RPEP explicitly define a
+replication of a non-significant original effect as successful if the effect in
+the replication study is also non-significant. While the authors of the RPEP
+warn the reader that the use of $p$-values as a criterion for success is
+problematic when applied to replications of original non-significant findings,
+the authors of the RPCB do not. In the RPP, on the other hand, ``original
+nulls'' were excluded when assessing replication success based on significance.
+
+% In general, using the significance criterion as definition of replication success
+% arises from a false interpretation of the failure to find evidence against the null
+% hypothesis as evidence for the null. A non-significant original finding does not
+% mean that the underlying true effect is zero nor that it does not exist. This is
+% especially true if the original study is under-powered.
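+
+A small numerical sketch (with arbitrary numbers, not taken from any of the
+RPs) illustrates why two non-significant results do not establish a ``null
+effect'': if the true standardized mean difference is $0.3$ and each study
+uses $20$ observations per group, a two-sided $t$-test at $\alpha = 0.05$ has
+a power of only about $15\%$, so roughly $70\%$ of original--replication pairs
+would show two non-significant results even though the true effect is not
+zero.
+<< "power-illustration", echo = TRUE, eval = FALSE >>=
+## Illustration with arbitrary numbers: power of a two-sample t-test for a
+## true standardized mean difference of 0.3 with 20 observations per group
+pow <- power.t.test(n = 20, delta = 0.3, sd = 1, sig.level = 0.05)$power
+pow          ## roughly 0.15
+(1 - pow)^2  ## roughly 0.7: both original and replication non-significant
+@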
+
 \textbf{To replicate or not to replicate an original ``null'' finding?}
 
 Because of the previously presented fallacy, original studies with
@@ -174,41 +190,28 @@ successful replication we need a ``significant result in the same direction in
 both the original and the replication study'' (i.e. the two-trials rule,
 \cite{Senn2008}), replicating a non-significant original result does indeed not
 make any sense. However, the use of significance as sole criterion for
 replication success has
-its shortcomings.
-
-\citet{Anderson2016} summarized the goals of replications and recommended analyses and
-success criterion. Interestingly they recommended using the two-trials rule only if
-the goal is to infer the \textit{existence and direction} of a statistical significant
-effect, while the replicating researchers are not interested in the size of this effect.
-A successful replication attempt would result in a small $p$-value, while a large $p$-value
-in the replication would only mean that the
-On the contrary, if the goal is to infer a null effect \cite{Anderson2016} write that,
-in this case, evidence for the null hypothesis has to be provided. To achieve this
-goal equivalence tests or Bayesian methods to quantify the evidence for the null
-hypothesis can be used. In the following, we will illustrate how to accurately
-interpret the potential replication of original non-significant results in the
-Cancer Biology Replication Project.
-% \todo[inline]{SP: look and discuss the papers from \citet{Anderson2016, Anderson2017}}
-\todo[inline]{RH: Note sure what to cite from \citet{Anderson2017}}
-
-
-In general a non-significant original finding does not mean that the underlying
-true effect is zero nor that it does not exist. This is especially true if the
-original study is under-powered. \todo[inline]{RH: for myself, more blabla on
-under-powered original studies}
+its shortcomings, and other definitions of replication success have been
+proposed \citep{Simonsohn2015, Ly2018, Hedges2019, Held2020}. Additionally,
+replication studies themselves have to be well designed in order to ensure
+sufficiently high replication power \citep{Anderson2017, Micheloud2020}.
+
+According to \citet{Anderson2016}, if the goal of a replication is to infer a
+null effect, evidence for the null hypothesis has to be provided. To achieve
+this, they recommend equivalence tests or Bayesian methods that quantify the
+evidence for the null hypothesis. In the following, we will illustrate how to
+accurately interpret replications of original non-significant results in the
+Replication Project Cancer Biology.
 
 \section{Example: ``Null findings'' from the Replication Project Cancer Biology}
 
 Of the 158 effects presented in 23 original studies that were repeated in the
-cancer biology RP \citep{Errington2021} 14\% (22) were interpreted as ``null
+RPCB \citep{Errington2021}, 14\% (22) were interpreted as ``null
 effects''.
-% One of those repeated effects with a non-significant original finding was
-% presented in Lu et al. (2014) and replicated by Richarson et al (2016).
-Note that the attempt to replicate all the experiments from the original study
-was not completed because of some unforeseen issues in the implementation (see
-\cite{Errington2021b} for more details on the unfinished registered reports in
-the RPCB).
-Figure~\ref{fig:nullfindings} shows effect estimates with confidence
-intervals for the original ``null findings'' (with $p_{o} > 0.05$) and their
+% Note that the attempt to replicate all the experiments from the original study
+% was not completed because of some unforeseen issues in the implementation (see
+% \cite{Errington2021b} for more details on the unfinished registered reports in
+% the RPCB).
+Figure~\ref{fig:nullfindings} shows effect estimates with confidence
+intervals for these original ``null findings'' (with $p_{o} > 0.05$) and their
 replication studies from the project.
 % The replication of our example effect (Paper \# 47, Experiment \# 1, Effect \#
 % 5) was however completed. The authors of the original study declared that
@@ -223,16 +226,6 @@ replication studies from the project.
 % effect sizes together with their 95\% confidence intervals and respective
 % two-sided p-values.
 
-\todo[inline]{SP: I have used the original $p$-values as reported in the data
-  set to select the studies in the figure . I think in this way we have the data
-  correctly identified as the RPCP paper reports that there are 20 null findings
-  in the ``All outcomes'' category. I wonder how they go from the all outcomes
-  category to the ``effects'' category (15 null findings), perhaps pool the
-  internal replications by meta-analysis? I think it would be better to stay in
-  the all outcomes category, but of course it needs to be discussed. Also some
-  of the $p$-values were probably computed in a different way than under
-  normality (e.g., the $p$-value from (47, 1, 6, 1) under normality is clearly
-  significant).}
 
 << "data" >>=
 ## data
@@ -282,53 +275,31 @@ rpcbNull <- rpcb %>%
 
 @
 
-\begin{figure}[!htb]
-<< "plot-p-values", fig.height = 3.5 >>=
-## check discrepancy between reported and recomputed p-values for null results
-pbreaks <- c(0.005, 0.02, 0.05, 0.15, 0.4)
-ggplot(data = rpcbNull, aes(x = po, y = po2)) +
-    geom_abline(intercept = 0, slope = 1, alpha = 0.2) +
-    geom_vline(xintercept = 0.05, alpha = 0.2, lty = 2) +
-    geom_hline(yintercept = 0.05, alpha = 0.2, lty = 2) +
-    geom_point(alpha = 0.8, shape = 21, fill = "darkgrey") +
-    geom_label_repel(data = filter(rpcbNull, po2 < 0.05),
-                     aes(x = po, y = po2, label = id), alpha = 0.8, size = 3,
-                     min.segment.length = 0, box.padding = 0.7) +
-    labs(x = bquote(italic(p["o"]) ~ "(reported)"),
-         y = bquote(italic(p["o"]) ~ "(recomputed under normality)")) +
-    scale_x_log10(breaks = pbreaks, label = scales::percent) +
-    scale_y_log10(breaks = pbreaks, labels = scales::percent) +
-    coord_fixed(xlim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1),
-                ylim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1)) +
-    theme_bw() +
-    theme(panel.grid.minor = element_blank())
-
-
-@
-\caption{Reported versus recomputed under normality two-sided $p$-values from
-  original studies declared as ``null findings'' ($p_{o} > 0.05$) in
-  Reproducibility Project: Cancer Biology \citep{Errington2021}.}
-\end{figure}
 
 
 \begin{figure}[!htb]
-<< "plot-null-findings-rpcb", fig.height = 8.5 >>=
+<< "plot-null-findings-rpcb", fig.height =8.5 >>=
 ggplot(data = rpcbNull) +
-    facet_wrap(~ id, scales = "free", ncol = 4) +
-    geom_hline(yintercept = 0, lty = 2, alpha = 0.5) +
-    geom_pointrange(aes(x = "Original", y = smdo, ymin = smdo - 2*so,
-                        ymax = smdo + 2*so)) +
-    geom_pointrange(aes(x = "Replication", y = smdr, ymin = smdr - 2*sr,
-                        ymax = smdr + 2*sr)) +
-    geom_text(aes(x = "Replication", y = pmax(smdr + 2.1*sr, smdo + 2.1*so),
-                  label = paste("'BF'['01']",
"=="), - BFrformat)), - parse = TRUE, size = 3, - nudge_y = -0.5) + - labs(x = "", y = "Standardized mean difference (SMD)") + - theme_bw() + - theme(panel.grid.minor = element_blank(), - panel.grid.major.x = element_blank()) + facet_wrap(~ id, scales = "free", ncol = 4) + + geom_hline(yintercept = 0, lty = 2, alpha = 0.5) + + geom_pointrange(aes(x = "Original", y = smdo, ymin = smdo - 2*so, + ymax = smdo + 2*so)) + + geom_pointrange(aes(x = "Replication", y = smdr, ymin = smdr - 2*sr, + ymax = smdr + 2*sr)) + + labs(x = "", y = "Standardized mean difference (SMD)") + + geom_text(aes(x = 1.4, y = smdo, #pmin(smdr - 2.2*sr, smdo - 2.2*so), + label = paste("n[o]==", no)), col = "darkblue", + parse = TRUE, size = 2.5, + nudge_x = -.05) + + geom_text(aes(x = 2.4, y = smdr, #pmin(smdr - 2.2*sr, smdo - 2.2*so), + label = paste("n[r]==", nr)), col = "darkblue", + parse = TRUE, size = 2.5, + nudge_x = -.05) + + theme_bw() + + theme(panel.grid.minor = element_blank(), + panel.grid.major.x = element_blank()) + +# TODO: one replication is missing, id == "(37, 2, 2, 1)" +# what should we do with it? @ \caption{Standardized mean difference effect estimates with 95\% confidence @@ -338,12 +309,14 @@ ggplot(data = rpcbNull) + number, Effect number, Internal replication number). The data were downloaded from \url{https://doi.org/10.17605/osf.io/e5nvr}. The relevant variables were extracted from the file ``\texttt{RP\_CB Final Analysis - Effect level - data.csv}''.} + data.csv}''. Additionally the original ($n_o$) and replication sample sizes + ($n_r$) are indicated in each plot.} \label{fig:nullfindings} \end{figure} \section{Dealing with original non-significant findings in replication projects} + \subsection{Equivalence Design} For many years, equivalence designs have been used in clinical trials to understand whether a new drug, which might be cheaper or have less side effects @@ -384,10 +357,85 @@ absence of evidence for either hypothesis ($\BF_{01} \approx 1$). % the replication Bayes factor \citep{Verhagen2014}. +\begin{figure}[!htb] +<< "plot-null-findings-rpcb-br", fig.height = 8.5 >>= +ggplot(data = rpcbNull) + + facet_wrap(~ id, scales = "free", ncol = 4) + + geom_hline(yintercept = 0, lty = 2, alpha = 0.5) + + geom_pointrange(aes(x = "Original", y = smdo, ymin = smdo - 2*so, + ymax = smdo + 2*so)) + + geom_pointrange(aes(x = "Replication", y = smdr, ymin = smdr - 2*sr, + ymax = smdr + 2*sr)) + + geom_text(aes(x = "Replication", y = pmax(smdr + 2.1*sr, smdo + 2.1*so), + label = paste("'BF'['01']", + ifelse(BFrformat == "< 1/1000", "", "=="), + BFrformat)), + parse = TRUE, size = 3, + nudge_y = -0.5) + + labs(x = "", y = "Standardized mean difference (SMD)") + + theme_bw() + + theme(panel.grid.minor = element_blank(), + panel.grid.major.x = element_blank()) + +@ +\caption{Standardized mean difference effect estimates with 95\% confidence + interval for the ``null findings'' (with $p_{o} > 0.05$) and their replication + studies from the Reproducibility Project: Cancer Biology \citep{Errington2021}. + The identifier above each plot indicates (Original paper number, Experiment + number, Effect number, Internal replication number). The data were downloaded + from \url{https://doi.org/10.17605/osf.io/e5nvr}. 
+
 \bibliographystyle{apalikedoiurl}
 \bibliography{bibliography}
 
+\appendix
+
+\section{Note on $p$-values}
+
+\todo[inline]{SP: I have used the original $p$-values as reported in the data
+  set to select the studies in the figure. I think in this way we have the data
+  correctly identified as the RPCB paper reports that there are 20 null findings
+  in the ``All outcomes'' category. I wonder how they go from the all outcomes
+  category to the ``effects'' category (15 null findings), perhaps pool the
+  internal replications by meta-analysis? I think it would be better to stay in
+  the all outcomes category, but of course it needs to be discussed. Also some
+  of the $p$-values were probably computed in a different way than under
+  normality (e.g., the $p$-value from (47, 1, 6, 1) under normality is clearly
+  significant).}
+
+\begin{figure}[!htb]
+<< "plot-p-values", fig.height = 3.5 >>=
+## check discrepancy between reported and recomputed p-values for null results
+pbreaks <- c(0.005, 0.02, 0.05, 0.15, 0.4)
+ggplot(data = rpcbNull, aes(x = po, y = po2)) +
+    geom_abline(intercept = 0, slope = 1, alpha = 0.2) +
+    geom_vline(xintercept = 0.05, alpha = 0.2, lty = 2) +
+    geom_hline(yintercept = 0.05, alpha = 0.2, lty = 2) +
+    geom_point(alpha = 0.8, shape = 21, fill = "darkgrey") +
+    geom_label_repel(data = filter(rpcbNull, po2 < 0.05),
+                     aes(x = po, y = po2, label = id), alpha = 0.8, size = 3,
+                     min.segment.length = 0, box.padding = 0.7) +
+    labs(x = bquote(italic(p["o"]) ~ "(reported)"),
+         y = bquote(italic(p["o"]) ~ "(recomputed under normality)")) +
+    scale_x_log10(breaks = pbreaks, labels = scales::percent) +
+    scale_y_log10(breaks = pbreaks, labels = scales::percent) +
+    coord_fixed(xlim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1),
+                ylim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1)) +
+    theme_bw() +
+    theme(panel.grid.minor = element_blank())
+
+@
+\caption{Reported versus recomputed (under normality) two-sided $p$-values from
+  original studies declared as ``null findings'' ($p_{o} > 0.05$) in the
+  Reproducibility Project: Cancer Biology \citep{Errington2021}.}
+\end{figure}
 
 << "sessionInfo1", eval = Reproducibility, results = "asis" >>=
 ## print R sessionInfo to see system information and package versions