From 68fffbee9b5b7ceaa709779057132fdd3de7ebc7 Mon Sep 17 00:00:00 2001
From: Rachel Heyard <rachel.heyard@uzh.ch>
Date: Fri, 16 Dec 2022 17:06:43 +0100
Subject: [PATCH] some random changes (not done yet).

---
 bibliography.bib | 40 ++++++++++++++++++++++
 rsAbsence.Rnw    | 88 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 98 insertions(+), 30 deletions(-)

diff --git a/bibliography.bib b/bibliography.bib
index 3418193..8e9d3bc 100644
--- a/bibliography.bib
+++ b/bibliography.bib
@@ -82,6 +82,20 @@
   journal = {Frontiers in Psychology}
 }
 
+@article{Chalmers1002,
+  doi = {10.1136/bmj.290.6473.1002},
+  url = {https://www.bmj.com/content/290/6473/1002.1},
+  year = {1985},
+  publisher = {BMJ Publishing Group Ltd},
+  volume = {290},
+  number = {6473},
+  pages = {1002},
+  issn = {0267-0623},
+  author = {Iain Chalmers},
+  title = {Proposal to outlaw the term ``negative trial''},
+  journal = {{BMJ}}
+}
+
 @Article{Bayarri2003,
   doi = {10.1016/s0378-3758(02)00282-3},
   year = {2003},
@@ -1388,6 +1402,18 @@ Visualizing Intersecting Sets},
   journal = {{eLife}}
 }
 
+@article{Errington2021b,
+  doi = {10.7554/elife.73430},
+  url = {https://doi.org/10.7554/elife.73430},
+  year = {2021},
+  month = dec,
+  publisher = {{eLife} Sciences Publications, Ltd},
+  volume = {10},
+  author = {Timothy M Errington and Alexandria Denis and Anne B Allison and Renee Araiza and Pedro Aza-Blanc and Lynette R Bower and Jessica Campos and Heidi Chu and Sarah Denson and Cristine Donham and Kaitlyn Harr and Babette Haven and Elizabeth Iorns and Jennie Kwok and Elysia McDonald and Steven Pelech and Nicole Perfito and Amanda Pike and Darryl Sampey and Michael Settles and David A Scott and Vidhu Sharma and Todd Tolentino and Angela Trinh and Rachel Tsui and Brandon Willis and Joshua Wood and Lisa Young},
+  title = {Experiments from unfinished Registered Reports in the Reproducibility Project: Cancer Biology},
+  journal = {{eLife}}
+}
+
 @article{Bretz2009,
   doi = {10.1002/sim.3538},
   year = {2009},
@@ -2831,6 +2857,20 @@ Discrimination},
   journal = {Statistics in Medicine}
 }
 
+@article{Berner2022,
+  doi = {10.1111/jeb.14009},
+  url = {https://doi.org/10.1111/jeb.14009},
+  year = {2022},
+  month = may,
+  publisher = {Wiley},
+  volume = {35},
+  number = {6},
+  pages = {777--787},
+  author = {Daniel Berner and Valentin Amrhein},
+  title = {Why and how we should join the shift from significance testing to estimation},
+  journal = {Journal of Evolutionary Biology}
+}
+
 
 @book{Senn2008,
   title={Statistical issues in drug development},
diff --git a/rsAbsence.Rnw b/rsAbsence.Rnw
index d1732b1..13422f1 100755
--- a/rsAbsence.Rnw
+++ b/rsAbsence.Rnw
@@ -24,7 +24,7 @@
   bottom=25mm,
 }
 
-\title{\bf Replication studies and absence of evidence}
+\title{\bf Meta-research: Replication studies and absence of evidence}
 \author{{\bf Rachel Heyard, Charlotte Micheloud, Samuel Pawel, Leonhard Held} \\
   Epidemiology, Biostatistics and Prevention Institute \\
   Center for Reproducible Science \\
@@ -116,7 +116,15 @@ formatBF <- Vectorize(FUN = formatBF.)
         some sort of mantra in statistics and medical lectures. The
         misinterpretation of non-significant results as ``null-findings'' is
         however still common and has important consequences for the
-        interpretation of replication projects and alike.
+        interpretation of replication projects and the like. In many replication
+        attempts and large-scale replication projects, failure to reject the null
+        hypothesis in the replication study is interpreted as a successful
+        replication or even as proof of a null effect. Methods to adequately
+        summarize the evidence for the null have been proposed. With this paper we
+        want to highlight the consequences of the ``absence of evidence'' fallacy
+        in the replication setting and to guide readers, and hopefully future
+        authors of replication studies, towards appropriate methods for designing
+        and analysing replication attempts.
       } \\
       \rule{\textwidth}{0.5pt} \emph{Keywords}: Bayesian hypothesis testing,
       equivalence test, non-inferiority test, null hypothesis, replication
@@ -127,8 +135,6 @@ formatBF <- Vectorize(FUN = formatBF.)
 
 \section{Introduction}
 
-
-
 The general misconception that statistical non-significance indicates evidence
 for the absence of an effect is unfortunately widespread \citep{Altman1995}. A
 well-designed study is constructed in a way that a large enough sample (of
@@ -136,41 +142,62 @@ participants, n) is used to achieve an 80-90\% power of correctly rejecting the
 null hypothesis. This leaves us with a 10-20\% chance of a false negative.
 Somehow this fact from ``Hypothesis Testing 101'' is all too often forgotten and
 studies showing an effect with a p-value larger than the conventionally used
-significance level of 0.05 is doomed to be ``negative study'' or showing a
-``null effect''. Some have even pleaded for abolishing the term ``negative
-study'', as every well-designed and conducted study is a ``positive contribution
-to knowledge'', regardless it’s results [REF]. \todo[inline]{Some more from
-  https://onlinelibrary.wiley.com/doi/full/10.1111/jeb.14009}
-
-More specifically, turning to the replication context, the misconception
+significance level of $\alpha = 0.05$ are doomed to be ``negative studies'' or to show a
+``null effect''. Some have even called for abolishing the term ``negative
+study'' altogether, as every well-designed and well-conducted study is a ``positive
+contribution to knowledge'', regardless of its results \citep{Chalmers1002}. Others
+suggest shifting away from significance testing because of the many misconceptions
+surrounding $p$-values and significance \citep{Berner2022}.
+
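+To make these numbers concrete, consider the following small simulation (a
+sketch with hypothetical design values, not data from any of the projects
+discussed below): even a study designed with 80\% power to detect a
+standardized effect of 0.5 remains non-significant in roughly 20\% of
+repetitions when the true effect equals the effect it was powered for.
+<<power-illustration, echo = TRUE>>=
+## hypothetical design: 80% power for a standardized effect of 0.5
+design <- power.t.test(delta = 0.5, sd = 1, power = 0.8, sig.level = 0.05)
+n <- ceiling(design$n) # sample size per group
+
+## simulate many two-sample studies under the assumed true effect of 0.5
+set.seed(42)
+pvals <- replicate(10000, t.test(rnorm(n, mean = 0.5), rnorm(n))$p.value)
+mean(pvals > 0.05) # around 0.2: non-significance despite a true effect
+@
+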
+More specifically, turning to the replication context, the ``absence of evidence'' fallacy
 appeared in the definitions of replication success in some of the large-scale
-replication projects. The Replication Project Cancer Biology (RPCB [REF]) and
-the RP in Experimental Philosophy (RPEP [REF]) explicitly define a replication
-of a non-significant original effect as successful if the effect in the
+replication projects. The Reproducibility Project: Cancer Biology \citep[RPCB;][]{Errington2021}
+and the RP in Experimental Philosophy \citep[RPEP;][]{Cova2018} explicitly define a
+replication of a non-significant original effect as successful if the effect in the
 replication study is also non-significant. While the authors of the RPEP warn
 the reader that the use of $p$-values as a criterion for success is problematic when
 applied to replications of original non-significant findings, the authors of the
-RPCB do not. The RP in Psychological Science [REF], on the other hand, excluded
-the ``original nulls'' when deciding replication success based on significance and
-the Social Science RP [REF] as well as the RP in Experimental Economics [REF]
-did not include original studies without a significant finding.
+RPCB do not. The RP in Psychological Science \citep{Opensc2015}, on the other hand,
+excluded the ``original nulls'' when assessing replication success based on significance,
+and the Social Science RP \citep{Camerer2018} as well as the RP in Experimental Economics
+\citep{Camerer2016} did not include original studies without a significant finding.
 
-\section{To replicate or not to replicate (a ``null'')?}
+\textbf{To replicate or not to replicate an original ``null'' finding?}
 Because of the previously presented fallacy, original studies with
 non-significant effects are seldom replicated. Given the cost of replication
 studies, it is also unwise to advise replicating a study that has low chances of
 successful replication. To help decide which studies are worth repeating,
 efforts to predict which studies have a higher chance to replicate successfully
-emerged [REF]. Of note is that the chance of a successful replication
+emerged \citep{Altmejd2019, Pawel2020}. Note that the chance of a successful replication
 intrinsically depends on the definition of replication success. If for a
 successful replication we need a ``significant result in the same direction in
-both the original and the replication study'' (i.e. the two-trials rule),
+both the original and the replication study'' (i.e., the two-trials rule; \citealp{Senn2008}),
 replicating a non-significant original result does indeed not make any sense.
 However, the use of significance as the sole criterion for replication success has
-its shortcomings .....
-\todo[inline]{SP: look and discuss the papers from \citet{Anderson2016, Anderson2017}}
-
-\section{Example: ``Null findings'' from the Reproducibility Project: Cancer
+its shortcomings.
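+
+As a minimal sketch (the helper function and its inputs are ours, purely for
+illustration), the two-trials rule reduces to a simple check of the two
+$p$-values and effect directions, which makes explicit that an original
+``null'' can never satisfy it:
+<<two-trials-rule, echo = TRUE>>=
+## two-trials rule: success requires significance at level alpha in both
+## studies and effect estimates pointing in the same direction
+twoTrialsSuccess <- function(po, pr, esto, estr, alpha = 0.05) {
+  (po < alpha) && (pr < alpha) && (sign(esto) == sign(estr))
+}
+## a non-significant original (po > alpha) always fails, whatever the
+## replication shows (all numbers hypothetical)
+twoTrialsSuccess(po = 0.2, pr = 0.01, esto = 0.3, estr = 0.25)
+@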
+
+\citet{Anderson2016} summarized the goals of replications together with
+recommended analyses and success criteria. Interestingly, they recommend using
+the two-trials rule only if the goal is to infer the \textit{existence and
+direction} of a statistically significant effect, while the replicating
+researchers are not interested in the size of this effect. A successful
+replication attempt would then result in a small $p$-value, while a large
+$p$-value in the replication would only mean that the existence of the effect
+could not be confirmed; it would not constitute evidence for its absence.
+If, on the contrary, the goal is to infer a null effect, \citet{Anderson2016}
+write that evidence for the null hypothesis has to be provided. To achieve
+this, equivalence tests or Bayesian methods quantifying the evidence for the
+null hypothesis can be used. In the following, we will illustrate how to
+accurately interpret potential replications of original non-significant
+results in the RPCB.
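+
+For instance, under a normal approximation to the effect estimate, the Bayes
+factor contrasting $H_0\colon \theta = 0$ with a normal prior for $\theta$
+under $H_1$ is available in closed form; the following sketch uses made-up
+numbers purely for illustration.
+<<bf-null-sketch, echo = TRUE>>=
+## BF01 comparing H0: theta = 0 against H1: theta ~ N(0, g^2) for an
+## estimate est with standard error se (normal approximation)
+bf01 <- function(est, se, g) {
+  dnorm(est, mean = 0, sd = se) / dnorm(est, mean = 0, sd = sqrt(se^2 + g^2))
+}
+bf01(est = 0.05, se = 0.2, g = 0.5) # BF01 > 1 favours the null hypothesis
+@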
+% \todo[inline]{SP: look and discuss the papers from \citet{Anderson2016, Anderson2017}}
+\todo[inline]{RH: Not sure what to cite from \citet{Anderson2017}}
+
+
+In general, a non-significant original finding means neither that the underlying
+true effect is zero nor that it does not exist. This is especially true if the
+original study is underpowered. \todo[inline]{RH: for myself, more blabla on
+underpowered original studies}
+
+\section{Example: ``Null findings'' from the Reproducibility Project: Cancer
   Biology}
 Of the 158 effects presented in 23 original studies that were repeated in the
 cancer biology RP \citep{Errington2021}, 14\% (22) were interpreted as ``null
@@ -179,7 +206,7 @@ effects''.
 % presented in Lu et al. (2014) and replicated by Richarson et al (2016).
 Note that the attempt to replicate all the experiments from the original study
 was not completed because of some unforeseen issues in the implementation (see
-Errington et al (2021) for more details on the unfinished registered reports in
+\citet{Errington2021b} for more details on the unfinished registered reports in
 the RPCB). Figure~\ref{fig:nullfindings} shows effect estimates with confidence
 intervals for the original ``null findings'' (with $p_{o} > 0.05$) and their
 replication studies from the project.
@@ -315,18 +342,19 @@ ggplot(data = rpcbNull) +
 \label{fig:nullfindings}
 \end{figure}
 
-\section{Equivalence Design}
+
+\section{Dealing with original non-significant findings in replication projects}
+\subsection{Equivalence Design}
 For many years, equivalence designs have been used in clinical trials to
 assess whether a new drug, which might be cheaper or have fewer side effects,
 is equivalent to a drug already on the market [some general REF]. Essentially,
 this type of design tests whether the difference between the effects of the two
 treatments or interventions is smaller than a predefined margin/threshold.
 Turning back to the replication context and our example ....
-% \todo[inline]{fix margin:
-% to 0.25??}
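+
+As a sketch of how such a test could look in the replication setting (the
+margin $\Delta$ and the estimates are placeholders, not values from the RPCB),
+the TOST procedure declares equivalence when the $(1 - 2\alpha)$ confidence
+interval lies entirely within $[-\Delta, \Delta]$:
+<<tost-sketch, echo = TRUE>>=
+## equivalence via the confidence interval inclusion rule: the effect is
+## considered equivalent to zero if the (1 - 2 * alpha) CI falls within
+## the margin [-Delta, Delta]; all inputs here are hypothetical
+equivalenceTest <- function(est, se, Delta, alpha = 0.05) {
+  z <- qnorm(1 - alpha)
+  ci <- c(lower = est - z * se, upper = est + z * se)
+  list(ci = ci, equivalent = unname(ci["lower"] > -Delta & ci["upper"] < Delta))
+}
+equivalenceTest(est = 0.03, se = 0.1, Delta = 0.3)
+@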
 
 
-\section{Bayesian Hypothesis Testing}
+
+\subsection{Bayesian Hypothesis Testing}
 Bayesian hypothesis testing is a hypothesis testing framework in which the
 distinction between absence of evidence and evidence of absence is more natural.
 The central quantity is the Bayes factor \citep{Jeffreys1961, Good1958,
-- 
GitLab