Commit 802d7992 authored by SamCH93

more Bayes stuff, conclusions

parent e7c89bd8
@article{Morey2011,
doi = {10.1037/a0024377},
year = {2011},
volume = {16},
number = {4},
pages = {406--419},
author = {Richard D. Morey and Jeffrey N. Rouder},
title = {Bayes factor approaches for testing interval null hypotheses},
journal = {Psychological Methods}
}
@article{Micheloud2022,
doi = {10.48550/ARXIV.2204.06960},
author = {Micheloud, Charlotte and Held, Leonhard},
title = {The replication of non-inferiority and equivalence studies},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license},
note = {arXiv preprint}
}
@article{Schuirmann1987,
doi = {10.1007/bf01068419},
year = {1987},
...
@@ -421,7 +421,7 @@ $p$-values are greater than 5\% in both the original and the replication study),
but intuition would suggest that these two pairs are very much different.
\begin{figure}[ht]
<< "2-example-studies", fig.height = 3.25 >>=
## some evidence for absence of effect (when a really generous margin Delta = 1
## or a lenient BF = 3 threshold is used)
## https://doi.org/10.7554/eLife.45120 (the replication effect reported in the data set could not be located in this publication)
@@ -446,17 +446,17 @@ ggplot(data = plotDF1) +
                      ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr,
                      ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), fatten = 3) +
    geom_text(aes(x = 1.05, y = 2.5,
                  label = paste("italic(n) ==", no)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 2.5,
                  label = paste("italic(n) ==", nr)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 1.05, y = 3,
                  label = paste("italic(p) ==", biostatUZH::formatPval(po))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 3,
                  label = paste("italic(p) ==", biostatUZH::formatPval(pr))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
@@ -503,7 +503,7 @@ discuss how the two can be quantitatively distinguished.
\begin{figure}[!htb]
<< "plot-null-findings-rpcb", fig.height = 8.25 >>=
margin <- 1
conflevel <- 0.9
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = margin, sd = so,
@@ -514,6 +514,10 @@ rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = margin, sd = sr,
                                              lower.tail = TRUE),
                                        pnorm(q = smdr, mean = -margin, sd = sr,
                                              lower.tail = FALSE)))
## highlight the studies from Goetz and Dawson
rpcbNull$id <- ifelse(rpcbNull$id == "(20, 1, 1, 1)", "(20, 1, 1, 1) - Goetz et al. (2011)", rpcbNull$id)
rpcbNull$id <- ifelse(rpcbNull$id == "(29, 2, 2, 1)", "(29, 2, 2, 1) - Dawson et al. (2011)", rpcbNull$id)
estypes <- c("r", "Cohen's dz", "Cohen's d")
ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
    facet_wrap(~ id ## + effectType
@@ -538,26 +542,26 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
label = paste("italic(p)['TOST']", label = paste("italic(p)['TOST']",
ifelse(ptosto < 0.0001, "", "=="), ifelse(ptosto < 0.0001, "", "=="),
biostatUZH::formatPval(ptosto))), biostatUZH::formatPval(ptosto))),
col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
vjust = 0.5) + vjust = 0.5) +
geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("italic(p)['TOST']", label = paste("italic(p)['TOST']",
ifelse(ptostr < 0.0001, "", "=="), ifelse(ptostr < 0.0001, "", "=="),
biostatUZH::formatPval(ptostr))), biostatUZH::formatPval(ptostr))),
col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
vjust = 0.5) + vjust = 0.5) +
geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="), label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="),
BForigformat)), col = "darkblue", BForigformat)), col = "darkblue",
parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="), label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="),
BFrepformat)), col = "darkblue", BFrepformat)), col = "darkblue",
parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
theme_bw() + theme_bw() +
theme(panel.grid.minor = element_blank(), theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(), panel.grid.major = element_blank(),
strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5), strip.text = element_text(size = 6.4, margin = margin(3), vjust = 2),
# panel.margin = unit(-1, "lines"), # panel.margin = unit(-1, "lines"),
strip.background = element_rect(fill = alpha("tan", .4)), strip.background = element_rect(fill = alpha("tan", .4)),
axis.text = element_text(size = 8)) axis.text = element_text(size = 8))
@@ -567,15 +571,17 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
(those with two-sided $p$-value $p_{o} > 0.05$) and their replication studies
from the Reproducibility Project: Cancer Biology \citep{Errington2021}. The
identifier above each plot indicates (Original paper number, Experiment
number, Effect number, Internal replication number). The two examples
from Figure~\ref{fig:2examples} are indicated
in the plot titles. The dashed grey line
depicts the value of no effect ($\text{SMD} = 0$), whereas the dotted red lines
depict the equivalence range with margin $\Delta = \Sexpr{margin}$. The
$p$-values $p_{\text{TOST}}$ are the maximum of the two one-sided $p$-values
for the effect being smaller than $+\Delta$ or greater than $-\Delta$,
respectively. The Bayes factors $\BF_{01}$ quantify evidence for the null
hypothesis $H_{0} \colon \text{SMD} = 0$ against the alternative
$H_{1} \colon \text{SMD} \neq 0$ with a normal unit-information prior assigned
to the SMD under $H_{1}$.
% Additionally, the
% original effect size type is indicated, while all effect sizes were
% transformed to the SMD scale.
@@ -629,14 +635,14 @@ research; we chose it primarily for illustrative purposes and because effect
sizes in preclinical research are typically much larger than in clinical
research. In practice, the margin should be determined on a case-by-case basis
by researchers who are familiar with the subject matter. However, even with this
generous margin, only four of the twenty study pairs -- one of them being the
previously discussed example from \citet{Goetz2011} -- are able to establish
equivalence at the 5\% level in the sense that both the original and the
replication 90\% confidence intervals fall within the equivalence range or both
TOST $p$-values are smaller than $5\%$. For the remaining 16 studies, for
example, the previously discussed example from \citet{Dawson2011}, the situation
remains inconclusive and there is neither evidence for the absence nor for the
presence of the effect.
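
To make the computation concrete, the following R sketch shows how the TOST
$p$-value could be obtained for a single study under the normal approximation
used above; the estimate \texttt{smd} and standard error \texttt{se} are
hypothetical placeholder values, while the margin $\Delta = 1$ and the 5\% level
correspond to the analysis in Figure~\ref{fig:nullfindings}.
<< "tost-sketch", eval = FALSE >>=
## illustration only: TOST p-value for one study (hypothetical numbers)
smd <- 0.1   # hypothetical standardized mean difference estimate
se <- 0.4    # hypothetical standard error
margin <- 1  # equivalence margin Delta as in the analysis above

## one-sided p-values for H0: SMD >= +margin and H0: SMD <= -margin
pUpper <- pnorm(q = smd, mean = margin, sd = se, lower.tail = TRUE)
pLower <- pnorm(q = smd, mean = -margin, sd = se, lower.tail = FALSE)

## the TOST p-value is the maximum of the two one-sided p-values;
## equivalence is established at the 5% level if pTOST < 0.05
## (equivalently, if the 90% CI lies within [-margin, +margin])
pTOST <- max(pUpper, pLower)
pTOST
@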
\subsection{Bayesian hypothesis testing}
@@ -664,31 +670,57 @@ different from one indicates absence of evidence for either hypothesis
When the observed data are dichotomized into positive (\mbox{$p < 5\%$}) or null
results (\mbox{$p > 5\%$}), the Bayes factor based on a null result is the
probability of observing \mbox{$p > 5\%$} when the effect is indeed absent
(which is $95\%$) divided by the probability of observing $p > 5\%$ when the
effect is indeed present (which is one minus the power of the study). For
example, if the power is 90\%, we have
\mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten
times more evidence for the absence of the effect than for its presence. On the
other hand, if the power is only 50\%, we have
\mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only
slightly more evidence for the absence of the effect. This example also
highlights the main challenge with Bayes factors -- the specification of the
alternative hypothesis $H_{1}$. The assumed effect under $H_{1}$ is directly
related to the power of the study, and researchers who assume different effects
under $H_{1}$ will end up with different Bayes factors. Instead of specifying a
single effect, one therefore typically specifies a ``prior distribution'' of
plausible effects. Importantly, the prior distribution, like the equivalence
margin, should be determined by researchers with subject knowledge and before
the data are observed.
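
As a simple illustration of this calculation, the following R sketch (not part
of the actual analysis) computes the Bayes factor based only on the dichotomized
result \mbox{$p > 5\%$} as a function of the assumed power.
<< "bf-dichotomized-sketch", eval = FALSE >>=
## illustration only: Bayes factor based on a dichotomized null result (p > 5%)
## BF01 = Pr(p > 0.05 | H0) / Pr(p > 0.05 | H1) = 0.95 / (1 - power)
BF01dichotomized <- function(power, alpha = 0.05) {
    (1 - alpha) / (1 - power)
}
BF01dichotomized(power = 0.9) # about 9.5, some evidence for H0
BF01dichotomized(power = 0.5) # 1.9, hardly any evidence either way
@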
In practice, the observed data should not be dichotomized into positive or null
results, as this leads to a loss of information. Therefore, to compute the Bayes
factors for the RPCB null results, we used the observed effect estimates as the
data and assumed a normal sampling distribution for them, as in a meta-analysis.
The Bayes factors $\BF_{01}$ shown in Figure~\ref{fig:nullfindings} then
quantify the evidence for the null hypothesis of no effect
($H_{0} \colon \text{SMD} = 0$) against the alternative hypothesis that there is
an effect ($H_{1} \colon \text{SMD} \neq 0$) using a ``unit-information'' normal
prior distribution \citep{Kass1995b} for the effect size under the alternative
$H_{1}$. There are several more advanced prior distributions that could be used
here, and they should ideally be specified for each effect individually based on
domain knowledge. The normal unit-information prior (with a standard deviation
of 2 for SMDs) is only a reasonable default choice, as it implies that small to
large effects are plausible under the alternative.
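
For illustration, the following R sketch shows how such a Bayes factor can be
computed in closed form under the assumed normal likelihood; the estimate
\texttt{smd} and standard error \texttt{se} are again hypothetical placeholders,
while the prior standard deviation of 2 corresponds to the unit-information
prior for SMDs mentioned above.
<< "bf-unit-information-sketch", eval = FALSE >>=
## illustration only: Bayes factor contrasting H0: SMD = 0 against H1: SMD != 0
## with a zero-centered normal prior (sd = priorsd) under H1 and a normal
## likelihood for the effect estimate; the marginal likelihood under H1 is
## then normal with mean zero and variance se^2 + priorsd^2
BF01 <- function(estimate, se, priorsd = 2) {
    dnorm(x = estimate, mean = 0, sd = se) /
        dnorm(x = estimate, mean = 0, sd = sqrt(se^2 + priorsd^2))
}
BF01(estimate = 0.1, se = 0.4) # hypothetical estimate and standard error
@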
We see that in most cases
there is no substantial evidence for either the absence or the presence of an
effect, as with the equivalence tests. The Bayes factors for the two previously
discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent
with our intuitions -- there is indeed some evidence for the absence of an
effect in \citet{Goetz2011}, while there is even slightly more evidence for the
presence of an effect in \citet{Dawson2011}, though the Bayes factor is very
close to one due to the small sample sizes. If we use a lenient Bayes factor
threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect,
only one of the twenty study pairs meets this criterion in both the original and
replication study. There is one interesting case -- the rightmost plot in the
fourth row (48, 2, 4, 1) -- where the Bayes factor is qualitatively different
from the equivalence test, revealing a fundamental difference between the two
approaches. The Bayes factor is concerned with testing whether the effect is
\emph{exactly zero}, whereas the equivalence test is concerned with whether the
effect is within an \emph{interval around zero}. Due to the very large sample
size in this replication study, the data are incompatible with an exactly zero
effect, but compatible with effects within the equivalence range. Apart from
this example, however, the approaches lead to the same qualitative conclusion --
most RPCB null results are highly ambiguous.
% regarding the presence or absence of an effect.
@@ -699,30 +731,49 @@ We showed that in most of the RPCB studies with ``null results'' (those with
$p > 5\%$), neither the original nor the replication study provided conclusive
evidence for the presence or absence of an effect. It seems logically
questionable to declare an inconclusive replication of an inconclusive original
study as a replication success. While it is important to replicate original
studies with null results, our analysis highlights that they should be analyzed
and interpreted appropriately.

While the equivalence test and Bayes factor approaches are two principled
methods for analyzing original and replication studies with null results, they
are not the only possible methods for doing so. Other methods specifically
tailored to the replication setting, such as the reverse-Bayes approach of
\citet{Micheloud2022}, may lead to more appropriate inferences as they also take
into account the compatibility of the effect estimates from original and
replication studies. In addition, there are various more advanced Bayesian
hypothesis testing procedures specifically designed to quantify the evidence for
the absence of an effect \citep{Johnson2010, Morey2011} that could potentially
improve the efficiency of the Bayes factor approach. Finally, the design of
replication studies should align with the planned analysis \citep{Anderson2017,
Anderson2022, Micheloud2020, Pawel2022c}. If the goal of a study is to
find evidence for the absence of an effect, the replication sample size should
also be determined so that the study has adequate power to make conclusive
inferences regarding the absence of the effect.
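
As a sketch of what such a design calculation could look like, the following R
code approximates the power of the TOST procedure in a replication with $n$
observations per group, assuming a true effect of zero, a normal likelihood for
the SMD with standard error $\sqrt{2/n}$, and the margin and level used in our
analysis; an actual design calculation would of course depend on the chosen
margin, analysis, and assumptions about the true effect.
<< "equivalence-power-sketch", eval = FALSE >>=
## illustration only: approximate power of the TOST procedure in a replication
## with n observations per group, assuming a true SMD of zero, a normal
## approximation with standard error sqrt(2/n), margin Delta, and level alpha
powerTOST <- function(n, margin = 1, alpha = 0.05) {
    se <- sqrt(2/n)
    pmax(2*pnorm(margin/se - qnorm(p = 1 - alpha)) - 1, 0)
}
powerTOST(n = c(5, 10, 20, 50)) # power increases with the sample size
## smallest n per group with at least 80% power
min(which(powerTOST(n = 1:100) >= 0.8))
@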
\section{Acknowledgements}
We thank the contributors of the RPCB for their tremendous efforts and for
making their data publicly available. We thank Maya Mathur for helpful advice
with the data preparation. This work was supported by the Swiss National Science
Foundation (grants \#189295 and \#XXXXXX).
\section{Conflict of interest}
We declare no conflict of interest.
\section{Data and software}
The data from the RPCB were obtained by downloading the files from
\url{https://github.com/mayamathur/rpcb} (commit a1e0c63) and executing the R
script \texttt{Code/data\_prep.R} with line 632 commented out so that original
studies with null findings are also included. This produced the file
\texttt{prepped\_outcome\_level\_data.csv}, which was used for the subsequent
analyses. The effect estimates and standard errors on the SMD scale provided in
this data set differ in some cases from those in the data set available at
\url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in
\citet{Errington2021}. We used this particular version of the data set because
it was recommended to us by the RPCB statistician (Maya Mathur) upon request.
The code and data to reproduce our analyses are openly available at
\url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}. A snapshot of the repository
...