diff --git a/bibliography.bib b/bibliography.bib index 17c7372ea75e75b1f108672de5163d5b99aed4c0..63541a7145df1a28adfca5e0b310d4dbd2ab4f01 100644 --- a/bibliography.bib +++ b/bibliography.bib @@ -1,3 +1,24 @@ +@article{Morey2011, + doi = {10.1037/a0024377}, + year = {2011}, + volume = {16}, + number = {4}, + pages = {406--419}, + author = {Richard D. Morey and Jeffrey N. Rouder}, + title = {Bayes factor approaches for testing interval null hypotheses}, + journal = {Psychological Methods} +} + +@article{Micheloud2022, + doi = {10.48550/ARXIV.2204.06960}, + author = {Micheloud, Charlotte and Held, Leonhard}, + title = {The replication of non-inferiority and equivalence studies}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license}, + note = {arXiv preprint} +} + @article{Schuirmann1987, doi = {10.1007/bf01068419}, year = {1987}, diff --git a/rsAbsence.Rnw b/rsAbsence.Rnw index 19964a7de2148e04615389075837cc4a88db111b..71d96c3bba70a29c1f1f70072790f70e186fa1fb 100755 --- a/rsAbsence.Rnw +++ b/rsAbsence.Rnw @@ -421,7 +421,7 @@ $p$-values are greater than 5\% in both the original and the replication study), but intuition would suggest that these two pairs are very much different. 
\begin{figure}[ht] -<< "2-example-studies", fig.height = 3.5 >>= +<< "2-example-studies", fig.height = 3.25 >>= ## some evidence for absence of effect (when a really genereous margin Delta = 1 ## of a lenient BF = 3 threshold) ## https://doi.org/10.7554/eLife.45120 I can't find the replication effect like reported in the data set :( @@ -446,17 +446,17 @@ ggplot(data = plotDF1) + ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr, ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), fatten = 3) + geom_text(aes(x = 1.05, y = 2.5, - label = paste("italic(n[o]) ==", no)), col = "darkblue", - parse = TRUE, size = 4, hjust = 0) + + label = paste("italic(n) ==", no)), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + geom_text(aes(x = 2.05, y = 2.5, - label = paste("italic(n[r]) ==", nr)), col = "darkblue", - parse = TRUE, size = 4, hjust = 0) + + label = paste("italic(n) ==", nr)), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + geom_text(aes(x = 1.05, y = 3, - label = paste("italic(p[o]) ==", biostatUZH::formatPval(po))), col = "darkblue", - parse = TRUE, size = 4, hjust = 0) + + label = paste("italic(p) ==", biostatUZH::formatPval(po))), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + geom_text(aes(x = 2.05, y = 3, - label = paste("italic(p[r]) ==", biostatUZH::formatPval(pr))), col = "darkblue", - parse = TRUE, size = 4, hjust = 0) + + label = paste("italic(p) ==", biostatUZH::formatPval(pr))), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + labs(x = "", y = "Standardized mean difference (SMD)") + theme_bw() + theme(panel.grid.minor = element_blank(), @@ -503,7 +503,7 @@ discuss how the two can be quantitatively distinguished. 
\begin{figure}[!htb] -<< "plot-null-findings-rpcb", fig.height = 8.2 >>= +<< "plot-null-findings-rpcb", fig.height = 8.25 >>= margin <- 1 conflevel <- 0.9 rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = margin, sd = so, @@ -514,6 +514,10 @@ rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = margin, sd = sr, lower.tail = TRUE), pnorm(q = smdr, mean = -margin, sd = sr, lower.tail = FALSE))) +## highlight the studies from Goetz and Dawson +rpcbNull$id <- ifelse(rpcbNull$id == "(20, 1, 1, 1)", "(20, 1, 1, 1) - Goetz et al. (2011)", rpcbNull$id) +rpcbNull$id <- ifelse(rpcbNull$id == "(29, 2, 2, 1)", "(29, 2, 2, 1) - Dawson et al. (2011)", rpcbNull$id) + estypes <- c("r", "Cohen's dz", "Cohen's d") ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) + facet_wrap(~ id ## + effectType @@ -538,26 +542,26 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) + label = paste("italic(p)['TOST']", ifelse(ptosto < 0.0001, "", "=="), biostatUZH::formatPval(ptosto))), - col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, + col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 0.5) + geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("italic(p)['TOST']", ifelse(ptostr < 0.0001, "", "=="), biostatUZH::formatPval(ptostr))), - col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, + col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 0.5) + geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="), BForigformat)), col = "darkblue", - parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + + parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) + geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="), BFrepformat)), col = "darkblue", - parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + + parse = TRUE, size 
= 2.3, vjust = 1.7, hjust = 0,) + theme_bw() + theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), - strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5), + strip.text = element_text(size = 6.4, margin = margin(3), vjust = 2), # panel.margin = unit(-1, "lines"), strip.background = element_rect(fill = alpha("tan", .4)), axis.text = element_text(size = 8)) @@ -567,15 +571,17 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) + (those with two-sided $p$-value $p_{o} > 0.05$) and their replication studies from the Reproducibility Project: Cancer Biology \citep{Errington2021}. The identifier above each plot indicates (Original paper number, Experiment - number, Effect number, Internal replication number). The dashed grey line + number, Effect number, Internal replication number). The two examples + from Figure~\ref{fig:2examples} are indicated + in the plot titles. The dashed grey line depicts the value of no effect ($\text{SMD} = 0$) whereas the dotted red lines depict the equivalence range with margin $\Delta = \Sexpr{margin}$. The $p$-values $p_{\text{TOST}}$ are the maximum of the two one-sided $p$-values for the effect being smaller or greater than $+\Delta$ or $-\Delta$, - respectively. The Bayes factors $\BF_{01}$ quantify the evidence for the null - hypothesis $H_{0} \colon \text{SMD} = 0$ to the alternative - $H_{1} \colon \text{SMD} \neq 0$ with unit-information prior assigned to the - SMD under the alternative $H_{1}$. + respectively. The Bayes factors $\BF_{01}$ quantify evidence for the null + hypothesis $H_{0} \colon \text{SMD} = 0$ against the alternative + $H_{1} \colon \text{SMD} \neq 0$ with normal unit-information prior assigned to the + SMD under $H_{1}$. % Additionally, the % original effect size type is indicated, while all effect sizes were % transformed to the SMD scale. 
@@ -629,14 +635,14 @@ research; we chose it primarily for illustrative purposes and because effect sizes in preclinical research are typically much larger than in clinical research. In practice, the margin should be determined on a case-by-case basis by researchers who are familiar with the subject matter. However, even with this -generous margin, only two of the study pairs -- one of them being the previously -discussed study by \citet{Goetz2011} with identifier (20, 1, 1, 1) -- are able -to establish equivalence at the 5\% level in the sense that both the original -and the replication 90\% confidence interval fall within the equivalence range -or both TOST $p$-value are smaller than $5\%$. For the remaining 18 studies, for -example, the previously discussed study by \citet{Dawson2011} with identifier -(29, 2, 2, 1), the situation remains inconclusive and there is neither evidence -for the absence or presence of the effect. +generous margin, only four of the twenty study pairs -- one of them being the +previously discussed example from \citet{Goetz2011} -- are able to establish +equivalence at the 5\% level in the sense that both the original and the +replication 90\% confidence interval fall within the equivalence range or both +TOST $p$-values are smaller than $5\%$. For the remaining 16 studies, for +example, the previously discussed example from \citet{Dawson2011}, the situation +remains inconclusive and there is neither evidence for the absence nor presence +of the effect. 
\subsection{Bayesian hypothesis testing} @@ -664,31 +670,57 @@ different from one indicates absence of evidence for either hypothesis When the observed data are dichotomized into positive (\mbox{$p < 5\%$}) or null results (\mbox{$p > 5\%$}), the Bayes factor based on a null result is the probability of observing \mbox{$p > 5\%$} when the effect is indeed absent -($95\%$) divided by the probability of observing $p > 5\%$ when the effect is -indeed present (which is one minus the power of the study). For example, if the -power is 90\%, we have +(which is $95\%$) divided by the probability of observing $p > 5\%$ when the +effect is indeed present (which is one minus the power of the study). For +example, if the power is 90\%, we have \mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten times more evidence for the absence of the effect than for its presence. On the other hand, if the power is only 50\%, we have -\mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating hardly any -evidence for either absence or presence of the effect. This example also +\mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only +slightly more evidence for the absence of the effect. This example also highlights the main challenge with Bayes factors -- the specification of the -alternative hypothesis $H_{1}$. The assumed effect size under $H_{1}$ is -directly related to the power of the study, and researchers who assume different -effect sizes under $H_{1}$ will end up with different Bayes factors. Instead of -specifying a single effect, one therefore typically specifies a ``prior -distribution'' of plausible effects. Importantly, the prior distribution, like -the equivalence margin, should be determined by researchers with subject -knowledge and before the data are observed. Furthermore, when computing Bayes -factors, the observed data should not be dichotomized into positive or null -results, as this leads to a loss of information. 
- - -Figure~\ref{fig:nullfindings} shows also the Bayes factors contrasting the null -hypothesis of no effect ($H_{0} \colon \text{SMD} = 0$) to the alternative that -there is an effect ($H_{0} \colon \text{SMD} \neq 0$) using a -``unit-information'' normal prior distribution \citep{Kass1995b} for the SMD -under the alternative $H_{1}$ +alternative hypothesis $H_{1}$. The assumed effect under $H_{1}$ is directly +related to the power of the study, and researchers who assume different effects +under $H_{1}$ will end up with different Bayes factors. Instead of specifying a +single effect, one therefore typically specifies a ``prior distribution'' of +plausible effects. Importantly, the prior distribution, like the equivalence +margin, should be determined by researchers with subject knowledge and before +the data are observed. + +In practice, the observed data should not be dichotomized into positive or null +results, as this leads to a loss of information. Therefore, to compute the Bayes +factors for the RPCB null results, we used the observed effect estimates as the +data and assumed a normal sampling distribution for them, as in a meta-analysis. +The Bayes factors $\BF_{01}$ shown in Figure~\ref{fig:nullfindings} then +quantify the evidence for the null hypothesis of no effect +($H_{0} \colon \text{SMD} = 0$) against the alternative hypothesis that there is +an effect ($H_{1} \colon \text{SMD} \neq 0$) using a ``unit-information'' normal +prior distribution \citep{Kass1995b} for the effect size under the alternative +$H_{1}$. There are several more advanced prior distributions that could be used +here, and they should ideally be specified for each effect individually based on +domain knowledge. The normal unit-information prior (with a standard deviation +of 2 for SMDs) is only a reasonable default choice, as it implies that small to +large effects are plausible under the alternative. 
We see that in most cases +there is no substantial evidence for either the absence or the presence of an +effect, as with the equivalence tests. The Bayes factors for the two previously +discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent +with our intuitions -- there is indeed some evidence for the absence of an +effect in \citet{Goetz2011}, while there is even slightly more evidence for the +presence of an effect in \citet{Dawson2011}, though the Bayes factor is very +close to one due to the small sample sizes. If we use a lenient Bayes factor +threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect, +only one of the twenty study pairs meets this criterion in both the original and +replication study. There is one interesting case -- the rightmost plot in the +fourth row (48, 2, 4, 1) -- where the Bayes factor is qualitatively different +from the equivalence test, revealing a fundamental difference between the two +approaches. The Bayes factor is concerned with testing whether the effect is +\emph{exactly zero}, whereas the equivalence test is concerned with whether the +effect is within an \emph{interval around zero}. Due to the very large sample +size in this replication study, the data are incompatible with an exactly zero +effect, but compatible with effects within the equivalence range. Apart from +this example, however, the approaches lead to the same qualitative conclusion -- +most RPCB null results are highly ambiguous. +% regarding the presence or absence of an effect. @@ -699,30 +731,49 @@ We showed that in most of the RPCB studies with ``null results'' (those with $p > 5\%$), neither the original nor the replication study provided conclusive evidence for the presence or absence of an effect. It seems logically questionable to declare an inconclusive replication of an inconclusive original 
While it is important to replicate original studies -with null results, our analysis highlights that they should be analyzed, -designed, and interpreted appropriately. - - - -\section{Acknowledgements and conflicts of interest} +study as a replication success. While it is important to replicate original +studies with null results, our analysis highlights that they should be analyzed +and interpreted appropriately. + +While the equivalence test and Bayes factor approaches are two principled +methods for analyzing original and replication studies with null results, they +are not the only possible methods for doing so. Other methods specifically +tailored to the replication setting, such as the reverse-Bayes approach of +\citet{Micheloud2022}, may lead to more appropriate inferences as they also take +into account the compatibility of the effect estimates from original and +replication studies. In addition, there are various more advanced Bayesian +hypothesis testing procedures specifically designed to quantify the evidence for +the absence of an effect \citep{Johnson2010, Morey2011} that could potentially +improve the efficiency of the Bayes factor approach. Finally, the design of +replication studies should align with the planned analysis \citep{Anderson2017, + Anderson2022, Micheloud2020, Pawel2022c}. Hence, if the goal of the study is to +find evidence for the absence of an effect, the replication sample size should +also be determined so that the study has adequate power to make conclusive +inferences regarding the absence of the effect. + + + +\section{Acknowledgements} We thank the contributors of the RPCB for their tremendous efforts and for -making their data publicly available. We thank Maya Mathur for assistance with -data preparation. This work was supported by the Swiss National Science +making their data publicly available. We thank Maya Mathur for helpful advice +with the data preparation. 
This work was supported by the Swiss National Science Foundation (grants \#189295 and \#XXXXXX). We declare no conflict of interest. +\section{Conflict of interest} +We declare no conflict of interest. + \section{Data and software} The data from the RPCB were obtained by downloading the files from -\url{https://github.com/mayamathur/rpcb} and executing the R script -\texttt{Code/data\_prep.R} with the line 632 commented out so that also original -studies with null finding are included. This then produced the file +\url{https://github.com/mayamathur/rpcb} commit a1e0c63 and executing the R +script \texttt{Code/data\_prep.R} with the line 632 commented out so that also +original studies with null findings are included. This then produced the file \texttt{prepped\_outcome\_level\_data.csv} which was used for the subsequent -analyses. The effect estimates and standard errors on SMD scale from this data -set differ in some cases from those in the data set available at +analyses. The effect estimates and standard errors on SMD scale provided in this +data set differ in some cases from those in the data set available at \url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in \citet{Errington2021}. We used this particular version of the data set because -it was recommended to us by the RPCB statistician (M. Mathur) upon request. +it was recommended to us by the RPCB statistician (Maya Mathur) upon request. The code and data to reproduce our analyses is openly available at \url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}. A snapshot of the repository