Commit 802d7992 authored by SamCH93

more Bayes stuff, conclusions

parent e7c89bd8
@article{Morey2011,
doi = {10.1037/a0024377},
year = {2011},
volume = {16},
number = {4},
pages = {406--419},
author = {Richard D. Morey and Jeffrey N. Rouder},
title = {Bayes factor approaches for testing interval null hypotheses},
journal = {Psychological Methods}
}
@article{Micheloud2022,
doi = {10.48550/ARXIV.2204.06960},
author = {Micheloud, Charlotte and Held, Leonhard},
title = {The replication of non-inferiority and equivalence studies},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license},
note = {arXiv preprint}
}
@article{Schuirmann1987,
doi = {10.1007/bf01068419},
year = {1987},
...
@@ -421,7 +421,7 @@ $p$-values are greater than 5\% in both the original and the replication study),
but intuition would suggest that these two pairs are very much different.
\begin{figure}[ht]
<< "2-example-studies", fig.height = 3.25 >>=
## some evidence for absence of effect (when a really generous margin Delta = 1
## or a lenient BF = 3 threshold is used)
## https://doi.org/10.7554/eLife.45120 (the replication effect reported in the data set could not be located in this publication)
@@ -446,17 +446,17 @@ ggplot(data = plotDF1) +
                      ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr,
                      ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), fatten = 3) +
    geom_text(aes(x = 1.05, y = 2.5,
                  label = paste("italic(n) ==", no)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 2.5,
                  label = paste("italic(n) ==", nr)), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 1.05, y = 3,
                  label = paste("italic(p) ==", biostatUZH::formatPval(po))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    geom_text(aes(x = 2.05, y = 3,
                  label = paste("italic(p) ==", biostatUZH::formatPval(pr))), col = "darkblue",
              parse = TRUE, size = 3.8, hjust = 0) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
@@ -503,7 +503,7 @@ discuss how the two can be quantitatively distinguished.
\begin{figure}[!htb]
<< "plot-null-findings-rpcb", fig.height = 8.25 >>=
margin <- 1
conflevel <- 0.9
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = margin, sd = so,
@@ -514,6 +514,10 @@ rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = margin, sd = sr,
                                              lower.tail = TRUE),
                                        pnorm(q = smdr, mean = -margin, sd = sr,
                                              lower.tail = FALSE)))
## highlight the studies from Goetz and Dawson
rpcbNull$id <- ifelse(rpcbNull$id == "(20, 1, 1, 1)", "(20, 1, 1, 1) - Goetz et al. (2011)", rpcbNull$id)
rpcbNull$id <- ifelse(rpcbNull$id == "(29, 2, 2, 1)", "(29, 2, 2, 1) - Dawson et al. (2011)", rpcbNull$id)
estypes <- c("r", "Cohen's dz", "Cohen's d")
ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
    facet_wrap(~ id ## + effectType
@@ -538,26 +542,26 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
label = paste("italic(p)['TOST']", label = paste("italic(p)['TOST']",
ifelse(ptosto < 0.0001, "", "=="), ifelse(ptosto < 0.0001, "", "=="),
biostatUZH::formatPval(ptosto))), biostatUZH::formatPval(ptosto))),
col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
vjust = 0.5) + vjust = 0.5) +
geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("italic(p)['TOST']", label = paste("italic(p)['TOST']",
ifelse(ptostr < 0.0001, "", "=="), ifelse(ptostr < 0.0001, "", "=="),
biostatUZH::formatPval(ptostr))), biostatUZH::formatPval(ptostr))),
col = "darkblue", parse = TRUE, size = 2.5, hjust = 0, col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
vjust = 0.5) + vjust = 0.5) +
geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="), label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="),
BForigformat)), col = "darkblue", BForigformat)), col = "darkblue",
parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="), label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="),
BFrepformat)), col = "darkblue", BFrepformat)), col = "darkblue",
parse = TRUE, size = 2.5, vjust = 1.7, hjust = 0,) + parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
theme_bw() + theme_bw() +
theme(panel.grid.minor = element_blank(), theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(), panel.grid.major = element_blank(),
strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5), strip.text = element_text(size = 6.4, margin = margin(3), vjust = 2),
# panel.margin = unit(-1, "lines"), # panel.margin = unit(-1, "lines"),
strip.background = element_rect(fill = alpha("tan", .4)), strip.background = element_rect(fill = alpha("tan", .4)),
axis.text = element_text(size = 8)) axis.text = element_text(size = 8))
@@ -567,15 +571,17 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
(those with two-sided $p$-value $p_{o} > 0.05$) and their replication studies
from the Reproducibility Project: Cancer Biology \citep{Errington2021}. The
identifier above each plot indicates (Original paper number, Experiment
number, Effect number, Internal replication number). The two examples
from Figure~\ref{fig:2examples} are indicated
in the plot titles. The dashed grey line
depicts the value of no effect ($\text{SMD} = 0$), whereas the dotted red lines
depict the equivalence range with margin $\Delta = \Sexpr{margin}$. The
$p$-values $p_{\text{TOST}}$ are the maximum of the two one-sided $p$-values
for the effect being smaller than $+\Delta$ or greater than $-\Delta$,
respectively. The Bayes factors $\BF_{01}$ quantify evidence for the null
hypothesis $H_{0} \colon \text{SMD} = 0$ against the alternative
$H_{1} \colon \text{SMD} \neq 0$ with a normal unit-information prior assigned
to the SMD under $H_{1}$.
% Additionally, the
% original effect size type is indicated, while all effect sizes were
% transformed to the SMD scale.
@@ -629,14 +635,14 @@ research; we chose it primarily for illustrative purposes and because effect
sizes in preclinical research are typically much larger than in clinical
research. In practice, the margin should be determined on a case-by-case basis
by researchers who are familiar with the subject matter. However, even with this
generous margin, only four of the twenty study pairs -- one of them being the
previously discussed example from \citet{Goetz2011} -- are able to establish
equivalence at the 5\% level in the sense that both the original and the
replication 90\% confidence intervals fall within the equivalence range or both
TOST $p$-values are smaller than $5\%$. For the remaining 16 studies, for
example, the previously discussed example from \citet{Dawson2011}, the situation
remains inconclusive and there is neither evidence for the absence nor for the
presence of the effect.
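
To make the computation concrete, the following R sketch shows how the TOST
$p$-value could be obtained for a single study under the normal approximation
used above; the estimate \texttt{smd} and standard error \texttt{se} are
hypothetical placeholder values, while the margin $\Delta = 1$ and the 5\% level
correspond to the analysis in Figure~\ref{fig:nullfindings}.
<< "tost-sketch", eval = FALSE >>=
## illustration only: TOST p-value for one study (hypothetical numbers)
smd <- 0.1   # hypothetical standardized mean difference estimate
se <- 0.4    # hypothetical standard error
margin <- 1  # equivalence margin Delta as in the analysis above

## one-sided p-values for H0: SMD >= +margin and H0: SMD <= -margin
pUpper <- pnorm(q = smd, mean = margin, sd = se, lower.tail = TRUE)
pLower <- pnorm(q = smd, mean = -margin, sd = se, lower.tail = FALSE)

## the TOST p-value is the maximum of the two one-sided p-values;
## equivalence is established at the 5% level if pTOST < 0.05
## (equivalently, if the 90% CI lies within [-margin, +margin])
pTOST <- max(pUpper, pLower)
pTOST
@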
\subsection{Bayesian hypothesis testing}
@@ -664,31 +670,57 @@ different from one indicates absence of evidence for either hypothesis
When the observed data are dichotomized into positive (\mbox{$p < 5\%$}) or null
results (\mbox{$p > 5\%$}), the Bayes factor based on a null result is the
probability of observing \mbox{$p > 5\%$} when the effect is indeed absent
(which is $95\%$) divided by the probability of observing $p > 5\%$ when the
effect is indeed present (which is one minus the power of the study). For
example, if the power is 90\%, we have
\mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten
times more evidence for the absence of the effect than for its presence. On the
other hand, if the power is only 50\%, we have
\mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only
slightly more evidence for the absence of the effect. This example also
highlights the main challenge with Bayes factors -- the specification of the
alternative hypothesis $H_{1}$. The assumed effect under $H_{1}$ is directly
related to the power of the study, and researchers who assume different effects
under $H_{1}$ will end up with different Bayes factors. Instead of specifying a
single effect, one therefore typically specifies a ``prior distribution'' of
plausible effects. Importantly, the prior distribution, like the equivalence
margin, should be determined by researchers with subject knowledge and before
the data are observed.
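
As a simple illustration of this calculation, the following R sketch (not part
of the actual analysis) computes the Bayes factor based only on the dichotomized
result \mbox{$p > 5\%$} as a function of the assumed power.
<< "bf-dichotomized-sketch", eval = FALSE >>=
## illustration only: Bayes factor based on a dichotomized null result (p > 5%)
## BF01 = Pr(p > 0.05 | H0) / Pr(p > 0.05 | H1) = 0.95 / (1 - power)
BF01dichotomized <- function(power, alpha = 0.05) {
    (1 - alpha) / (1 - power)
}
BF01dichotomized(power = 0.9) # about 9.5, some evidence for H0
BF01dichotomized(power = 0.5) # 1.9, hardly any evidence either way
@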
In practice, the observed data should not be dichotomized into positive or null
results, as this leads to a loss of information. Therefore, to compute the Bayes
factors for the RPCB null results, we used the observed effect estimates as the
data and assumed a normal sampling distribution for them, as in a meta-analysis.
The Bayes factors $\BF_{01}$ shown in Figure~\ref{fig:nullfindings} then
quantify the evidence for the null hypothesis of no effect
($H_{0} \colon \text{SMD} = 0$) against the alternative hypothesis that there is
an effect ($H_{1} \colon \text{SMD} \neq 0$) using a ``unit-information'' normal
prior distribution \citep{Kass1995b} for the effect size under the alternative
$H_{1}$. There are several more advanced prior distributions that could be used
here, and they should ideally be specified for each effect individually based on
domain knowledge. The normal unit-information prior (with a standard deviation
of 2 for SMDs) is only a reasonable default choice, as it implies that small to
large effects are plausible under the alternative.
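
For illustration, the following R sketch shows how such a Bayes factor can be
computed in closed form under the assumed normal likelihood; the estimate
\texttt{smd} and standard error \texttt{se} are again hypothetical placeholders,
while the prior standard deviation of 2 corresponds to the unit-information
prior for SMDs mentioned above.
<< "bf-unit-information-sketch", eval = FALSE >>=
## illustration only: Bayes factor contrasting H0: SMD = 0 against H1: SMD != 0
## with a zero-centered normal prior (sd = priorsd) under H1 and a normal
## likelihood for the effect estimate; the marginal likelihood under H1 is
## then normal with mean zero and variance se^2 + priorsd^2
BF01 <- function(estimate, se, priorsd = 2) {
    dnorm(x = estimate, mean = 0, sd = se) /
        dnorm(x = estimate, mean = 0, sd = sqrt(se^2 + priorsd^2))
}
BF01(estimate = 0.1, se = 0.4) # hypothetical estimate and standard error
@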
We see that in most cases
there is no substantial evidence for either the absence or the presence of an
effect, as with the equivalence tests. The Bayes factors for the two previously
discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent
with our intuitions -- there is indeed some evidence for the absence of an
effect in \citet{Goetz2011}, while there is even slightly more evidence for the
presence of an effect in \citet{Dawson2011}, though the Bayes factor is very
close to one due to the small sample sizes. If we use a lenient Bayes factor
threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect,
only one of the twenty study pairs meets this criterion in both the original and
replication study. There is one interesting case -- the rightmost plot in the
fourth row (48, 2, 4, 1) -- where the Bayes factor is qualitatively different
from the equivalence test, revealing a fundamental difference between the two
approaches. The Bayes factor is concerned with testing whether the effect is
\emph{exactly zero}, whereas the equivalence test is concerned with whether the
effect is within an \emph{interval around zero}. Due to the very large sample
size in this replication study, the data are incompatible with an exactly zero
effect, but compatible with effects within the equivalence range. Apart from
this example, however, the approaches lead to the same qualitative conclusion --
most RPCB null results are highly ambiguous.
% regarding the presence or absence of an effect.
@@ -699,30 +731,49 @@ We showed that in most of the RPCB studies with ``null results'' (those with
$p > 5\%$), neither the original nor the replication study provided conclusive
evidence for the presence or absence of an effect. It seems logically
questionable to declare an inconclusive replication of an inconclusive original
study as a replication success. While it is important to replicate original
studies with null results, our analysis highlights that they should be analyzed
and interpreted appropriately.

While the equivalence test and Bayes factor approaches are two principled
methods for analyzing original and replication studies with null results, they
are not the only possible methods for doing so. Other methods specifically
tailored to the replication setting, such as the reverse-Bayes approach of
\citet{Micheloud2022}, may lead to more appropriate inferences as they also take
into account the compatibility of the effect estimates from original and
replication studies. In addition, there are various more advanced Bayesian
hypothesis testing procedures specifically designed to quantify the evidence for
the absence of an effect \citep{Johnson2010, Morey2011} that could potentially
improve the efficiency of the Bayes factor approach. Finally, the design of
replication studies should align with the planned analysis \citep{Anderson2017,
Anderson2022, Micheloud2020, Pawel2022c}. If the goal of a study is to
find evidence for the absence of an effect, the replication sample size should
also be determined so that the study has adequate power to make conclusive
inferences regarding the absence of the effect.
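
As a sketch of what such a design calculation could look like, the following R
code approximates the power of the TOST procedure in a replication with $n$
observations per group, assuming a true effect of zero, a normal likelihood for
the SMD with standard error $\sqrt{2/n}$, and the margin and level used in our
analysis; an actual design calculation would of course depend on the chosen
margin, analysis, and assumptions about the true effect.
<< "equivalence-power-sketch", eval = FALSE >>=
## illustration only: approximate power of the TOST procedure in a replication
## with n observations per group, assuming a true SMD of zero, a normal
## approximation with standard error sqrt(2/n), margin Delta, and level alpha
powerTOST <- function(n, margin = 1, alpha = 0.05) {
    se <- sqrt(2/n)
    pmax(2*pnorm(margin/se - qnorm(p = 1 - alpha)) - 1, 0)
}
powerTOST(n = c(5, 10, 20, 50)) # power increases with the sample size
## smallest n per group with at least 80% power
min(which(powerTOST(n = 1:100) >= 0.8))
@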
\section{Acknowledgements}
We thank the contributors of the RPCB for their tremendous efforts and for
making their data publicly available. We thank Maya Mathur for helpful advice
with the data preparation. This work was supported by the Swiss National Science
Foundation (grants \#189295 and \#XXXXXX).
\section{Conflict of interest}
We declare no conflict of interest.
\section{Data and software}
The data from the RPCB were obtained by downloading the files from
\url{https://github.com/mayamathur/rpcb} (commit a1e0c63) and executing the R
script \texttt{Code/data\_prep.R} with line 632 commented out so that original
studies with null findings are also included. This produced the file
\texttt{prepped\_outcome\_level\_data.csv}, which was used for the subsequent
analyses. The effect estimates and standard errors on the SMD scale provided in
this data set differ in some cases from those in the data set available at
\url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in
\citet{Errington2021}. We used this particular version of the data set because
it was recommended to us by the RPCB statistician (Maya Mathur) upon request.
The code and data to reproduce our analyses are openly available at
\url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}. A snapshot of the repository
...