diff --git a/paper/rsabsence.Rnw b/paper/rsabsence.Rnw index 02faafef87cb33c4827d8ed59922077a3f757f34..fa3a92f8cf4d846d9d416698242d45dd7fe5a1fb 100755 --- a/paper/rsabsence.Rnw +++ b/paper/rsabsence.Rnw @@ -179,9 +179,20 @@ replicability rate based on significance. In contrast, the RPCB explicitly defined null results in both the original and the replication study as a criterion for ``replication success''. According to this criterion, $11/15 = \Sexpr{round(11/15*100, 0)}\%$ replications of original null effects -were successful. - -We believe that there are several logical problems with this +were successful. Four additional criteria were used to assess successful +replications of original null results: comparing original effect size with a +95\% confidence interval of the replication effect size, comparing replication +effect size with a 95\% confidence interval of the original effect size, +comparing replication effect size with a 95\% prediction interval of the +original effect size, and combining the original and replication effect sizes in +meta-analysis. The suitability of these criteria in the context of replications +of original null effects will be discussed in our conclusion. +\todo[inline]{RH: listed, but some of the significance criterion comments are valid +also for some of those criteria. Should we here say that we will discuss them in +the conclusion, or mention it here already? - maybe delete the sentence "The +suitability..."} + +We believe that there are several logical problems with the ``non-significance'' criterion. First, if the original study had low statistical power, a non-significant result is highly inconclusive and does not provide evidence for the absence of an effect. It is then unclear what exactly the goal @@ -286,33 +297,33 @@ be quantitatively distinguished. 
<< "2-example-studies", fig.height = 3 >>= ## create plot showing two example study pairs with null results ggplot(data = plotDF1) + - facet_wrap(~ label) + - geom_hline(yintercept = 0, lty = 2, alpha = 0.3) + - geom_pointrange(aes(x = "Original", y = smdo, - ymin = smdo - qnorm(p = (1 + conflevel)/2)*so, - ymax = smdo + qnorm(p = (1 + conflevel)/2)*so), fatten = 3) + - geom_pointrange(aes(x = "Replication", y = smdr, - ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr, - ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), fatten = 3) + - geom_text(aes(x = 1.05, y = 2.5, - label = paste("italic(n) ==", no)), col = "darkblue", - parse = TRUE, size = 3.8, hjust = 0) + - geom_text(aes(x = 2.05, y = 2.5, - label = paste("italic(n) ==", nr)), col = "darkblue", - parse = TRUE, size = 3.8, hjust = 0) + - geom_text(aes(x = 1.05, y = 3, - label = paste("italic(p) ==", formatPval(po))), col = "darkblue", - parse = TRUE, size = 3.8, hjust = 0) + - geom_text(aes(x = 2.05, y = 3, - label = paste("italic(p) ==", formatPval(pr))), col = "darkblue", - parse = TRUE, size = 3.8, hjust = 0) + - labs(x = "", y = "Standardized mean difference") + - theme_bw() + - theme(panel.grid.minor = element_blank(), - panel.grid.major.x = element_blank(), - strip.text = element_text(size = 12, margin = margin(4), vjust = 1.5), - strip.background = element_rect(fill = alpha("tan", 0.4)), - axis.text = element_text(size = 12)) + facet_wrap(~ label, scales = "free_x") + + geom_hline(yintercept = 0, lty = 2, alpha = 0.3) + + geom_pointrange(aes(x = paste0("Original \n", "(n=", no, ")") , y = smdo, + ymin = smdo - qnorm(p = (1 + conflevel)/2)*so, + ymax = smdo + qnorm(p = (1 + conflevel)/2)*so), fatten = 3) + + geom_pointrange(aes(x = paste0("Replication \n", "(n=", nr, ")"), y = smdr, + ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr, + ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), fatten = 3) + + # geom_text(aes(x = 1.05, y = 2.5, + # label = paste("italic(n) ==", no)), col = "darkblue", + # parse = TRUE, 
size = 3.8, hjust = 0) + + # geom_text(aes(x = 2.05, y = 2.5, + # label = paste("italic(n) ==", nr)), col = "darkblue", + # parse = TRUE, size = 3.8, hjust = 0) + + geom_text(aes(x = 1.05, y = 2.8, + label = paste("italic(p) ==", formatPval(po))), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + + geom_text(aes(x = 2.05, y = 2.8, + label = paste("italic(p) ==", formatPval(pr))), col = "darkblue", + parse = TRUE, size = 3.8, hjust = 0) + + labs(x = "", y = "Standardized mean difference") + + theme_bw() + + theme(panel.grid.minor = element_blank(), + panel.grid.major.x = element_blank(), + strip.text = element_text(size = 12, margin = margin(4), vjust = 1.5), + strip.background = element_rect(fill = alpha("tan", 0.4)), + axis.text = element_text(size = 10)) @ \caption{\label{fig:2examples} Two examples of original and replication study pairs which meet the non-significance replication success criterion from the @@ -362,14 +373,15 @@ of the effect being greater/smaller than $+\Delta$ and $-\Delta$, are both significant at level $\alpha$ \citep{Schuirmann1987}. A quantitative measure of evidence for the absence of an effect is then given by the maximum of the two one-sided \textit{p}-values (the TOST \textit{p}-value). A reasonable -replication success criterion for null results may therefore be to require that -both the original and the replication TOST \textit{p}-values be smaller than -some level $\alpha$ (conventionally $0.05$), or, equivalently, that their -$(1-2\alpha)\times 100\%$ confidence intervals are included in the equivalence -region. In contrast to the non-significance criterion, this criterion controls -the error of falsely claiming replication success at level $\alpha^{2}$ when -there is a true effect outside the equivalence margin, thus complementing the -usual two-trials rule in drug regulation \citep[section 12.2.8]{Senn2008}. 
+criterion for replication success of original null results may therefore be to +require that both the original and the replication TOST \textit{p}-values are +smaller than some level $\alpha$ (conventionally $0.05$). Equivalently, the +criterion would require the $(1-2\alpha)\times 100\%$ confidence intervals of +the original and the replication to be included in the equivalence region. In +contrast to the non-significance criterion, this criterion controls the error of +falsely claiming replication success at level $\alpha^{2}$ when there is a true +effect outside the equivalence margin, thus complementing the usual two-trials +rule in drug regulation \citep[section 12.2.8]{Senn2008}. \begin{figure}[!htb] @@ -462,51 +474,51 @@ ggplot(data = rpcbNull) + ## ymin = smdr - qnorm(p = (1 + 0.95)/2)*sr, ## ymax = smdr + qnorm(p = (1 + 0.95)/2)*sr), size = 0.2, alpha = 0.6) + ## 90% CIs - geom_pointrange(aes(x = "Original", y = smdo, + geom_pointrange(aes(x = paste0("Original \n", "(n=", no, ")"), y = smdo, ymin = smdo - qnorm(p = (1 + conflevel)/2)*so, ymax = smdo + qnorm(p = (1 + conflevel)/2)*so), size = 0.5, fatten = 1.5) + - geom_pointrange(aes(x = "Replication", y = smdr, + geom_pointrange(aes(x = paste0("Replication \n", "(n=", nr, ")"), y = smdr, ymin = smdr - qnorm(p = (1 + conflevel)/2)*sr, ymax = smdr + qnorm(p = (1 + conflevel)/2)*sr), size = 0.5, fatten = 1.5) + annotate(geom = "ribbon", x = seq(0, 3, 0.01), ymin = -margin, ymax = margin, alpha = 0.05, fill = 2) + labs(x = "", y = "Standardized mean difference") + - geom_text(aes(x = 1.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), - label = paste("italic(n) ==", no)), col = "darkblue", - parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + - geom_text(aes(x = 2.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), - label = paste("italic(n) ==", nr)), col = "darkblue", - parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + + # geom_text(aes(x = 1.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), 
+ # label = paste("italic(n) ==", no)), col = "darkblue", + # parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + + # geom_text(aes(x = 2.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), + # label = paste("italic(n) ==", nr)), col = "darkblue", + # parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + geom_text(aes(x = 1.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("italic(p)", ifelse(po < 0.0001, "", "=="), formatPval(po))), col = "darkblue", - parse = TRUE, size = 2.3, hjust = 0) + + parse = TRUE, size = 2.3, hjust = 0, vjust = .75) + geom_text(aes(x = 2.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("italic(p)", ifelse(pr < 0.0001, "", "=="), formatPval(pr))), col = "darkblue", - parse = TRUE, size = 2.3, hjust = 0) + + parse = TRUE, size = 2.3, hjust = 0, vjust = .75) + geom_text(aes(x = 1.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("italic(p)['TOST']", ifelse(ptosto < 0.0001, "", "=="), formatPval(ptosto))), - col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 3) + + col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + geom_text(aes(x = 2.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("italic(p)['TOST']", ifelse(ptostr < 0.0001, "", "=="), formatPval(ptostr))), - col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 3) + + col = "darkblue", parse = TRUE, size = 2.3, hjust = 0, vjust = 2) + geom_text(aes(x = 1.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="), BForigformat)), col = "darkblue", parse = TRUE, - size = 2.3, hjust = 0, vjust = 4.5) + + size = 2.3, hjust = 0, vjust = 3.25) + geom_text(aes(x = 2.05, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin), label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="), BFrepformat)), col = "darkblue", parse = TRUE, - size = 2.3, hjust = 0, vjust = 4.5) + + size = 2.3, hjust = 0, vjust = 3.25) + coord_cartesian(x 
= c(1.1, 2.4)) + theme_bw() + theme(panel.grid.minor = element_blank(), @@ -561,7 +573,7 @@ BFrep2 <- rpcbNull$BFrep[ind2] Returning to the RPCB data, Figure~\ref{fig:nullfindings} shows the standardized mean difference effect estimates with $\Sexpr{round(conflevel*100, 2)}\%$ -confidence intervals for all 15 effects which were treated as quantitative null +confidence intervals for all 15 effects which were treated as null results by the RPCB.\footnote{There are four original studies with null effects for which two or three ``internal'' replication studies were conducted, leading in total to 20 replications of null effects. As in the RPCB main @@ -572,11 +584,10 @@ results by the RPCB.\footnote{There are four original studies with null effects provided by the RPCB.} Most of them showed non-significant \textit{p}-values ($p > 0.05$) in the original study. It is, however, noteworthy that two effects from the second experiment from the original paper 48 were regarded as null -results despite their statistical significance. We see that there are -$\Sexpr{nullSuccesses}$ ``successes'' according to the non-significance -criterion (with $p > 0.05$ in original and replication study) out of total -$\Sexpr{ntotal}$ null effects, as reported in Table 1 -from~\citet{Errington2021}. +results despite their statistical significance. According to the +non-significance criterion (requiring $p > 0.05$ in original and replication +study), there are $\Sexpr{nullSuccesses}$ ``successes'' out of total +$\Sexpr{ntotal}$ null effects, as reported in Table 1 from~\citet{Errington2021}. We will now apply equivalence testing to the RPCB data. The dotted red lines in Figure~\ref{fig:nullfindings} represent an equivalence range for the margin @@ -598,7 +609,6 @@ $p_{\text{TOST}} = \Sexpr{formatPval(ptostr2)}$ in the replication) as both effect estimates even lie outside the equivalence margin. - The post-hoc specification of equivalence margins is controversial. 
Ideally, the margin should be specified on a case-by-case basis in a pre-registered protocol before the studies are conducted by researchers familiar with the subject @@ -609,156 +619,35 @@ typically larger in preclinical research, it seems unrealistic to specify margins larger than $1$ on SMD scale to represent effect sizes that are absent for practical purposes. It could also be argued that the chosen margin $\Delta = \Sexpr{margin}$ is too lax compared to margins commonly used in -clinical research; for instance, in oncology, a margin of $\Delta = \log(1.3)$ -is commonly used for log odds/hazard ratios, whereas in bioequivalence studies a -margin of \mbox{$\Delta = \log(1.25) % = \Sexpr{round(log(1.25), 2)} - $} is the convention \citep[chapter 22]{Senn2008}. These margins would -translate into much more stringent margins of $\Delta -= % \log(1.3)\sqrt{3}/\pi = -\Sexpr{round(log(1.3)*sqrt(3)/pi, 2)}$ and $\Delta = % \log(1.25)\sqrt{3}/\pi = -\Sexpr{round(log(1.25)*sqrt(3)/pi, 2)}$ on the SMD scale, respectively, using -the $\text{SMD} = (\surd{3} / \pi) \log\text{OR}$ conversion \citep[p. -233]{Cooper2019}. Therefore, we report a sensitivity analysis in -Figure~\ref{fig:sensitivity}. The top plot shows the number of successful -replications as a function of the margin $\Delta$ and for different TOST -\textit{p}-value thresholds. Such an ``equivalence curve'' approach was first -proposed by \citet{Hauck1986}. We see that for realistic margins between $0$ and -$1$, the proportion of replication successes remains below $50\%$ for the -conventional $\alpha = 0.05$ level. To achieve a success rate of -$11/15 = \Sexpr{round(11/15*100, 0)}\%$, as was achieved with the +clinical research \citep[chapter 22]{Senn2008}. However, as illustrated in +Figure~\ref{fig:sensitivity} from the sensitivity analysis in our appendix, for +realistic margins between $0$ and $1$, the proportion of replication successes +remains below $50\%$ for the conventional $\alpha = 0.05$ level. 
To achieve a +success rate of $11/15 = \Sexpr{round(11/15*100, 0)}\%$, as was achieved with the non-significance criterion from the RPCB, unrealistic margins of $\Delta > 2$ -are required, highlighting the paucity of evidence provided by these studies. -Changing the success criterion to a more lenient level ($\alpha = 0.1$) or a -more stringent level ($\alpha = 0.01$) hardly changes this conclusion. - - -\begin{figure}[!htb] -<< "sensitivity", fig.height = 6.5 >>= -## compute number of successful replications as a function of the equivalence margin -marginseq <- seq(0.01, 4.5, 0.01) -alphaseq <- c(0.01, 0.05, 0.1) -sensitivityGrid <- expand.grid(m = marginseq, a = alphaseq) -equivalenceDF <- lapply(X = seq(1, nrow(sensitivityGrid)), FUN = function(i) { - m <- sensitivityGrid$m[i] - a <- sensitivityGrid$a[i] - rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = m, sd = so, - lower.tail = TRUE), - pnorm(q = smdo, mean = -m, sd = so, - lower.tail = FALSE))) - rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = m, sd = sr, - lower.tail = TRUE), - pnorm(q = smdr, mean = -m, sd = sr, - lower.tail = FALSE))) - successes <- sum(rpcbNull$ptosto <= a & rpcbNull$ptostr <= a) - data.frame(margin = m, alpha = a, - successes = successes, proportion = successes/nrow(rpcbNull)) -}) %>% - bind_rows() - -## plot number of successes as a function of margin -nmax <- nrow(rpcbNull) -bks <- c(0, 3, 6, 9, 11, 15) -labs <- paste0(bks, " (", round(bks/nmax*100, 0), "%)") -rpcbSuccesses <- 11 -marbks <- c(0, margin, 1, 2, 3, 4) -plotA <- ggplot(data = equivalenceDF, - aes(x = margin, y = successes, - color = factor(alpha, ordered = TRUE, levels = rev(alphaseq)))) + - facet_wrap(~ 'italic("p")["TOST"] <= alpha ~ "in original and replication study"', - labeller = label_parsed) + - geom_vline(xintercept = margin, lty = 3, alpha = 0.4) + - annotate(geom = "segment", x = margin + 0.25, xend = margin + 0.01, y = 2, yend = 2, - arrow = arrow(type = "closed", length = unit(0.02, 
"npc")), alpha = 0.9, - color = "darkgrey") + - annotate(geom = "text", x = margin + 0.28, y = 2, color = "darkgrey", - label = "margin used in main analysis", - size = 3, alpha = 0.9, hjust = 0) + - geom_hline(yintercept = rpcbSuccesses, lty = 2, alpha = 0.4) + - annotate(geom = "segment", x = 0.1, xend = 0.1, y = 13, yend = 11.2, - arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, - color = "darkgrey") + - annotate(geom = "text", x = -0.04, y = 13.5, color = "darkgrey", - label = "non-significance criterion successes", - size = 3, alpha = 0.9, hjust = 0) + - geom_step(alpha = 0.8, linewidth = 0.8) + - scale_y_continuous(breaks = bks, labels = labs) + - scale_x_continuous(breaks = marbks) + - coord_cartesian(xlim = c(0, max(equivalenceDF$margin))) + - labs(x = bquote("Equivalence margin" ~ Delta), - y = "Successful replications", - color = bquote("threshold" ~ alpha)) + - theme_bw() + - theme(panel.grid.minor = element_blank(), - panel.grid.major = element_blank(), - strip.background = element_rect(fill = alpha("tan", 0.4)), - strip.text = element_text(size = 12), - legend.position = c(0.85, 0.25), - plot.background = element_rect(fill = "transparent", color = NA), - legend.box.background = element_rect(fill = "transparent", colour = NA)) - -## compute number of successful replications as a function of the prior scale -priorsdseq <- seq(0, 40, 0.1) -bfThreshseq <- c(3, 6, 10) -sensitivityGrid2 <- expand.grid(s = priorsdseq, thresh = bfThreshseq) -bfDF <- lapply(X = seq(1, nrow(sensitivityGrid2)), FUN = function(i) { - priorsd <- sensitivityGrid2$s[i] - thresh <- sensitivityGrid2$thresh[i] - rpcbNull$BForig <- with(rpcbNull, BF01(estimate = smdo, se = so, unitvar = priorsd^2)) - rpcbNull$BFrep <- with(rpcbNull, BF01(estimate = smdr, se = sr, unitvar = priorsd^2)) - successes <- sum(rpcbNull$BForig >= thresh & rpcbNull$BFrep >= thresh) - data.frame(priorsd = priorsd, thresh = thresh, - successes = successes, proportion = 
successes/nrow(rpcbNull)) -}) %>% - bind_rows() - -## plot number of successes as a function of prior sd -priorbks <- c(0, 2, 10, 20, 30, 40) -plotB <- ggplot(data = bfDF, - aes(x = priorsd, y = successes, color = factor(thresh, ordered = TRUE))) + - facet_wrap(~ '"BF"["01"] >= gamma ~ "in original and replication study"', - labeller = label_parsed) + - geom_vline(xintercept = 2, lty = 3, alpha = 0.4) + - geom_hline(yintercept = rpcbSuccesses, lty = 2, alpha = 0.4) + - annotate(geom = "segment", x = 7, xend = 2 + 0.2, y = 0.5, yend = 0.5, - arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, - color = "darkgrey") + - annotate(geom = "text", x = 7.5, y = 0.5, color = "darkgrey", - label = "standard deviation used in main analysis", - size = 3, alpha = 0.9, hjust = 0) + - annotate(geom = "segment", x = 0.5, xend = 0.5, y = 13, yend = 11.2, - arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, - color = "darkgrey") + - annotate(geom = "text", x = 0.05, y = 13.5, color = "darkgrey", - label = "non-significance criterion successes", - size = 3, alpha = 0.9, hjust = 0) + - geom_step(alpha = 0.8, linewidth = 0.8) + - scale_y_continuous(breaks = bks, labels = labs, limits = c(0, nmax)) + - scale_x_continuous(breaks = priorbks) + - coord_cartesian(xlim = c(0, max(bfDF$priorsd))) + - labs(x = "Prior standard deviation", - y = "Successful replications ", - color = bquote("threshold" ~ gamma)) + - theme_bw() + - theme(panel.grid.minor = element_blank(), - panel.grid.major = element_blank(), - strip.background = element_rect(fill = alpha("tan", 0.4)), - strip.text = element_text(size = 12), - legend.position = c(0.85, 0.25), - plot.background = element_rect(fill = "transparent", color = NA), - legend.box.background = element_rect(fill = "transparent", colour = NA)) - -grid.arrange(plotA, plotB, ncol = 1) -@ - -\caption{Number of successful replications of original null results in the RPCB - as a function of the margin $\Delta$ of the 
equivalence test - ($p_{\text{TOST}} \leq \alpha$ in both studies for - $\alpha = \Sexpr{rev(alphaseq)}$) or the standard deviation of the zero-mean - normal prior distribution for the SMD effect size under the alternative - $H_{1}$ of the Bayes factor test ($\BF_{01} \geq \gamma$ in both studies for - $\gamma = \Sexpr{bfThreshseq}$).} -\label{fig:sensitivity} -\end{figure} +are required. +% ; for instance, in oncology, a margin of $\Delta = \log(1.3)$ +% is commonly used for log odds/hazard ratios, whereas in bioequivalence studies a +% margin of \mbox{$\Delta = \log(1.25) % = \Sexpr{round(log(1.25), 2)} +% $} is the convention \citep[chapter 22]{Senn2008}. These margins would +% translate into much more stringent margins of $\Delta +% = % \log(1.3)\sqrt{3}/\pi = +% \Sexpr{round(log(1.3)*sqrt(3)/pi, 2)}$ and $\Delta = % \log(1.25)\sqrt{3}/\pi = +% \Sexpr{round(log(1.25)*sqrt(3)/pi, 2)}$ on the SMD scale, respectively, using +% the $\text{SMD} = (\surd{3} / \pi) \log\text{OR}$ conversion \citep[p. +% 233]{Cooper2019}. +% Therefore, we report a sensitivity analysis in Figure~\ref{fig:sensitivity}. +% The top plot shows the number of successful +% replications as a function of the margin $\Delta$ and for different TOST +% \textit{p}-value thresholds. Such an ``equivalence curve'' approach was first +% proposed by \citet{Hauck1986}. We see that for realistic margins between $0$ and +% $1$, the proportion of replication successes remains below $50\%$ for the +% conventional $\alpha = 0.05$ level. To achieve a success rate of +% $11/15 = \Sexpr{round(11/15*100, 0)}\%$, as was achieved with the +% non-significance criterion from the RPCB, unrealistic margins of $\Delta > 2$ +% are required, highlighting the paucity of evidence provided by these studies. +% Changing the success criterion to a more lenient level ($\alpha = 0.1$) or a +% more stringent level ($\alpha = 0.01$) hardly changes this conclusion. 
@@ -769,12 +658,16 @@ evidence is the Bayes factor \citep{Kass1995}, which is the updating factor of the prior odds to the posterior odds of the null hypothesis $H_{0}$ versus the alternative hypothesis $H_{1}$ \begin{align*} - \underbrace{\frac{\Pr(H_{0} \given \mathrm{data})}{\Pr(H_{1} \given - \mathrm{data})}}_{\mathrm{Posterior~odds}} - = \underbrace{\frac{\Pr(H_{0})}{\Pr(H_{1})}}_{\mathrm{Prior~odds}} - \times \underbrace{\frac{p(\mathrm{data} \given H_{0})}{p(\mathrm{data} - \given H_{1})}}_{\mathrm{Bayes~factor}~\BF_{01}}. + \mathrm{Posterior~odds} = \mathrm{Prior~odds} \times \mathrm{Bayes~factor}~\BF_{01}. \end{align*} + +% \begin{align*} +% \underbrace{\frac{\Pr(H_{0} \given \mathrm{data})}{\Pr(H_{1} \given +% \mathrm{data})}}_{\mathrm{Posterior~odds}} +% = \underbrace{\frac{\Pr(H_{0})}{\Pr(H_{1})}}_{\mathrm{Prior~odds}} +% \times \underbrace{\frac{p(\mathrm{data} \given H_{0})}{p(\mathrm{data} +% \given H_{1})}}_{\mathrm{Bayes~factor}~\BF_{01}}. +% \end{align*} The Bayes factor quantifies how much the observed data have increased or decreased the probability of the null hypothesis $H_{0}$ relative to the alternative $H_{1}$. If the null hypothesis states the absence of an effect, a @@ -790,18 +683,19 @@ respectively \citep{Jeffreys1961}. In contrast to the non-significance criterion, this criterion provides a genuine measure of evidence that can distinguish absence of evidence from evidence of absence. -When the observed data are dichotomized into positive (\mbox{$p < 0.05$}) or -null results (\mbox{$p > 0.05$}), the Bayes factor based on a null result is the -probability of observing \mbox{$p > 0.05$} when the effect is indeed absent -(which is $95\%$) divided by the probability of observing $p > 0.05$ when the -effect is indeed present (which is one minus the power of the study). 
For -example, if the power is $90\%$, we have -\mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten -times more evidence for the absence of the effect than for its presence. On the -other hand, if the power is only $50\%$, we have -\mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only -slightly more evidence for the absence of the effect. This example also -highlights the main challenge with Bayes factors -- the specification of the +% When the observed data are dichotomized into positive (\mbox{$p < 0.05$}) or +% null results (\mbox{$p > 0.05$}), the Bayes factor based on a null result is the +% probability of observing \mbox{$p > 0.05$} when the effect is indeed absent +% (which is $95\%$) divided by the probability of observing $p > 0.05$ when the +% effect is indeed present (which is one minus the power of the study). For +% example, if the power is $90\%$, we have +% \mbox{$\BF_{01} = 95\%/10\% = \Sexpr{round(0.95/0.1, 2)}$} indicating almost ten +% times more evidence for the absence of the effect than for its presence. On the +% other hand, if the power is only $50\%$, we have +% \mbox{$\BF_{01} = 95\%/50\% = \Sexpr{round(0.95/0.5,2)}$} indicating only +% slightly more evidence for the absence of the effect. This example also +% highlights +The main challenge with Bayes factors is the specification of the alternative hypothesis $H_{1}$. The assumed effect under $H_{1}$ is directly related to the power of the study, and researchers who assume different effects under $H_{1}$ will end up with different Bayes factors. Instead of specifying a @@ -810,26 +704,28 @@ plausible effects. Importantly, the prior distribution, like the equivalence margin, should be determined by researchers with subject knowledge and before the data are collected. -In practice, the observed data should not be dichotomized into positive or null -results, as this leads to a loss of information. 
Therefore, to compute the Bayes -factors for the RPCB null results, we used the observed effect estimates as the -data and assumed a normal sampling distribution for them, as typically done in a -meta-analysis. The Bayes factors $\BF_{01}$ shown in +% In practice, the observed data should not be dichotomized into positive or null +% results, as this leads to a loss of information. Therefore, +To compute the Bayes factors for the RPCB null results, we used the observed +effect estimates as the data and assumed a normal sampling distribution for +them, as typically done in a meta-analysis. The Bayes factors $\BF_{01}$ shown in Figure~\ref{fig:nullfindings} then quantify the evidence for the null hypothesis of no effect ($H_{0} \colon \text{SMD} = 0$) against the alternative hypothesis that there is an effect ($H_{1} \colon \text{SMD} \neq 0$) using a normal ``unit-information'' prior distribution\footnote{For SMD effect sizes, a normal unit-information prior is a normal distribution centered around the value of - no effect with a standard deviation corresponding to one observation. Assuming - that the group means are normally distributed - \mbox{$\overline{X}_{1} \sim \Nor(\theta_{1}, 2\sigma^{2}/n)$} and - \mbox{$\overline{X}_{2} \sim \Nor(\theta_{2}, 2\sigma^{2}/n)$} with $n$ the - total sample size and $\sigma$ the known data standard deviation, the - distribution of the SMD is - \mbox{$\text{SMD} = (\overline{X}_{1} - \overline{X}_{2})/\sigma \sim \Nor\{(\theta_{1} - \theta_{2})/\sigma, 4/n\}$}. - The standard deviation of the SMD based on one unit ($n = 1$) is hence $2$, - just as the unit standard deviation for log hazard/odds/rate ratio effect - sizes \citep[Section 2.4]{Spiegelhalter2004}.} \citep{Kass1995b} for the + no effect with a standard deviation corresponding to one observation. 
+ % Assuming + % that the group means are normally distributed + % \mbox{$\overline{X}_{1} \sim \Nor(\theta_{1}, 2\sigma^{2}/n)$} and + % \mbox{$\overline{X}_{2} \sim \Nor(\theta_{2}, 2\sigma^{2}/n)$} with $n$ the + % total sample size and $\sigma$ the known data standard deviation, the + % distribution of the SMD is + % \mbox{$\text{SMD} = (\overline{X}_{1} - \overline{X}_{2})/\sigma \sim \Nor\{(\theta_{1} - \theta_{2})/\sigma, 4/n\}$}. + % The standard deviation of the SMD based on one unit ($n = 1$) is hence $2$, + % just as the unit standard deviation for log hazard/odds/rate ratio effect + % sizes \citep[Section 2.4]{Spiegelhalter2004}. + } \citep{Kass1995b} for the effect size under the alternative $H_{1}$. We see that in most cases there is no substantial evidence for either the absence or the presence of an effect, as with the equivalence tests. For instance, with a lenient Bayes factor threshold @@ -854,22 +750,29 @@ more sensitive to smaller/larger true effect sizes. % There are also several more advanced prior distributions that could be used % here \citep{Johnson2010,Morey2011}, and any prior distribution should ideally % be specified for each effect individually based on domain knowledge. -We therefore report a sensitivity analysis with respect to the choice of the -prior standard deviation and the Bayes factor threshold in the bottom plot of -Figure~\ref{fig:sensitivity}. It is uncommon to specify prior standard -deviations larger than the unit-information standard deviation of $2$, as this -corresponds to the assumption of very large effect sizes under the alternatives. -However, to achieve replication success for a larger proportion of replications -than the observed -$\Sexpr{bfSuccesses}/\Sexpr{ntotal} = \Sexpr{round(bfSuccesses/ntotal*100, 0)}\%$, -unreasonably large prior standard deviations have to be specified. 
For instance, -a standard deviation of roughly $5$ is required to achieve replication success -in $50\%$ of the replications at a lenient Bayes factor threshold of -$\gamma = 3$. The standard deviation needs to be almost $20$ so that the same -success rate $11/15 = \Sexpr{round(11/15*100, 0)}\%$ as with the -non-significance criterion is achieved. The necessary standard deviations are -even higher for stricter Bayes factor threshold, such as $\gamma = 6$ or -$\gamma = 10$. +The sensitivity analysis in the appendix therefore also includes an analysis on +the effect of varying prior standard deviations and the Bayes factor thresholds. +However, again, to achieve replication success for a larger proportion of +replications than the observed $\Sexpr{bfSuccesses}/\Sexpr{ntotal} = +\Sexpr{round(bfSuccesses/ntotal*100, 0)}\%$, unreasonably large prior standard +deviations have to be specified. + +% We therefore report a sensitivity analysis with respect to the choice of the +% prior standard deviation and the Bayes factor threshold in the bottom plot of +% Figure~\ref{fig:sensitivity}. It is uncommon to specify prior standard +% deviations larger than the unit-information standard deviation of $2$, as this +% corresponds to the assumption of very large effect sizes under the alternatives. +% However, to achieve replication success for a larger proportion of replications +% than the observed +% $\Sexpr{bfSuccesses}/\Sexpr{ntotal} = \Sexpr{round(bfSuccesses/ntotal*100, 0)}\%$, +% unreasonably large prior standard deviations have to be specified. For instance, +% a standard deviation of roughly $5$ is required to achieve replication success +% in $50\%$ of the replications at a lenient Bayes factor threshold of +% $\gamma = 3$. The standard deviation needs to be almost $20$ so that the same +% success rate $11/15 = \Sexpr{round(11/15*100, 0)}\%$ as with the +% non-significance criterion is achieved. 
The necessary standard deviations are +% even higher for stricter Bayes factor threshold, such as $\gamma = 6$ or +% $\gamma = 10$. << "interesting-study" >>= @@ -904,7 +807,7 @@ recommendations. \begin{table}[!htb] - \centering + \centering \small \caption*{Box 1: Recommendations for the analysis of replication studies of original null results. Calculations are based on effect estimates $\hat{\theta}_{i}$ with standard errors $\sigma_{i}$ for $i \in \{o, r\}$ @@ -928,6 +831,14 @@ recommendations. ~ i \in \{o, r\}$$ with $\Phi(\cdot)$ the cumulative distribution function of the standard normal distribution. +<<eval = FALSE, echo = TRUE>>= +ptost <- + function(estimate, margin, se){ + p <- pmax(pnorm(q = estimate, mean = margin, sd = se, lower.tail = TRUE), + pnorm(q = estimate, mean = -margin, sd = se, lower.tail = FALSE)) + return(p) + } +@ \item Declare replication success at level $\alpha$ if $p_{\text{TOST},o} \leq \alpha$ and $p_{\text{TOST},r} \leq \alpha$, conventionally $\alpha = 0.05$. @@ -953,6 +864,14 @@ recommendations. = \sqrt{1 + \frac{v}{\sigma^{2}_{i}}} \, \exp\left[-\frac{1}{2} \left\{\frac{(\hat{\theta}_{i} - \theta_{0})^{2}}{\sigma^{2}_{i}} - \frac{(\hat{\theta}_{i} - m)^{2}}{\sigma^{2}_{i} + v} \right\}\right], ~ i \in \{o, r\}.$$ +<<eval = FALSE, echo = TRUE>>= +BF01 <- + function(estimate, se, null = 0, unitvar = 4) { + bf <- dnorm(x = estimate, mean = null, sd = se) / + dnorm(x = estimate, mean = null, sd = sqrt(se**2 + unitvar)) + return(bf) + } +@ \item Declare replication success at level $\gamma > 1$ if $\BF_{01,o} \geq \gamma$ and $\BF_{01,r} \geq \gamma$, conventionally $\gamma = 3$ (substantial evidence) or $\gamma = 10$ (strong @@ -968,15 +887,21 @@ recommendations. 
\end{table} When analyzed with equivalence tests or Bayes factors, the conclusions are far -less optimistic than those of the RPCB investigators, who state that ``original -null results were twice as likely as original positive results to mostly -replicate successfully (80\% vs. 40\%)'' \citep[p.16]{Errington2021}. While the -exact success rate depends on the equivalence margin and the prior distribution, -sensitivity analyses showed that even with unrealistically liberal choices, the -success rate remains below 40\%. This is not unexpected, as a study typically -requires larger sample sizes to detect the absence of an effect than to detect -its presence \citep[section 11.5.3]{Matthews2006}. However, the RPCB sample -sizes were only chosen so that each replication had at least 80\% power to +less optimistic than those of the RPCB investigators, who state that if they +``consider a replication to be successful overall if it succeeded on a majority +of criteria, original null results were twice as likely as original positive +results to mostly replicate successfully (80\% vs. 40\%)'' +\citep[p.16]{Errington2021}. +\todo[inline]{RH: I think this is problematic because this claim relates to a +definition of success as a replication succeeding on a majority of criteria, and +not just the one based on significance, so I added this bit... not sure though. +maybe this is also where we can say sth more about the other criteria} +While the exact success rate depends on the equivalence margin and the prior +distribution, sensitivity analyses showed that even with unrealistically liberal +choices, the success rate remains below 40\%. This is not unexpected, as a study +typically requires larger sample sizes to detect the absence of an effect than +to detect its presence \citep[section 11.5.3]{Matthews2006}. However, the RPCB +sample sizes were only chosen so that each replication had at least 80\% power to
The design of replication studies should ideally align with the planned analysis \citep{Anderson2017, Anderson2022, Micheloud2020, Pawel2022c}. If the goal of the study is to find evidence for @@ -985,7 +910,6 @@ so that the study has adequate power to make conclusive inferences regarding the absence of the effect. - For both the equivalence test and the Bayes factor approach, it is critical that the equivalence margin and the prior distribution are specified independently of the data, ideally before the original and replication studies are conducted. @@ -1012,7 +936,6 @@ practice in replication assessment (\eg{} the RPCB used seven different methods), so our proposal does not require a major paradigm shift. - While the equivalence test and the Bayes factor are two principled methods for analyzing original and replication studies with null results, they are not the only possible methods for doing so. A straightforward extension would be to @@ -1073,6 +996,181 @@ which is available in our git repository. \bibliography{bibliography} + +\section*{Appendix} + +\subsection*{Sensitivity analysis} + +As discussed before, the post-hoc specification of equivalence margins and the +choice of the prior distribution for the SMD under the alternative $H_{1}$ is +debatable and controversial. Commonly used margins in clinical research are much +more stringent; for instance, in oncology, a margin of $\Delta = \log(1.3)$ is +commonly used for log odds/hazard ratios, whereas in bioequivalence studies a +margin of \mbox{$\Delta = \log(1.25) % = \Sexpr{round(log(1.25), 2)} + $} is the convention \citep[chapter 22]{Senn2008}. These margins would +translate into margins of $\Delta += % \log(1.3)\sqrt{3}/\pi = +\Sexpr{round(log(1.3)*sqrt(3)/pi, 2)}$ and $\Delta = % \log(1.25)\sqrt{3}/\pi = +\Sexpr{round(log(1.25)*sqrt(3)/pi, 2)}$ on the SMD scale, respectively, using +the $\text{SMD} = (\surd{3} / \pi) \log\text{OR}$ conversion \citep[p. +233]{Cooper2019}. 
As for the Bayesian hypothesis testing, we specified a normal +unit-information prior under the alternative, while other normal priors with +smaller/larger standard deviations could have been considered. +In the sensitivity analysis in the top plot of Figure~\ref{fig:sensitivity} we +show the number of successful replications as a function of the margin $\Delta$ +and for different TOST \textit{p}-value thresholds. Such an ``equivalence +curve'' approach was first proposed by \citet{Hauck1986}. We see that for +realistic margins between $0$ and $1$, the proportion of replication successes +remains below $50\%$ for the conventional $\alpha = 0.05$ level. To achieve a +success rate of $11/15 = \Sexpr{round(11/15*100, 0)}\%$, as was achieved with +the non-significance criterion from the RPCB, unrealistic margins of $\Delta > +2$ are required, highlighting the paucity of evidence provided by these studies. +Changing the success criterion to a more lenient level ($\alpha = 0.1$) or a +more stringent level ($\alpha = 0.01$) hardly changes this conclusion. + +In the bottom plot of Figure~\ref{fig:sensitivity} a sensitivity analysis is +reported with respect to the choice of the prior standard deviation and the +Bayes factor threshold. It is uncommon to specify prior standard +deviations larger than the unit-information standard deviation of $2$, as this +corresponds to the assumption of very large effect sizes under the alternatives. +However, to achieve replication success for a larger proportion of replications +than the observed $\Sexpr{bfSuccesses}/\Sexpr{ntotal} = +\Sexpr{round(bfSuccesses/ntotal*100, 0)}\%$, unreasonably large prior standard +deviations have to be specified. For instance, a standard deviation of roughly +$5$ is required to achieve replication success in $50\%$ of the replications at +a lenient Bayes factor threshold of $\gamma = 3$.
The standard deviation needs +to be almost $20$ so that the same success rate $11/15 = \Sexpr{round(11/15*100, +0)}\%$ as with the non-significance criterion is achieved. The necessary +standard deviations are even higher for stricter Bayes factor thresholds, +such as $\gamma = 6$ or $\gamma = 10$. + +\begin{figure}[!htb] +<< "sensitivity", fig.height = 6.5 >>= +## compute number of successful replications as a function of the equivalence margin +marginseq <- seq(0.01, 4.5, 0.01) +alphaseq <- c(0.01, 0.05, 0.1) +sensitivityGrid <- expand.grid(m = marginseq, a = alphaseq) +equivalenceDF <- lapply(X = seq(1, nrow(sensitivityGrid)), FUN = function(i) { + m <- sensitivityGrid$m[i] + a <- sensitivityGrid$a[i] + rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = m, sd = so, + lower.tail = TRUE), + pnorm(q = smdo, mean = -m, sd = so, + lower.tail = FALSE))) + rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = m, sd = sr, + lower.tail = TRUE), + pnorm(q = smdr, mean = -m, sd = sr, + lower.tail = FALSE))) + successes <- sum(rpcbNull$ptosto <= a & rpcbNull$ptostr <= a) + data.frame(margin = m, alpha = a, + successes = successes, proportion = successes/nrow(rpcbNull)) +}) %>% + bind_rows() + +## plot number of successes as a function of margin +nmax <- nrow(rpcbNull) +bks <- c(0, 3, 6, 9, 11, 15) +labs <- paste0(bks, " (", round(bks/nmax*100, 0), "%)") +rpcbSuccesses <- 11 +marbks <- c(0, margin, 1, 2, 3, 4) +plotA <- ggplot(data = equivalenceDF, + aes(x = margin, y = successes, + color = factor(alpha, ordered = TRUE, levels = rev(alphaseq)))) + + facet_wrap(~ 'italic("p")["TOST"] <= alpha ~ "in original and replication study"', + labeller = label_parsed) + + geom_vline(xintercept = margin, lty = 3, alpha = 0.4) + + annotate(geom = "segment", x = margin + 0.25, xend = margin + 0.01, y = 2, yend = 2, + arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, + color = "darkgrey") + + annotate(geom = "text", x = margin + 0.28, y = 2, color = 
"darkgrey", + label = "margin used in main analysis", + size = 3, alpha = 0.9, hjust = 0) + + geom_hline(yintercept = rpcbSuccesses, lty = 2, alpha = 0.4) + + annotate(geom = "segment", x = 0.1, xend = 0.1, y = 13, yend = 11.2, + arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, + color = "darkgrey") + + annotate(geom = "text", x = -0.04, y = 13.5, color = "darkgrey", + label = "non-significance criterion successes", + size = 3, alpha = 0.9, hjust = 0) + + geom_step(alpha = 0.8, linewidth = 0.8) + + scale_y_continuous(breaks = bks, labels = labs) + + scale_x_continuous(breaks = marbks) + + coord_cartesian(xlim = c(0, max(equivalenceDF$margin))) + + labs(x = bquote("Equivalence margin" ~ Delta), + y = "Successful replications", + color = bquote("threshold" ~ alpha)) + + theme_bw() + + theme(panel.grid.minor = element_blank(), + panel.grid.major = element_blank(), + strip.background = element_rect(fill = alpha("tan", 0.4)), + strip.text = element_text(size = 12), + legend.position = c(0.85, 0.25), + plot.background = element_rect(fill = "transparent", color = NA), + legend.box.background = element_rect(fill = "transparent", colour = NA)) + +## compute number of successful replications as a function of the prior scale +priorsdseq <- seq(0, 40, 0.1) +bfThreshseq <- c(3, 6, 10) +sensitivityGrid2 <- expand.grid(s = priorsdseq, thresh = bfThreshseq) +bfDF <- lapply(X = seq(1, nrow(sensitivityGrid2)), FUN = function(i) { + priorsd <- sensitivityGrid2$s[i] + thresh <- sensitivityGrid2$thresh[i] + rpcbNull$BForig <- with(rpcbNull, BF01(estimate = smdo, se = so, unitvar = priorsd^2)) + rpcbNull$BFrep <- with(rpcbNull, BF01(estimate = smdr, se = sr, unitvar = priorsd^2)) + successes <- sum(rpcbNull$BForig >= thresh & rpcbNull$BFrep >= thresh) + data.frame(priorsd = priorsd, thresh = thresh, + successes = successes, proportion = successes/nrow(rpcbNull)) +}) %>% + bind_rows() + +## plot number of successes as a function of prior sd +priorbks <- c(0, 2, 
10, 20, 30, 40) +plotB <- ggplot(data = bfDF, + aes(x = priorsd, y = successes, color = factor(thresh, ordered = TRUE))) + + facet_wrap(~ '"BF"["01"] >= gamma ~ "in original and replication study"', + labeller = label_parsed) + + geom_vline(xintercept = 2, lty = 3, alpha = 0.4) + + geom_hline(yintercept = rpcbSuccesses, lty = 2, alpha = 0.4) + + annotate(geom = "segment", x = 7, xend = 2 + 0.2, y = 0.5, yend = 0.5, + arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, + color = "darkgrey") + + annotate(geom = "text", x = 7.5, y = 0.5, color = "darkgrey", + label = "standard deviation used in main analysis", + size = 3, alpha = 0.9, hjust = 0) + + annotate(geom = "segment", x = 0.5, xend = 0.5, y = 13, yend = 11.2, + arrow = arrow(type = "closed", length = unit(0.02, "npc")), alpha = 0.9, + color = "darkgrey") + + annotate(geom = "text", x = 0.05, y = 13.5, color = "darkgrey", + label = "non-significance criterion successes", + size = 3, alpha = 0.9, hjust = 0) + + geom_step(alpha = 0.8, linewidth = 0.8) + + scale_y_continuous(breaks = bks, labels = labs, limits = c(0, nmax)) + + scale_x_continuous(breaks = priorbks) + + coord_cartesian(xlim = c(0, max(bfDF$priorsd))) + + labs(x = "Prior standard deviation", + y = "Successful replications ", + color = bquote("threshold" ~ gamma)) + + theme_bw() + + theme(panel.grid.minor = element_blank(), + panel.grid.major = element_blank(), + strip.background = element_rect(fill = alpha("tan", 0.4)), + strip.text = element_text(size = 12), + legend.position = c(0.85, 0.25), + plot.background = element_rect(fill = "transparent", color = NA), + legend.box.background = element_rect(fill = "transparent", colour = NA)) + +grid.arrange(plotA, plotB, ncol = 1) +@ +\caption{Number of successful replications of original null results in the RPCB + as a function of the margin $\Delta$ of the equivalence test + ($p_{\text{TOST}} \leq \alpha$ in both studies for + $\alpha = \Sexpr{rev(alphaseq)}$) or the standard 
deviation of the zero-mean + normal prior distribution for the SMD effect size under the alternative + $H_{1}$ of the Bayes factor test ($\BF_{01} \geq \gamma$ in both studies for + $\gamma = \Sexpr{bfThreshseq}$).} +\label{fig:sensitivity} +\end{figure} + << "sessionInfo1", eval = Reproducibility, results = "asis" >>= ## print R sessionInfo to see system information and package versions ## used to compile the manuscript (set Reproducibility = FALSE, to not do that) @@ -1084,4 +1182,5 @@ cat(paste(Sys.time(), Sys.timezone(), "\n")) sessionInfo() @ + \end{document}