Skip to content
Snippets Groups Projects
Commit 928b8590 authored by SamCH93's avatar SamCH93
Browse files

some updates text

parent 54b4893f
No related branches found
No related tags found
No related merge requests found
@article{Bedard2007,
  author  = {Bedard, Philippe L. and Krzyzanowska, Monika K. and Pintilie, Melania and Tannock, Ian F.},
  title   = {Statistical Power of Negative Randomized Controlled Trials Presented at {American Society for Clinical Oncology} Annual Meetings},
  journal = {Journal of Clinical Oncology},
  year    = {2007},
  volume  = {25},
  number  = {23},
  pages   = {3482--3487},
  doi     = {10.1200/jco.2007.11.3670},
}
@book{Wellek2010,
title={Testing statistical hypotheses of equivalence and noninferiority},
author={Wellek, Stefan},
......
......@@ -12,8 +12,8 @@
\definecolor{darkblue2}{HTML}{273B81}
\definecolor{darkred2}{HTML}{D92102}
\title{Meta-Research: Replication of ``null results'' -- Absence of evidence or
evidence of absence?}
\title{Replication of ``null results'' -- Absence of evidence or evidence of
absence?}
\author[1*\authfn{1}]{Samuel Pawel}
\author[1\authfn{1}]{Rachel Heyard}
......@@ -245,12 +245,7 @@ rpcbNull <- rpcb %>%
@
Figure~\ref{fig:2examples} shows standardized mean difference effect estimates
with confidence intervals from two RPCB study pairs. Both are ``null results''
and meet the non-significance criterion for replication success (the two-sided
$p$-values are greater than 0.05 in both the original and the replication study),
but intuition would suggest that these two pairs are very much different.
\begin{figure}[ht]
\begin{figure}[!htb]
<< "2-example-studies", fig.height = 3.25 >>=
## some evidence for absence of effect https://doi.org/10.7554/eLife.45120 I
## can't find the replication effect like reported in the data set :( let's take
......@@ -311,6 +306,13 @@ ggplot(data = plotDF1) +
null hypothesis that the standardized mean difference is zero.}
\end{figure}
Figure~\ref{fig:2examples} shows standardized mean difference effect estimates
with \Sexpr{round(100*conflevel, 2)}\% confidence intervals from two RPCB study
pairs. Both are ``null results'' and meet the non-significance criterion for
replication success (the two-sided $p$-values are greater than 0.05 in both the
original and the replication study), but intuition would suggest that these two
pairs are very much different.
The original study from \citet{Dawson2011} and its replication both show large
effect estimates in magnitude, but due to the small sample sizes, the
uncertainty of these estimates is very large, too. If the sample sizes of the
......@@ -325,7 +327,7 @@ difference between absence of evidence and evidence of absence, we will now
discuss how the two can be quantitatively distinguished.
\section{Methods for asssessing replicability of null results}
\section{Methods for assessing replicability of null results}
\label{sec:methods}
There are both frequentist and Bayesian methods that can be used for assessing
evidence for the absence of an effect. \citet{Anderson2016} provide an excellent
......@@ -340,7 +342,7 @@ data.
Equivalence testing was developed in the context of clinical trials to assess
whether a new treatment -- typically cheaper or with fewer side effects than the
established treatment -- is practically equivalent to the established treatment
\citep{Westlake1972,Schuirmann1987}. The method can also be used to assess
\citep{Wellek2010}. The method can also be used to assess
whether an effect is practically equivalent to the value of an absent effect,
usually zero. Using equivalence testing as a remedy for non-significant results
has been suggested by several authors \citep{Hauck1986, Campbell2018}. The main
......@@ -350,10 +352,9 @@ practical purposes. The goal is then to reject
the % composite %% maybe too technical?
null hypothesis that the true effect is outside the equivalence range. This is
in contrast to the usual null hypothesis of a superiority test which states that
the effect is zero or smaller than zero, see Figure~\ref{fig:hypotheses} for an
illustration.
the effect is zero, see Figure~\ref{fig:hypotheses} for an illustration.
\begin{figure}
\begin{figure}[!htb]
\begin{center}
\begin{tikzpicture}[ultra thick]
\draw[stealth-stealth] (0,0) -- (6,0);
......@@ -362,55 +363,69 @@ illustration.
\draw (3,0.2) -- (3,-0.2) node[below]{$0$};
\draw (4,0.2) -- (4,-0.2) node[below]{$+\Delta$};
\node[text width=5cm, align=left] at (0,1.5) {\textbf{Equivalence}};
\node[text width=5cm, align=left] at (0,1.25) {\textbf{Equivalence}};
\draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt}]
(2.05,1) -- (3.95,1) node[midway,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
(2.05,0.75) -- (3.95,0.75) node[midway,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
\draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}]
(0,1) -- (1.95,1) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
(0,0.75) -- (1.95,0.75) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}]
(4.05,1) -- (6,1) node[pos=0.4,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
(4.05,0.75) -- (6,0.75) node[pos=0.4,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\node[text width=5cm, align=left] at (0,3.5) {\textbf{Superiority \\(two-sided)}};
\node[text width=5cm, align=left] at (0,2.5) {\textbf{Superiority}};
\draw [decorate,decoration={brace,amplitude=5pt}]
(3,3) -- (3,3) node[midway,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\draw[darkblue2] (3,2.9) -- (3,3.2);
(3,2) -- (3,2) node[midway,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\draw[darkblue2] (3,1.95) -- (3,2.2);
\draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}]
(0,3) -- (2.95,3) node[pos=0.6,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
\draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}]
(3.05,3) -- (6,3) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
\node[text width=5cm, align=left] at (0,5.5) {\textbf{Superiority \\ (one-sided)}};
(0,2) -- (2.95,2) node[pos=0.6,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
\draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}]
(3.05,5) -- (6,5) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
\draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}]
(0,5) -- (3,5) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\draw [dashed] (2,0) -- (2,1);
\draw [dashed] (4,0) -- (4,1);
\draw [dashed] (3,0) -- (3,1);
\draw [dashed] (3,1.9) -- (3,2.8);
\draw [dashed] (3,3.9) -- (3,5);
(3.05,2) -- (6,2) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
% \node[text width=5cm, align=left] at (0,5.5) {\textbf{Superiority \\ (one-sided)}};
% \draw [draw={darkred2},decorate,decoration={brace,amplitude=5pt,aspect=0.4}]
% (3.05,5) -- (6,5) node[pos=0.4,yshift=1.5em]{\textcolor{darkred2}{$H_1$}};
% \draw [draw={darkblue2},decorate,decoration={brace,amplitude=5pt,aspect=0.6}]
% (0,5) -- (3,5) node[pos=0.6,yshift=1.5em]{\textcolor{darkblue2}{$H_0$}};
\draw [dashed] (2,0) -- (2,0.75);
\draw [dashed] (4,0) -- (4,0.75);
\draw [dashed] (3,0) -- (3,0.75);
\draw [dashed] (3,1.5) -- (3,1.9);
% \draw [dashed] (3,3.9) -- (3,5);
\end{tikzpicture}
\end{center}
\caption{Null hypothesis ($H_0$) and alternative hypothesis ($H_1$) for
different study designs with equivalence margin $\Delta$.}
superiority and equivalence tests (with equivalence margin $\Delta > 0$).}
\label{fig:hypotheses}
\end{figure}
To ensure that the null hypothesis is falsely rejected at most
$\alpha \times 100\%$ of the time, one either rejects it if the
$(1-2\alpha)\times 100\%$ confidence interval for the effect is contained within
the equivalence range (for example, a 90\% confidence interval for
$\alpha = 5\%$), or if two one-sided tests (TOST) for the effect being
smaller/greater than $+\Delta$ and $-\Delta$ are significant at level $\alpha$,
respectively. A quantitative measure of evidence for the absence of an effect is
then given by the maximum of the two one-sided $p$-values (the TOST $p$-value).
$\alpha \times 100\%$ of the time, the standard approach is to declare
equivalence if the $(1-2\alpha)\times 100\%$ confidence interval for the effect
is contained within the equivalence range (for example, a 90\% confidence
interval for $\alpha = 5\%$) \citep{Westlake1972}, which is equivalent to two
one-sided tests (TOST) of the null hypotheses that the effect is greater than
$+\Delta$ or smaller than $-\Delta$, both being significant at level $\alpha$
\citep{Schuirmann1987}. A quantitative measure of evidence for the absence of an
effect is then given by the maximum of the two one-sided $p$-values (the TOST
$p$-value). A reasonable replication success criterion for null results may
therefore be to require that both the original and the replication TOST
$p$-values be smaller than some level $\alpha$ (e.g., 0.05), or, equivalently,
that their $(1-2\alpha)\times 100\%$ confidence intervals are included in the
equivalence region (e.g., 90\%). In contrast to the non-significance criterion,
this criterion controls the error of falsely claiming replication success at
level $\alpha^{2}$ when there is a true effect outside the equivalence margin,
thus complementing the usual two-trials rule.
\begin{figure}
\begin{fullwidth}
<< "plot-null-findings-rpcb", fig.height = 8.25, fig.width = "0.95\\linewidth" >>=
## compute TOST p-values
margin <- 0.3 # Cohen: small - 0.3 # medium - 0.5 # large - 0.8
## Wellek (2010): strict - 0.36 # liberal - .74
# Cohen: small - 0.3 # medium - 0.5 # large - 0.8
## 80-125% convention for AUC and Cmax FDA/EMA
## 1.3 for oncology OR/HR -> log(1.3)*sqrt(3)/pi = 0.1446
margin <- 0.74
conflevel <- 0.9
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = margin, sd = so,
lower.tail = TRUE),
......@@ -421,9 +436,13 @@ rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = margin, sd = sr,
pnorm(q = smdr, mean = -margin, sd = sr,
lower.tail = FALSE)))
## highlight the studies from Goetz and Dawson
rpcbNull$id <- ifelse(rpcbNull$id == "(20, 1, 1)",
ex1 <- "(20, 1, 1)"
ind1 <- which(rpcbNull$id == ex1)
ex2 <- "(29, 2, 2)"
ind2 <- which(rpcbNull$id == ex2)
rpcbNull$id <- ifelse(rpcbNull$id == ex1,
"(20, 1, 1) - Goetz et al. (2011)", rpcbNull$id)
rpcbNull$id <- ifelse(rpcbNull$id == "(29, 2, 2)",
rpcbNull$id <- ifelse(rpcbNull$id == ex2,
"(29, 2, 2) - Dawson et al. (2011)", rpcbNull$id)
## create plots of all study pairs with null results in original study
......@@ -487,7 +506,6 @@ ggplot(data = rpcbNull) +
@
\caption{Standardized mean difference (SMD) effect estimates with
\Sexpr{round(conflevel*100, 2)}\% confidence interval for the ``null results''
% (those with original two-sided $p$-value $p > 0.05$)
and their replication studies from the Reproducibility Project: Cancer Biology
\citep{Errington2021}. The identifier above each plot indicates (original
paper number, experiment number, effect number). Two original effect estimates
......@@ -496,10 +514,11 @@ ggplot(data = rpcbNull) +
null results by the RPCB. The two examples from Figure~\ref{fig:2examples} are
indicated in the plot titles. The dashed gray line represents the value of no
effect ($\text{SMD} = 0$), while the dotted red lines represent the
equivalence range with a margin of $\Delta = \Sexpr{margin}$. The $p$-values
$p_{\text{TOST}}$ are the maximum of the two one-sided $p$-values for the
effect being less than or greater than $+\Delta$ or $-\Delta$, respectively.
The Bayes factors $\BF_{01}$ quantify the evidence for the null hypothesis
equivalence range with a margin of $\Delta = \Sexpr{margin}$, classified as
``liberal'' by \citet[Table 1.1]{Wellek2010}. The $p$-values $p_{\text{TOST}}$
are the maximum of the two one-sided $p$-values for the effect being less than
or greater than $+\Delta$ or $-\Delta$, respectively. The Bayes factors
$\BF_{01}$ quantify the evidence for the null hypothesis
$H_{0} \colon \text{SMD} = 0$ against the alternative
$H_{1} \colon \text{SMD} \neq 0$ with normal unit-information prior assigned
to the SMD under $H_{1}$.}
......@@ -507,24 +526,185 @@ ggplot(data = rpcbNull) +
\end{fullwidth}
\end{figure}
<< "successes-RPCB" >>=
## total number of null effects in the data set
ntotal <- nrow(rpcbNull)
## replication successes by the non-significance criterion
## (p > 0.05 in both the original and the replication study)
nullSuccesses <- with(rpcbNull, sum(po > 0.05 & pr > 0.05))
## replication successes by the equivalence testing criterion
## (TOST p-value <= 0.05 in both studies)
equivalenceSuccesses <- with(rpcbNull, sum(ptosto <= 0.05 & ptostr <= 0.05))
## TOST p-values of the two highlighted example study pairs
ptosto1 <- with(rpcbNull, ptosto[ind1])
ptostr1 <- with(rpcbNull, ptostr[ind1])
ptosto2 <- with(rpcbNull, ptosto[ind2])
ptostr2 <- with(rpcbNull, ptostr[ind2])
## replication successes by the Bayes factor criterion
## (BF01 > 3 in both studies)
bfSuccesses <- with(rpcbNull, sum(BForig > 3 & BFrep > 3))
@
Returning to the RPCB data, Figure~\ref{fig:nullfindings} shows the standardized
mean difference effect estimates with \Sexpr{round(conflevel*100, 2)}\%
confidence intervals for the 20 study pairs with quantitative null results in
the original study ($p > 0.05$). The dotted red lines represent an equivalence
range for the margin $\Delta = \Sexpr{margin}$, for which the shown TOST
$p$-values are computed. This margin is rather lax compared to the margins
typically used in clinical research; we chose it primarily for illustrative
purposes and because effect sizes in preclinical research are typically much
larger than in clinical research. In practice, the margin should be determined
on a case-by-case basis by researchers who are familiar with the subject matter.
However, even with this generous margin, only four of the twenty study pairs --
one of them being the previously discussed example from \citet{Goetz2011} -- are
able to establish equivalence at the 5\% level in the sense that both the
original and the replication 90\% confidence interval fall within the
equivalence range (or equivalently that their TOST $p$-values are smaller than
$0.05$). For the remaining 16 studies -- for instance, the previously discussed
example from \citet{Dawson2011} -- the situation remains inconclusive and there
is neither evidence for the absence nor the presence of the effect.
confidence intervals for the 15 effects which were treated as quantitative null
results by the RPCB.\footnote{There are four original studies with null effects
for which several internal replication studies were conducted, leading in
total to 20 replications of null effects. As in the RPCB main analysis
\citep{Errington2021}, we aggregated their SMD estimates into a single SMD
estimate with fixed-effect meta-analysis.} Most of them showed non-significant
$p$-values ($p > 0.05$) in the original study, but there are two effects in
paper 48 which the original authors regarded as null results despite their
statistical significance. We see that there are \Sexpr{nullSuccesses}
``successes'' (with $p > 0.05$ in both the original and the replication study)
out of the \Sexpr{ntotal} null effects in total, as reported in Table 1
of~\citet{Errington2021}.
% , and which were therefore treated as null results also by the RPCB.
We will now apply equivalence testing to the RPCB data. The dotted red lines
represent an equivalence range for the margin $\Delta =
\Sexpr{margin}$, % , for which the shown TOST $p$-values are computed.
which \citet[Table 1.1]{Wellek2010} classifies as ``liberal''. However, even
with this generous margin, only \Sexpr{equivalenceSuccesses} of the
\Sexpr{ntotal} study pairs are able to establish replication success at the 5\%
level, in the sense that both the original and the replication 90\% confidence
interval fall within the equivalence range (or, equivalently, that their TOST
$p$-values are smaller than $0.05$). For the remaining \Sexpr{ntotal -
equivalenceSuccesses} studies, the situation remains inconclusive and there is
no evidence for the absence or the presence of the effect. For instance, the
previously discussed example from \citet{Goetz2011} marginally fails the
criterion ($p_{\text{TOST}} = \Sexpr{formatPval(ptosto1)}$ in the original study
and $p_{\text{TOST}} = \Sexpr{formatPval(ptostr1)}$ in the replication), while
the example from \citet{Dawson2011} is a clearer failure
($p_{\text{TOST}} = \Sexpr{formatPval(ptosto2)}$ in the original study and
$p_{\text{TOST}} = \Sexpr{formatPval(ptostr2)}$ in the replication).
% We chose the margin $\Delta = \Sexpr{margin}$ primarily for illustrative
% purposes and because effect sizes in preclinical research are typically much
% larger than in clinical research.
The post-hoc determination of the equivalence margin is debatable. Ideally, the
margin should be determined on a case-by-case basis before the studies are
conducted by researchers familiar with the subject matter. One could also argue
that the chosen margin $\Delta = \Sexpr{margin}$ is too lax compared to margins
typically used in clinical research; for instance, in oncology, a margin of
$\Delta = \log(1.3)$ is commonly used for log odds/hazard ratios, whereas in
bioequivalence studies a margin of $\Delta =
\log(1.25) % = \Sexpr{round(log(1.25), 2)}
$ is the convention, which translates to $\Delta = % \log(1.3)\sqrt{3}/\pi =
\Sexpr{round(log(1.3)*sqrt(3)/pi, 2)}$ and $\Delta = % \log(1.25)\sqrt{3}/\pi =
\Sexpr{round(log(1.25)*sqrt(3)/pi, 2)}$ on the SMD scale, respectively, using
the $\text{SMD} = (\surd{3} / \pi) \log\text{OR}$ conversion \citep[p.
233]{Cooper2019}. Therefore, we report a sensitivity analysis in
Figure~\ref{fig:sensitivity}. The top plot shows the number of successful
replications as a function of the margin $\Delta$ and for different TOST
$p$-value thresholds. Such an ``equivalence curve'' approach was first proposed
by \citet{Hauck1986}, see also \citet{Campbell2021} for alternative approaches
to post-hoc equivalence margin specification. We see that for realistic margins
between 0 and 1, the proportion of replication successes remains below 50\%. To
achieve a success rate of 11 of the 15 studies, as with the RPCB
non-significance criterion, unrealistic margins of $\Delta > 2$ are required,
which illustrates the paucity of evidence provided by these studies.
\begin{figure}[!htb]
<< "sensitivity", fig.height = 6.5 >>=
## compute number of successful replications as a function of the equivalence margin
## grid of equivalence margins (x-axis) crossed with TOST significance levels
## (one step curve per level)
marginseq <- seq(0.01, 4.5, 0.01)
alphaseq <- c(0.005, 0.05, 0.1)
sensitivityGrid <- expand.grid(m = marginseq, a = alphaseq)
## for each (margin, alpha) combination: recompute the TOST p-values for all
## study pairs and count the pairs where both original and replication succeed
equivalenceDF <- lapply(X = seq(1, nrow(sensitivityGrid)), FUN = function(i) {
m <- sensitivityGrid$m[i]
a <- sensitivityGrid$a[i]
## TOST p-value = maximum of the two one-sided normal p-values for the
## effect lying above -m and below +m (original study)
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = m, sd = so,
lower.tail = TRUE),
pnorm(q = smdo, mean = -m, sd = so,
lower.tail = FALSE)))
## same TOST p-value for the replication study
rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = m, sd = sr,
lower.tail = TRUE),
pnorm(q = smdr, mean = -m, sd = sr,
lower.tail = FALSE)))
successes <- sum(rpcbNull$ptosto <= a & rpcbNull$ptostr <= a)
data.frame(margin = m, alpha = a,
successes = successes, proportion = successes/nrow(rpcbNull))
}) %>%
bind_rows()
## plot number of successes as a function of margin
## y-axis breaks/labels show absolute counts with percentages of all pairs
nmax <- nrow(rpcbNull)
bks <- seq(0, nmax, round(nmax/5))
labs <- paste0(bks, " (", bks/nmax*100, "%)")
plotA <- ggplot(data = equivalenceDF,
aes(x = margin, y = successes,
color = factor(alpha, ordered = TRUE))) +
facet_wrap(~ 'italic("p")["TOST"] <= alpha ~ "in original and replication study"',
labeller = label_parsed) +
## dashed vertical line at the margin used in the main analysis
geom_vline(xintercept = margin, lty = 2, alpha = 0.4) +
geom_step(alpha = 0.8, linewidth = 0.8) +
scale_y_continuous(breaks = bks, labels = labs) +
## scale_y_continuous(labels = scales::percent) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = bquote("Equivalence margin" ~ Delta),
y = "Successful replications",
color = bquote("threshold" ~ alpha)) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill = alpha("tan", 0.4)),
strip.text = element_text(size = 12),
legend.position = c(0.85, 0.25),
plot.background = element_rect(fill = "transparent", color = NA),
## axis.text.y = element_text(hjust = 0),
legend.box.background = element_rect(fill = "transparent", colour = NA))
## compute number of successful replications as a function of the prior scale
## grid of prior standard deviations crossed with Bayes factor thresholds
priorsdseq <- seq(0, 40, 0.1)
bfThreshseq <- c(3, 6, 10)
sensitivityGrid2 <- expand.grid(s = priorsdseq, thresh = bfThreshseq)
## for each (prior sd, threshold) combination: recompute BF01 (helper defined
## elsewhere in the document) and count pairs where both studies succeed
bfDF <- lapply(X = seq(1, nrow(sensitivityGrid2)), FUN = function(i) {
priorsd <- sensitivityGrid2$s[i]
thresh <- sensitivityGrid2$thresh[i]
rpcbNull$BForig <- with(rpcbNull, BF01(estimate = smdo, se = so, unitvar = priorsd^2))
rpcbNull$BFrep <- with(rpcbNull, BF01(estimate = smdr, se = sr, unitvar = priorsd^2))
successes <- sum(rpcbNull$BForig >= thresh & rpcbNull$BFrep >= thresh)
data.frame(priorsd = priorsd, thresh = thresh,
successes = successes, proportion = successes/nrow(rpcbNull))
}) %>%
bind_rows()
## plot number of successes as a function of prior sd
plotB <- ggplot(data = bfDF,
aes(x = priorsd, y = successes, color = factor(thresh, ordered = TRUE))) +
facet_wrap(~ '"BF"["01"] >= gamma ~ "in original and replication study"',
labeller = label_parsed) +
## dashed line at prior sd 4 -- presumably the value used in the main
## analysis (Figure fig:nullfindings); NOTE(review): confirm against the
## unit-information prior used there
geom_vline(xintercept = 4, lty = 2, alpha = 0.4) +
geom_step(alpha = 0.8, linewidth = 0.8) +
scale_y_continuous(breaks = bks, labels = labs, limits = c(0, nmax)) +
## scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
labs(x = "Prior standard deviation",
y = "Successful replications ",
color = bquote("threshold" ~ gamma)) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill = alpha("tan", 0.4)),
strip.text = element_text(size = 12),
legend.position = c(0.85, 0.25),
plot.background = element_rect(fill = "transparent", color = NA),
## axis.text.y = element_text(hjust = 0),
legend.box.background = element_rect(fill = "transparent", colour = NA))
## stack the two sensitivity plots vertically
grid.arrange(plotA, plotB, ncol = 1)
@
\caption{Number of successful replications of original null results in
the RPCB as a function of the margin $\Delta$ of the equivalence test
($p_{\text{TOST}} \leq \alpha$ in both studies) or the standard deviation of
the normal prior distribution for the effect under the alternative $H_{1}$ of
the Bayes factor test ($\BF_{01} \geq \gamma$ in both studies). The dashed
gray lines represent the parameters used in the main analysis shown in
Figure~\ref{fig:nullfindings}.}
\label{fig:sensitivity}
\end{figure}
\subsection{Bayesian hypothesis testing}
......@@ -586,14 +766,17 @@ large effects are plausible under the alternative. We see that in most cases
there is no substantial evidence for either the absence or the presence of an
effect, as with the equivalence tests. The Bayes factors for the two previously
discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent
with our intuititons -- there is indeed some evidence for the absence of an
with our intuitions -- there is indeed some evidence for the absence of an
effect in \citet{Goetz2011}, while there is even slightly more evidence for the
presence of an effect in \citet{Dawson2011}, though the Bayes factor is very
close to one due to the small sample sizes. With a lenient Bayes factor
threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect,
only one of the twenty study pairs meets this criterion in both the original and
replication study.
only \Sexpr{bfSuccesses} of the \Sexpr{ntotal} study pairs meet this criterion
in both the original and the replication study.
The sensitivity of the Bayes factor to the choice of the prior may again be
assessed visually, as shown in the bottom plot of Figure~\ref{fig:sensitivity}.
We see ....
<< >>=
studyInteresting <- filter(rpcbNull, id == "(48, 2, 4)")
......@@ -602,8 +785,8 @@ nrInteresting <- studyInteresting$nr
## write.csv(rpcbNull, "rpcb-Null.csv", row.names = FALSE)
@
Among the twenty RPCB null results, there is one interesting case (the rightmost
plot in the fourth row (48, 2, 4, 1)) where the Bayes factor is qualitatively
Among the \Sexpr{ntotal} RPCB null results, there are three interesting cases
(the three effects from paper 48) where the Bayes factor is qualitatively
different from the equivalence test, revealing a fundamental difference between
the two approaches. The Bayes factor is concerned with testing whether the
effect is \emph{exactly zero}, whereas the equivalence test is concerned with
......@@ -616,32 +799,23 @@ conclusion -- most RPCB null results are highly ambiguous.
\section{Conclusions}
We showed that in most of the RPCB studies with ``null results'' (those with
$p > 0.05$), neither the original nor the replication study provided conclusive
evidence for the presence or absence of an effect. It seems logically
questionable to declare an inconclusive replication of an inconclusive original
study as a replication success. While it is important to replicate original
studies with null results, our analysis highlights that they should be analyzed
and interpreted appropriately.
We showed that in most of the RPCB studies with ``null results'', neither the
original nor the replication study provided conclusive evidence for the presence
or absence of an effect. It seems logically questionable to declare an
inconclusive replication of an inconclusive original study as a replication
success. While it is important to replicate original studies with null results,
our analysis highlights that they should be analyzed and interpreted
appropriately.
For both the equivalence testing and the Bayes factor approach, it is critical
that the parameters of the procedure (the equivalence margin and the prior
distribution) are specified independently of the data, ideally before the
studies are conducted. Typically, however, the original studies were designed to
find evidence for the presence of an effect, and the goal of replicating the
``null result'' was formulated only after failure to do so. \citet{Campbell2021}
discuss various approaches to post-hoc specification of equivalence margins,
such as motivating it using data from previous studies or using field
conventions. \citet{Hauck1986} propose a sensitivity analysis approach in the
form of plotting the TOST $p$-value against a range of possible margins
(``equivalence curves''). Post-hoc specification of a prior distribution for a
Bayes factor may likewise be based on historical data, field conventions, or
assessed visually with sensitivity analyses.
% As error rate control may no longer be ensured in this case, the TOST
% $p$-values should not be used as dichotomous decision tools, but rather as
% descriptive measures of compatibility between the data and effects outside the
% equivalence region \citep{Amrhein2019, Rafi2020, Greenland2023}.
``null result'' was formulated only after failure to do so. It is therefore
important that margins and prior distributions are motivated from historical
data and/or field conventions, and that sensitivity analyses regarding their
choice are reported \citep{Campbell2021}.
While the equivalence test and the Bayes factor are two principled methods for
analyzing original and replication studies with null results, they are not the
......@@ -701,108 +875,6 @@ which is available in our git repository.% The effect estimates and standard
\bibliography{bibliography}
\appendix
\begin{appendixbox}
% \label{first:sensitivity}
% \section{Sensitivity analyses}
\begin{center}
<< "sensitivity", fig.height = 8 >>=
## compute number of successful replications as a function of the equivalence margin
marginseq <- seq(0.01, 4.5, 0.01)
alphaseq <- c(0.005, 0.05, 0.1)
sensitivityGrid <- expand.grid(m = marginseq, a = alphaseq)
equivalenceDF <- lapply(X = seq(1, nrow(sensitivityGrid)), FUN = function(i) {
m <- sensitivityGrid$m[i]
a <- sensitivityGrid$a[i]
rpcbNull$ptosto <- with(rpcbNull, pmax(pnorm(q = smdo, mean = m, sd = so,
lower.tail = TRUE),
pnorm(q = smdo, mean = -m, sd = so,
lower.tail = FALSE)))
rpcbNull$ptostr <- with(rpcbNull, pmax(pnorm(q = smdr, mean = m, sd = sr,
lower.tail = TRUE),
pnorm(q = smdr, mean = -m, sd = sr,
lower.tail = FALSE)))
successes <- sum(rpcbNull$ptosto <= a & rpcbNull$ptostr <= a)
data.frame(margin = m, alpha = a,
successes = successes, proportion = successes/nrow(rpcbNull))
}) %>%
bind_rows()
## plot number of successes as a function of margin
nmax <- nrow(rpcbNull)
bks <- seq(0, nmax, round(nmax/5))
labs <- paste0(bks, " (", bks/nmax*100, "%)")
plotA <- ggplot(data = equivalenceDF,
aes(x = margin, y = successes,
color = factor(alpha, ordered = TRUE))) +
facet_wrap(~ "Equivalence test") +
geom_vline(xintercept = margin, lty = 2, alpha = 0.4) +
geom_step(alpha = 0.9, linewidth = 0.8) +
scale_y_continuous(breaks = bks, labels = labs) +
## scale_y_continuous(labels = scales::percent) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = bquote("Equivalence margin" ~ Delta),
y = "Successful replications",
color = bquote(italic("p")["TOST"] ~ "threshold" ~ alpha)) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill = alpha("tan", 0.4)),
strip.text = element_text(size = 12),
legend.position = c(0.85, 0.25),
plot.background = element_rect(fill = "transparent", color = NA),
## axis.text.y = element_text(hjust = 0),
legend.box.background = element_rect(fill = "transparent", colour = NA))
## compute number of successful replications as a function of the prior scale
priorsdseq <- seq(0, 40, 0.1)
bfThreshseq <- c(3, 6, 10)
sensitivityGrid2 <- expand.grid(s = priorsdseq, thresh = bfThreshseq)
bfDF <- lapply(X = seq(1, nrow(sensitivityGrid2)), FUN = function(i) {
priorsd <- sensitivityGrid2$s[i]
thresh <- sensitivityGrid2$thresh[i]
rpcbNull$BForig <- with(rpcbNull, BF01(estimate = smdo, se = so, unitvar = priorsd^2))
rpcbNull$BFrep <- with(rpcbNull, BF01(estimate = smdr, se = sr, unitvar = priorsd^2))
successes <- sum(rpcbNull$BForig >= thresh & rpcbNull$BFrep >= thresh)
data.frame(priorsd = priorsd, thresh = thresh,
successes = successes, proportion = successes/nrow(rpcbNull))
}) %>%
bind_rows()
## plot number of successes as a function of prior sd
plotB <- ggplot(data = bfDF,
aes(x = priorsd, y = successes, color = factor(thresh, ordered = TRUE))) +
facet_wrap(~ "Bayes factor") +
geom_vline(xintercept = 4, lty = 2, alpha = 0.4) +
geom_step(alpha = 0.9, linewidth = 0.8) +
scale_y_continuous(breaks = bks, labels = labs, limits = c(0, nmax)) +
## scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = "Prior distribution scale",
y = "Successful replications ",
color = bquote("BF"["01"] ~ "threshold" ~ gamma)) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill = alpha("tan", 0.4)),
strip.text = element_text(size = 12),
legend.position = c(0.85, 0.25),
plot.background = element_rect(fill = "transparent", color = NA),
## axis.text.y = element_text(hjust = 0),
legend.box.background = element_rect(fill = "transparent", colour = NA))
grid.arrange(plotA, plotB, ncol = 1)
@
\captionof{figure}{Number of successful replications of original null results in
the RPCB as a function of the margin $\Delta$ of equivalence test
($p_{\text{TOST}} \leq \alpha$ in both studies) or the scale of the prior
distribution for the effect under the alternative $H_{1}$ of the Bayes
factor ($\BF_{01} \geq \gamma$ in both studies).}
\end{center}
\end{appendixbox}
<< "sessionInfo1", eval = Reproducibility, results = "asis" >>=
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment