From b78884b1e796bdaea886c118053f68ec9911e520 Mon Sep 17 00:00:00 2001
From: SamCH93 <samuel.pawel@gmail.com>
Date: Mon, 27 Mar 2023 10:35:31 +0200
Subject: [PATCH] some of Leo's suggestions

---
 paper/bibliography.bib |  2 +-
 paper/rsabsence.Rnw    | 84 +++++++++++++++++++++++++++++-------------
 2 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/paper/bibliography.bib b/paper/bibliography.bib
index 6a88e9c..f1e1ceb 100755
--- a/paper/bibliography.bib
+++ b/paper/bibliography.bib
@@ -3158,7 +3158,7 @@ Discrimination},
 
 
 @book{Senn2008,
-  title={Statistical issues in drug development},
+  title={Statistical Issues in Drug Development},
   author={Stephen Senn},
   volume={69},
   year={2008},
diff --git a/paper/rsabsence.Rnw b/paper/rsabsence.Rnw
index 51d12c2..ca02b0c 100755
--- a/paper/rsabsence.Rnw
+++ b/paper/rsabsence.Rnw
@@ -25,7 +25,8 @@
 }
 
 \title{\vspace{-4em}
-\textbf{Meta-research:\\Replication studies of original ``null results'' -- \\ Absence of evidence or evidence of absence?}}
+\textbf{% Meta-research:\\
+  Replication of ``null results'' -- Absence of evidence or evidence of absence?}}
 \author{{\bf Samuel Pawel\textsuperscript{*},
     Rachel Heyard\textsuperscript{*},
     Charlotte Micheloud,
@@ -135,9 +136,9 @@ BF01 <- function(estimate, se, null = 0, unitvar = 4) {
         the context of replication studies. In several large-scale replication
         projects, non-significant results in both the original and the
         replication study have been interpreted as a ``replication success''.
-        Here we discuss the logical problems with this approach. It does not
-        ensure that the studies provide evidence for the absence of an effect
-        and
+        Here we discuss the logical problems with this approach.
+        Non-significance in both studies does not ensure that the studies
+        provide evidence for the absence of an effect and
         % Because the null hypothesis of the statistical tests in both studies
         % is misaligned,
         ``replication success'' can virtually always be achieved if the sample
@@ -153,7 +154,7 @@ BF01 <- function(estimate, se, null = 0, unitvar = 4) {
         appropriately.
       } \\
       \rule{\textwidth}{0.5pt} \emph{Keywords}: Bayesian hypothesis testing,
-      equivalence testing, null hypothesis, replication success}
+      equivalence testing, meta-research, null hypothesis, replication success}
   \end{minipage}
 \end{center}
 
@@ -178,7 +179,7 @@ adequately powered and under what assumed effect the power was calculated
 \citep{Hoenig2001, Greenland2012}. However, if the goal of a study is to
 explicitly quantify the evidence for the absence of an effect, more appropriate
 methods designed for this task, such as equivalence testing or Bayes factors,
-should ideally be used from the outset.
+should be used from the outset.
 
 % two systematic reviews that I found which show that animal studies are very
 % much underpowered on average \citep{Jennions2003,Carneiro2018}
@@ -190,7 +191,7 @@ similar results can be obtained with new data \citep{NSF2019}. There have been
 various large-scale replication projects in the biomedical and social sciences
 in the last decade \citep[among
 others]{Prinz2011,Begley2012,Klein2014,Opensc2015,Camerer2016,Camerer2018,Klein2018,Cova2018,Errington2021}.
-Most of these projects suggested alarmingly low replicability rates across a
+Most of these projects reported alarmingly low replicability rates across a
 broad spectrum of criteria for quantifying replicability. While most of these
 projects restricted their focus to original studies with statistically
 significant results (``positive results''), the \emph{Reproducibility Project:
@@ -239,6 +240,7 @@ solutions using the null results from the RPCB.
 rpcbRaw <- read.csv(file = "../data/prepped_outcome_level_data.csv")
 rpcb <- rpcbRaw %>%
     select(
+        osf = OSF.project.link,
         paper = pID,
         experiment = eID,
         effect = oID,
@@ -305,6 +307,21 @@ rpcbNull <- rpcb %>%
 ##     mutate(success = sign(smdo) == sign(smdr) & pr >= 0.05) %>%
 ##     summarise(sum(success))
 ## ### noooo :)
+
+## check the sample sizes
+## paper 5 (https://osf.io/q96yj) - 1 Cohen's d - sample sizes correspond to forest plot
+## paper 9 (https://osf.io/yhq4n) - 3 Cohen's w - sample sizes do not correspond at all
+## paper 15 (https://osf.io/ytrx5) - 1 r - sample sizes correspond to forest plot
+## paper 19 (https://osf.io/465r3) - 2 Cohen's dz - sample sizes correspond to forest plot
+## paper 20 (https://osf.io/acg8s) - 1 r and 1 Cliff's delta - sample sizes correspond to forest plot
+## paper 21 (https://osf.io/ycq5g) - 1 Cohen's d - sample sizes correspond to forest plot
+## paper 24 (https://osf.io/pcuhs) - 2 Cohen's d - sample sizes correspond to forest plot
+## paper 28 (https://osf.io/gb7sr/) - 3 Cohen's d - sample sizes correspond to forest plot
+## paper 29 (https://osf.io/8acw4) - 1 Cohen's d - sample sizes do not correspond, seem to be doubled
+## paper 41 (https://osf.io/qnpxv) - 1 hazard ratio - sample sizes correspond to forest plot
+## paper 47 (https://osf.io/jhp8z) - 2 r - sample sizes correspond to forest plot
+## paper 48 (https://osf.io/zewrd) - 1 r - sample sizes do not correspond to forest plot for the original study
+
 @
 
 
@@ -328,13 +345,13 @@ plotDF1 <- rpcbNull %>%
     mutate(label = ifelse(id == study1,
                           "Goetz et al. (2011)\nEvidence of absence",
                           "Dawson et al. (2011)\nAbsence of evidence"))
-## RH: this data is really a mess. turns out for Dawson n represents the group
-## size (n = 6 in https://osf.io/8acw4) while in Goetz it is the sample size of
-## the whole experiment (n = 34 and 61 in https://osf.io/acg8s). in study 2 the
-## so multiply by 2 to have the total sample size, see Figure 5A
-## https://doi.org/10.7554/eLife.25306.012
-plotDF1$no[plotDF1$id == study2] <- plotDF1$no[plotDF1$id == study2]*2
-plotDF1$nr[plotDF1$id == study2] <- plotDF1$nr[plotDF1$id == study2]*2
+## ## RH: this data is really a mess. turns out for Dawson n represents the
+## ## group size (n = 6 in https://osf.io/8acw4) while in Goetz it is the
+## ## sample size of the whole experiment (n = 34 and 61 in
+## ## https://osf.io/acg8s), so in study 2 multiply by 2 to get the total
+## ## sample size, see Figure 5A https://doi.org/10.7554/eLife.25306.012
+## plotDF1$no[plotDF1$id == study2] <- plotDF1$no[plotDF1$id == study2]*2
+## plotDF1$nr[plotDF1$id == study2] <- plotDF1$nr[plotDF1$id == study2]*2
 ## create plot showing two example study pairs with null results
 conflevel <- 0.95
 ggplot(data = plotDF1) +
@@ -370,8 +387,8 @@ ggplot(data = plotDF1) +
   pairs which meet the non-significance replication success criterion from the
   Reproducibility Project: Cancer Biology \citep{Errington2021}. Shown are
   standardized mean difference effect estimates with \Sexpr{round(conflevel*100,
-    2)}\% confidence intervals, total sample size, and $p$-values for the null
-  hypothesis that the standardized mean difference is zero.}
+    2)}\% confidence intervals, sample sizes, and two-sided $p$-values for the
+  null hypothesis that the standardized mean difference is zero.}
 \end{figure}
 
 The original study from \citet{Dawson2011} and its replication both show large
@@ -434,11 +451,17 @@ ggplot(data = rpcbNull) +
     geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                   label = paste("BF['01']", ifelse(BForig <= 1/1000, "", "=="),
                                 BForigformat)), col = "darkblue",
-              parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
+              parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0) +
     geom_text(aes(x = 1.59, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                   label = paste("BF['01']", ifelse(BFrep <= 1/1000, "", "=="),
                                 BFrepformat)), col = "darkblue",
-              parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0,) +
+              parse = TRUE, size = 2.3, vjust = 1.7, hjust = 0) +
+    geom_text(aes(x = 1.05, y = pmin(-0.9*margin, smdr - 1.6*sr, smdo - 1.6*so),
+                  label = paste("italic(n) ==", no)), col = "darkblue",
+              parse = TRUE, size = 2.3, hjust = 0) +
+    geom_text(aes(x = 2.05, y = pmin(-0.9*margin, smdr - 1.6*sr, smdo - 1.6*so),
+                  label = paste("italic(n) ==", nr)), col = "darkblue",
+              parse = TRUE, size = 2.3, hjust = 0) +
     theme_bw() +
     theme(panel.grid.minor = element_blank(),
           panel.grid.major = element_blank(),
@@ -578,18 +601,27 @@ discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent
 with our intuitions -- there is indeed some evidence for the absence of an
 effect in \citet{Goetz2011}, while there is even slightly more evidence for the
 presence of an effect in \citet{Dawson2011}, though the Bayes factor is very
-close to one due to the small sample sizes. If we use a lenient Bayes factor
+close to one due to the small sample sizes. With a lenient Bayes factor
 threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect,
-only one of the twenty study pairs meets this criteiron in both the original and
+only one of the twenty study pairs meets this criterion in both the original and
 replication study.
 
+
+<< >>=
+studyInteresting <- filter(rpcbNull, id == "(48, 2, 4, 1)") # pair discussed below
+noInteresting <- studyInteresting$no # original study sample size
+nrInteresting <- studyInteresting$nr # replication study sample size
+write.csv(rpcbNull, "rpcb-Null.csv", row.names = FALSE) # export null results data
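+## sanity check for the discussion below (a sketch, assuming the BF01()
+## function and the smdr/sr columns used elsewhere in this script): Bayes
+## factor of the replication estimate against an exactly zero effect
+bfRepInteresting <- BF01(estimate = studyInteresting$smdr,
+                         se = studyInteresting$sr)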
+@
+
 Among the twenty RPCB null results, there is one interesting case (the rightmost
 plot in the fourth row (48, 2, 4, 1)) where the Bayes factor is qualitatively
 different from the equivalence test, revealing a fundamental difference between
 the two approaches. The Bayes factor is concerned with testing whether the
 effect is \emph{exactly zero}, whereas the equivalence test is concerned with
 whether the effect is within an \emph{interval around zero}. Due to the very
-large sample size in this replication study, the data are incompatible with an
+large sample size in the original study ($n = \Sexpr{noInteresting}$) and the
+replication ($n = \Sexpr{nrInteresting}$), the data are incompatible with an
 exactly zero effect, but compatible with effects within the equivalence range.
 Apart from this example, however, the approaches lead to the same qualitative
 conclusion -- most RPCB null results are highly ambiguous.
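+
+% A minimal numeric illustration of this distinction (a sketch with
+% hypothetical numbers, not RPCB data): a precisely estimated but tiny effect
+% is incompatible with an exactly zero effect, yet lies clearly within an
+% equivalence margin.
+<<eval = FALSE>>=
+## hypothetical numbers for illustration only
+estEx <- 0.05    # effect estimate
+seEx <- 0.01     # standard error (large sample size => small standard error)
+marginEx <- 0.3  # equivalence margin
+## two-sided p-value against an exactly zero effect (very small)
+2*pnorm(abs(estEx/seEx), lower.tail = FALSE)
+## TOST p-value against effects outside [-marginEx, marginEx]
+## (also very small => equivalence can be established)
+max(pnorm((estEx + marginEx)/seEx, lower.tail = FALSE),
+    pnorm((estEx - marginEx)/seEx))
+@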
@@ -664,11 +696,11 @@ data set differ in some cases from those in the data set available at
 \url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in
 \citet{Errington2021}. We used this particular version of the data set because
 it was recommended to us by the RPCB statistician (Maya Mathur) upon request.
-For the \citet{Dawson2011} example study and its replication \citep{Shan2017},
-the sample sizes $n = 3$ in th data set seem to correspond to the group sample
-sizes, see Figure 5A in the replication study
-(\url{https://doi.org/10.7554/eLife.25306.012}), which is why we report the
-total sample sizes of $n = 6$ in Figure~\ref{fig:2examples}.
+% For the \citet{Dawson2011} example study and its replication \citep{Shan2017},
+% the sample sizes $n = 3$ in the data set seem to correspond to the group sample
+% sizes, see Figure 5A in the replication study
+% (\url{https://doi.org/10.7554/eLife.25306.012}), which is why we report the
+% total sample sizes of $n = 6$ in Figure~\ref{fig:2examples}.
 
 The code and data to reproduce our analyses are openly available at
 \url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}. A snapshot of the repository
-- 
GitLab