\documentclass[a4paper, 11pt]{article}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage[dvipsnames]{xcolor}
\usepackage{amsmath, amssymb}
\usepackage{doi} % automatic doi-links
\usepackage[round]{natbib} % bibliography
\usepackage{booktabs} % nicer tables
\usepackage[title]{appendix} % better appendices
\usepackage[onehalfspacing]{setspace} % more space
\usepackage[labelfont=bf,font=small]{caption} % smaller captions
\usepackage{todonotes}

%% margins
\usepackage{geometry}
\geometry{
  a4paper,
  total={170mm,257mm},
  left=25mm,
  right=25mm,
  top=30mm,
  bottom=25mm,
}

\title{\bf Meta-research: Replication studies and the ``absence of evidence''-fallacy}
\author{{\bf Rachel Heyard, Charlotte Micheloud, Samuel Pawel, Leonhard Held} \\
  Epidemiology, Biostatistics and Prevention Institute \\
  Center for Reproducible Science \\
  University of Zurich}
\date{\today} %don't forget to hard-code date when submitting to arXiv!

%% hyperref options
\usepackage{hyperref}
\hypersetup{
  unicode=true,
  bookmarksopen=true,
  breaklinks=true,
  colorlinks=true,
  linkcolor=blue,
  anchorcolor=black,
  citecolor=blue,
  urlcolor=black,
}

%% custom commands
\input{defs.tex}
\begin{document}
\maketitle

%% Disclaimer that a preprint
\vspace{-3em}
\begin{center}
  {\color{red}This is a preprint which has not yet been peer reviewed.}
\end{center}

<< "setup", include = FALSE >>=
## knitr options
library(knitr)
opts_chunk$set(fig.height = 4,
               echo = FALSE,
               warning = FALSE,
               message = FALSE,
               cache = FALSE,
               eval = TRUE)

## should sessionInfo be printed at the end?
Reproducibility <- TRUE

## packages
library(ggplot2) # plotting
library(dplyr) # data manipulation
library(ggrepel) # to highlight data points with non-overlapping labels

## the replication Bayes factor under normality (Verhagen and Wagenmakers, 2014)
## to, tr: original and replication effect estimates (on SMD scale)
## so, sr: corresponding standard errors
BFr <- function(to, tr, so, sr) {
    ## BF01: predictive density of tr under H0 (no effect) relative to the
    ## predictive density under H1 (effect as estimated in the original study)
    bf <- dnorm(x = tr, mean = 0, sd = sr) /
        dnorm(x = tr, mean = to, sd = sqrt(so^2 + sr^2))
    return(bf)
}
formatBF. <- function(BF) {
    if (is.na(BF)) {
        BFform <- NA
    } else if (BF > 1) {
        if (BF > 1000) {
            BFform <- "> 1000"
        } else {
            BFform <- as.character(signif(BF, 2))
        }
    } else {
        if (BF < 1/1000) {
            BFform <- "< 1/1000"
        } else {
            BFform <- paste0("1/", signif(1/BF, 2))
        }
    }
    if (!is.na(BFform) && BFform == "1/1") {
        return("1")
    } else {
        return(BFform)
    }
}
formatBF <- Vectorize(FUN = formatBF.)
@


%% Abstract
%% -----------------------------------------------------------------------------
\begin{center}
  \begin{minipage}{13cm} {\small
      \rule{\textwidth}{0.5pt} \\
      {\centering \textbf{Abstract} \\
        ``Absence of evidence is not evidence of absence'' -- the title of a
        1995 Statistics Note by Douglas Altman and Martin Bland has since become
        some sort of a mantra in statistics and medical lectures. The
        misinterpretation of non-significant results as ``null-findings'' is
        however still common and has important consequences for the
        interpretation of replication projects and alike. In many replication
        attempts and large replication projects, failure to reject the null
        hypothesis in the replication study is interpreted as successfully
        replicating or even proving an original null-effect. Methods to
        adequately summarize the evidence for the null have been proposed. With
        this paper we want to highlight the consequences of the ``absence of
        evidence''-fallacy in the replication setting and want to guide the
        reader and future author of replication studies to existing methods to
        appropriately design and analyse replication attempts of non-significant
        original findings.
      } \\
      \rule{\textwidth}{0.5pt} \emph{Keywords}: Bayesian hypothesis testing,
      equivalence test, non-inferiority test, null hypothesis, replication
      success}
  \end{minipage}
\end{center}


\section{Introduction}

The general misconception that statistical non-significance indicates evidence
for the absence of an effect is unfortunately widespread \citep{Altman1995}. A
well-designed study is constructed such that a large enough sample (of
participants, $n$) is used to achieve 80--90\% power of correctly rejecting the
null hypothesis when the assumed effect is truly present. This still leaves a
10--20\% chance of a false negative. Somehow this fact from ``Hypothesis Testing
101'' is all too often forgotten, and studies showing an effect with a $p$-value
larger than the conventional significance level of $\alpha = 0.05$ are deemed a
``negative study'' or said to show a ``null effect''. Some have called to
abolish the term ``negative study'' altogether, as every well-designed and
well-conducted study is a ``positive contribution to knowledge'', regardless of
its results \citep{Chalmers1002}. In general, $p$-values and significance
testing are often misinterpreted \citep{Goodman2008, Greenland2016}. This is why
suggestions to shift away from significance testing \citep{Berner2022} or to
redefine statistical significance \citep{Benjamin2017} have been made.
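
As a minimal illustration of this design logic (with purely hypothetical
numbers), the following chunk computes the per-group sample size required for
80\% power to detect a standardized mean difference of 0.5 in a two-sample
$t$-test; even in such a well-designed study, a true effect of that size will be
missed at the 5\% level in roughly one out of five studies.

<< "power-illustration", echo = TRUE, eval = FALSE >>=
## Hypothetical design calculation: per-group sample size for 80% power to
## detect a standardized mean difference of 0.5 at alpha = 0.05
power.t.test(delta = 0.5, sd = 1, sig.level = 0.05, power = 0.8)
## about 64 participants per group; even then, roughly 20% of studies with a
## true effect of 0.5 will end up with p > 0.05 (a false negative)
@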

Turning to the replication context, replicability has been
defined as ``obtaining consistent results across studies aimed at answering the
same scientific question, each of which has obtained its own data''
\citep{NSF2019}. Hence, a replication study of an original finding attempts to find
consistent results while applying the same methods and protocol as published in
the original study on newly collected data. In the past decade, an increasing
number of collaborations of researchers and research groups conducted
large-scale replication projects (RPs) to estimate the replicability of their
respective research fields. In these projects, a set of high-impact and
influential original studies was selected to be replicated as closely as
possible to the original methodology. The results and conclusions of the RPs
showed alarmingly low levels of replicability in most fields. The
Reproducibility Project: Cancer Biology \citep[RPCB;][]{Errington2021}, the
Reproducibility Project: Experimental Philosophy \citep[RPEP;][]{Cova2018} and
the Reproducibility Project: Psychology \citep[RPP;][]{Opensc2015} also
attempted to replicate original studies with non-significant effects. The
authors of these RPs unfortunately fell into the ``absence of
evidence''-fallacy trap when defining successful replications.
As described in \citet{Cobey2022}, there is a large variability in how success
is defined in replication studies. They found that in their sample of
replication attempts most authors used a comparison of effect sizes to assess
replication success, while many others used a definition based on statistical
significance, where a replication is successful if it replicates the
significance and direction of the effect published in the original study. When
it comes to the replication of a non-significant original effect, some
definitions are more useful than others. The authors of the RPCB and the RPEP
explicitly define a replication of a non-significant original effect as
successful if the effect in the replication study is also non-significant.
While the authors of the RPEP warn the reader that the use of $p$-values as
criterion for success is problematic when applied to replications of original
non-significant findings, the authors of the RPCB do not. In the RPP, on the
other hand, ``original nulls'' were excluded when assessing replication success
based on significance. While we would like to further encourage the replication
of non-significant original findings, we strongly argue against using
statistical significance to assess the replication of an ``original null''.
Indeed, the non-significance of the original effect should already be taken
into account in the design of the replication study.

% In general, using the significance criterion as definition of replication success
% arises from a false interpretation of the failure to find evidence against the null
% hypothesis as evidence for the null. Non-significant original finding does not
% mean that the underlying true effect is zero nor that it does not exist. This is
% especially true if the original study is under-powered.


\textbf{To replicate or not to replicate an original ``null'' finding?} The
previously presented fallacy leads to the situation in which only a few studies
with non-significant effects are replicated. These same non-significant
original findings might additionally not have been published in the first place
(\textit{i.e.}, publication bias). Given the cost of replication studies, and
especially of large-scale replication projects, it is also unwise to advise
replicating a study that is unlikely to replicate successfully. To help decide
which studies are worth repeating, efforts to predict which studies have a
higher chance of replicating successfully have emerged \citep{Altmejd2019,
Pawel2020}. Of note, the chance of a successful replication intrinsically
depends on the definition of replication success. If a successful replication
requires a ``significant result in the same direction in both the original and
the replication study'' \citep[i.e., the two-trials rule;][]{Senn2008}, there is
indeed no point in replicating a non-significant original result. The use of
significance as the sole criterion for replication success has its
shortcomings, and other definitions of replication success have been proposed
\citep{Simonsohn2015, Ly2018, Hedges2019, Held2020}. Another common problem is
low power in the original study, which might render the results hard to
replicate \citep{Button2013, Anderson2017}.

In general, once the decision to attempt a replication has been taken, the
replication study also has to be well-designed in order to ensure high enough
replication power \citep{Anderson2017, Micheloud2020}. According to
\citet{Anderson2016}, if the goal of a replication is to infer a ``null
effect'', evidence for the null hypothesis has to be provided. To achieve this,
they recommend using equivalence tests or Bayesian methods to quantify the
evidence for the null hypothesis. In the following, we illustrate methods to
adequately interpret the replication of original non-significant results in the
Reproducibility Project: Cancer Biology.

\section{Example: ``Null findings'' from the Reproducibility Project: Cancer
  Biology}
<< "data" >>=
## data
rpcbRaw <- read.csv(file = "data/prepped_outcome_level_data.csv")

rpcb <- rpcbRaw %>%
  select(paper = pID,
         experiment = eID,
         effect = oID,
         internalReplication = internalID,
         effectType = Effect.size.type,
         ## effect sizes, standard errors, p-values on original scale
         ESo = Original.effect.size,
         seESo = Original.standard.error,
         lowerESo = Original.lower.CI,
         upperESo = Original.upper.CI,
         po = origPval,
         ESr = Replication.effect.size,
         seESr = Replication.standard.error,
         lowerESr = Replication.lower.CI,
         upperESr = Replication.upper.CI,
         pr = repPval,
         ## effect sizes, standard errors, p-values on SMD scale
         smdo = origES3,
         so = origSE3,
         lowero = origESLo3,
         uppero = origESHi3,
         smdr = repES3,
         sr = repSE3,
         # Original and replication sample size
         no = origN,
         nr = repN) %>%
    mutate(
        ## define identifier for effect
        id = paste0("(", paper, ", ", experiment, ", ", effect, ", ",
                    internalReplication, ")"),
        ## recompute one-sided p-values based on normality
        ## (in direction of original effect estimate)
        zo = smdo/so,
        zr = smdr/sr,
        po1 = pnorm(q = abs(zo), lower.tail = FALSE),
        pr1 = pnorm(q = abs(zr), lower.tail = ifelse(sign(zo) < 0, TRUE, FALSE)),
        ## compute some other quantities
        c = so^2/sr^2, # variance ratio
        d = smdr/smdo, # relative effect size
        po2 = 2*(1 - pnorm(q = abs(zo))), # two-sided original p-value
        pr2 = 2*(1 - pnorm(q = abs(zr))), # two-sided replication p-value
        sm = 1/sqrt(1/so^2 + 1/sr^2), # standard error of fixed effect estimate
        smdm = (smdo/so^2 + smdr/sr^2)*sm^2, # fixed effect estimate
        pm2 = 2*(1 - pnorm(q = abs(smdm/sm))), # two-sided fixed effect p-value
        Q = (smdo - smdr)^2/(so^2 + sr^2), # Q-statistic
        pQ = pchisq(q = Q, df = 1, lower.tail = FALSE), # p-value from Q-test
        BFr = BFr(to = smdo, tr = smdr, so = so, sr = sr), # replication BF
        BFrformat = formatBF(BF = BFr)
    )

# TODO identify correct "null" findings as in paper
rpcbNull <- rpcb %>%
    ## filter(po1 > 0.025) #?
    filter(po > 0.05) #?


# Exclude

## ## check HR studies
# rpcb %>%
#     filter(effectType == "Hazard ratio") %>%
#     mutate(seESo2 = (log(upperESo) - log(lowerESo))/(2*1.96)) %>%
#     select(seESo, seESo2)
@

One hundred fifty-eight original effects presented in 23 original studies were
replicated in the RPCB \citep{Errington2021}. Twenty-two effects (14\%) were
interpreted as ``null effects'' by the original authors. We extracted the data
by executing the script \texttt{Code/data\_prep.R} from the GitHub repository
\texttt{mayamathur/rpcb.git}. We did, however, adapt the \texttt{R}-script to
also include the original ``null'' findings\footnote{By commenting out line
632.}. The final data set includes all effect sizes, from both the original and
the replication studies, on the standardized mean difference scale. We found
only \Sexpr{nrow(rpcbNull)} original-replication study pairs with an original
``null effect'', \textit{i.e.}, with original $p$-value $p_{o} > 0.05$.
\todo{explain discrepancy: 22 vs 23?}


Figure~\ref{fig:nullfindings} shows effect estimates with confidence
intervals for these original ``null findings'' and their replication studies.


\begin{figure}[!htb]
<< "plot-null-findings-rpcb", fig.height =8.5 >>=
ggplot(data = rpcbNull) +
  facet_wrap(~ id + effectType, scales = "free", ncol = 4) +
  geom_hline(yintercept = 0, lty = 2, alpha = 0.5) +
  geom_pointrange(aes(x = "Original", y = smdo, ymin = smdo - 2*so,
                      ymax = smdo + 2*so), size = .25) +
  geom_pointrange(aes(x = "Replication", y = smdr, ymin = smdr - 2*sr,
                      ymax = smdr + 2*sr), size = .25) +
  labs(x = "", y = "Standardized mean difference (SMD)") +
  geom_text(aes(x = 1.4, y = smdo,
                label = paste("n[o]==", no)), col = "darkblue",
            parse = TRUE, size = 2.5,
            nudge_x = -.05) +
  geom_text(aes(x = 2.4, y = smdr,
                label = paste("n[r]==", nr)), col = "darkblue",
            parse = TRUE, size = 2.5,
            nudge_x = -.05) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5),
        # panel.margin = unit(-1, "lines"),
        strip.background = element_rect(fill = alpha("tan", .4)),
        axis.text = element_text(size = 8))

# TODO: some replications are missing, e.g. id == "(37, 2, 2, 1)"
# what should we do with it?

@
\caption{
  Standardized mean difference (SMD) effect estimates with 95\% confidence
  interval for the ``null findings'' (with $p_{o} > 0.05$) and their replication
  studies from the Reproducibility Project: Cancer Biology \citep{Errington2021}.
  The identifier above each plot indicates (Original paper number, Experiment
  number, Effect number, Internal replication number). Additionally, the
  original effect size type is indicated, while all effect sizes were
  transformed to the SMD scale. %The data were downloaded from \url{https://doi.org/10.17605/osf.io/e5nvr}.
  % The relevant variables were
  % extracted from the file ``\texttt{RP\_CB Final Analysis - Effect level
    % data.csv}''.
  The original ($n_o$) and replication ($n_r$) sample sizes are indicated in
  each plot, where the sample size represents the total sample size of the two
  groups being compared, as retrieved from the codebook.}
\label{fig:nullfindings}
\end{figure}


\section{Dealing with original non-significant findings in replication projects}

\subsection{Equivalence Design}
For many years, equivalence designs have been used in clinical trials to assess
whether a new drug, which might be cheaper or have fewer side effects, is
equivalent to a drug already on the market [some general REF]. Essentially,
this type of design tests whether the difference between the effects of the two
treatments or interventions is smaller than a predefined margin/threshold.
Turning back to the replication context and our example ....
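
As a minimal sketch of how such an assessment could look in the replication
setting (assuming a normal approximation on the SMD scale and a purely
illustrative equivalence margin of $\Delta = 0.3$; in practice the margin needs
to be justified on subject-matter grounds), two one-sided tests (TOST) can be
combined as follows:

<< "equivalence-sketch", echo = TRUE, eval = FALSE >>=
## Minimal TOST sketch on the SMD scale under a normal approximation.
## smd and se: replication effect estimate and its standard error
## margin: purely illustrative equivalence margin Delta
tostSMD <- function(smd, se, margin = 0.3, alpha = 0.05) {
    pLower <- pnorm(q = (smd + margin)/se, lower.tail = FALSE) # H0: SMD <= -Delta
    pUpper <- pnorm(q = (smd - margin)/se, lower.tail = TRUE)  # H0: SMD >= +Delta
    pTOST <- max(pLower, pUpper) # equivalence declared if pTOST <= alpha
    c(pTOST = pTOST, equivalent = as.numeric(pTOST <= alpha))
}
tostSMD(smd = 0.05, se = 0.12) # hypothetical replication estimate
@

Equivalently, equivalence at level $\alpha = 0.05$ is declared if the 90\%
confidence interval for the SMD lies entirely within $[-\Delta, \Delta]$.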



\subsection{Bayesian Hypothesis Testing}
Bayesian hypothesis testing provides a framework in which the distinction
between absence of evidence and evidence of absence is more natural.
The central quantity is the Bayes factor \citep{Jeffreys1961, Good1958,
  Kass1995}, that is, the updating factor of the prior odds to the corresponding
posterior odds of the null hypothesis $H_{0}$ versus the alternative hypothesis
$H_{1}$
\begin{align*}
  \underbrace{\frac{\Pr(H_{0} \given \mathrm{data})}{\Pr(H_{1} \given
  \mathrm{data})}}_{\mathrm{Posterior~odds}}
  =  \underbrace{\frac{\Pr(H_{0})}{\Pr(H_{1})}}_{\mathrm{Prior~odds}}
  \times \underbrace{\frac{f(\mathrm{data} \given H_{0})}{f(\mathrm{data}
  \given H_{1})}}_{\mathrm{Bayes~factor}~\BF_{01}}.
\end{align*}
As such, the Bayes factor is an evidence measure which is inferentially relevant
to researchers as it quantifies how much the data have increased
($\BF_{01} > 1$) or decreased ($\BF_{01} < 1$) the odds of the null hypothesis
$H_{0}$ relative to the alternative $H_{1}$. Bayes factors are symmetric
($\BF_{01} = 1/\BF_{10}$), so if a Bayes factor is oriented toward the null
hypothesis ($\BF_{01}$), it can easily be transformed to a Bayes factor oriented
toward the alternative ($\BF_{10}$), and vice versa.

The data thus provide evidence for the null hypothesis if the Bayes factor is
larger than one ($\BF_{01} > 1$), whereas a Bayes factor around one indicates
absence of evidence for either hypothesis ($\BF_{01} \approx 1$).
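
In the replication setting, a natural choice is the replication Bayes factor
under normality \citep{Verhagen2014}, as implemented in the \texttt{BFr}
function defined in the setup chunk, which contrasts $H_{0}$ (no effect) with
an alternative $H_{1}$ informed by the original study. A minimal illustration
with purely hypothetical numbers:

<< "bf-illustration", echo = TRUE, eval = FALSE >>=
## Hypothetical example: original SMD of 0.2 (standard error 0.1) and a
## replication SMD of 0.02 (standard error 0.05); BF01 > 1 indicates evidence
## for H0 relative to the alternative informed by the original study
BFr(to = 0.2, tr = 0.02, so = 0.1, sr = 0.05)
@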

% Bayes factor have also been proposed for the replication setting. Specifically,
% the replication Bayes factor \citep{Verhagen2014}.


\begin{figure}[!htb]
<< "plot-null-findings-rpcb-br", fig.height = 8.5 >>=
ggplot(data = rpcbNull) +
    facet_wrap(~ id, scales = "free", ncol = 4) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.5) +
    geom_pointrange(aes(x = "Original", y = smdo, ymin = smdo - 2*so,
                        ymax = smdo + 2*so)) +
    geom_pointrange(aes(x = "Replication", y = smdr, ymin = smdr - 2*sr,
                        ymax = smdr + 2*sr)) +
    geom_text(aes(x = "Replication", y = pmax(smdr + 2.1*sr, smdo + 2.1*so),
                  label = paste("'BF'['01']",
                                ifelse(BFrformat == "< 1/1000", "", "=="),
                                BFrformat)),
              parse = TRUE, size = 3,
              nudge_y = -0.5) +
    labs(x = "", y = "Standardized mean difference (SMD)") +
    theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major.x = element_blank(),
          strip.text = element_text(size = 8, margin = margin(4), vjust = 1.5),
          # panel.margin = unit(-1, "lines"),
          strip.background = element_rect(fill = alpha("tan", .4)),
          axis.text = element_text(size = 8))

@
\caption{Standardized mean difference (SMD) effect estimates with 95\%
  confidence intervals for the ``null findings'' ($p_{o} > 0.05$) and their
  replication studies from the Reproducibility Project: Cancer Biology
  \citep{Errington2021}, annotated with the replication Bayes factor
  $\BF_{01}$ quantifying the evidence for the null hypothesis of no effect
  against the alternative informed by the original study.}
\label{fig:nullfindings_BF}
\end{figure}


\bibliographystyle{apalikedoiurl}
\bibliography{bibliography}

\appendix

\section{Note on $p$-values}


\todo[inline]{SP: I have used the original $p$-values as reported in the data
  set to select the studies in the figure. I think in this way we have the data
  correctly identified, as the RPCB paper reports that there are 20 null findings
  in the ``All outcomes'' category. I wonder how they go from the all outcomes
  category to the ``effects'' category (15 null findings), perhaps pool the
  internal replications by meta-analysis? I think it would be better to stay in
  the all outcomes category, but of course it needs to be discussed. Also some
  of the $p$-values were probably computed in a different way than under
  normality (e.g., the $p$-value from (47, 1, 6, 1) under normality is clearly
  significant).}
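
For reference, the recomputation ``under normality'' simply applies a normal
approximation to the reported SMD estimates and standard errors; a minimal
example with hypothetical numbers:

<< "pvalue-normality-sketch", echo = TRUE, eval = FALSE >>=
## Hypothetical example: SMD estimate of 0.5 with standard error 0.2
2*pnorm(q = abs(0.5/0.2), lower.tail = FALSE) # two-sided p of about 0.012
@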

\begin{figure}[!htb]
<< "plot-p-values", fig.height = 3.5 >>=
## check discrepancy between reported and recomputed p-values for null results
pbreaks <- c(0.005, 0.02, 0.05, 0.15, 0.4)
ggplot(data = rpcbNull, aes(x = po, y = po2)) +
    geom_abline(intercept = 0, slope = 1, alpha = 0.2) +
    geom_vline(xintercept = 0.05, alpha = 0.2, lty = 2) +
    geom_hline(yintercept = 0.05, alpha = 0.2, lty = 2) +
    geom_point(alpha = 0.8, shape = 21, fill = "darkgrey") +
    geom_label_repel(data = filter(rpcbNull, po2 < 0.05),
                     aes(x = po, y = po2, label = id), alpha = 0.8, size = 3,
                     min.segment.length = 0, box.padding = 0.7) +
    labs(x = bquote(italic(p["o"]) ~ "(reported)"),
         y =  bquote(italic(p["o"]) ~ "(recomputed under normality)")) +
    scale_x_log10(breaks = pbreaks, label = scales::percent) +
    scale_y_log10(breaks = pbreaks, labels = scales::percent) +
    coord_fixed(xlim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1),
                ylim = c(min(c(rpcbNull$po2, rpcbNull$po)), 1)) +
    theme_bw() +
    theme(panel.grid.minor = element_blank())


@
\caption{Two-sided $p$-values as reported versus recomputed under normality,
  from original studies declared as ``null findings'' ($p_{o} > 0.05$) in the
  Reproducibility Project: Cancer Biology \citep{Errington2021}.}
\end{figure}

<< "sessionInfo1", eval = Reproducibility, results = "asis" >>=
## print R sessionInfo to see system information and package versions
## used to compile the manuscript (set Reproducibility = FALSE, to not do that)
cat("\\newpage \\section*{Computational details}")
@

<< "sessionInfo2", echo = Reproducibility, results = Reproducibility >>=
cat(paste(Sys.time(), Sys.timezone(), "\n"))
sessionInfo()
@

\end{document}