From 413222a5fb49da362b235058c004718d58d9ed81 Mon Sep 17 00:00:00 2001 From: SamCH93 <samuel.pawel@gmail.com> Date: Thu, 23 Mar 2023 16:29:28 +0100 Subject: [PATCH] add conditional equivalence testing --- bibliography.bib | 32 ++++++++++++++++++++++++++++++++ rsAbsence.Rnw | 48 ++++++++++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/bibliography.bib b/bibliography.bib index 208fc19..6a88e9c 100644 --- a/bibliography.bib +++ b/bibliography.bib @@ -1,3 +1,35 @@ +@article{Campbell2021, + doi = {10.15626/mp.2020.2506}, + year = {2021}, + volume = {5}, + author = {Harlan Campbell and Paul Gustafson}, + title = {What to make of equivalence testing with a post-specified margin?}, + journal = {Meta-Psychology} +} + +@article{Hauck1986, + doi = {10.1002/sim.4780050302}, + year = {1986}, + volume = {5}, + number = {3}, + pages = {203--209}, + author = {Walter W. Hauck and Sharon Anderson}, + title = {A proposal for interpreting and reporting negative studies}, + journal = {Statistics in Medicine} +} + +@article{Campbell2018, + doi = {10.1371/journal.pone.0195145}, + year = {2018}, + volume = {13}, + number = {4}, + pages = {e0195145}, + author = {Harlan Campbell and Paul Gustafson}, + editor = {Daniele Marinazzo}, + title = {Conditional equivalence testing: An alternative remedy for publication bias}, + journal = {{PLOS} {ONE}} +} + @article{Rafi2020, doi = {10.1186/s12874-020-01105-9}, year = {2020}, diff --git a/rsAbsence.Rnw b/rsAbsence.Rnw index 8635bd8..12ccb14 100755 --- a/rsAbsence.Rnw +++ b/rsAbsence.Rnw @@ -481,18 +481,20 @@ whether a new treatment -- typically cheaper or with fewer side effects than the established treatment -- is practically equivalent to the established treatment \citep{Westlake1972,Schuirmann1987}. The method can also be used to assess whether an effect is practically equivalent to the value of an absent effect, -usually zero. 
The main challenge is to specify the margin $\Delta > 0$ that -defines an equivalence range $[-\Delta, +\Delta]$ in which an effect is -considered as absent for practical purposes. The goal is then to reject the -composite null hypothesis that the true effect is outside the equivalence range. -To ensure that the null hypothesis is falsely rejected at most -$\alpha \times 100\%$ of the time, one either rejects it if the -$(1-2\alpha)\times 100\%$ confidence interval for the effect is contained within -the equivalence range (for example, a 90\% confidence interval for -$\alpha = 5\%$), or if two one-sided tests (TOST) for the effect being -smaller/greater than $+\Delta$ and $-\Delta$ are significant at level $\alpha$, -respectively. A quantitative measure of evidence for the absence of an effect is -then given by the maximum of the two one-sided $p$-values (the TOST $p$-value). +usually zero. Using equivalence testing as a remedy for non-significant results +has been suggested by several authors \citep{Hauck1986, Campbell2018}. The main +challenge is to specify the margin $\Delta > 0$ that defines an equivalence +range $[-\Delta, +\Delta]$ in which an effect is considered as absent for +practical purposes. The goal is then to reject the composite null hypothesis +that the true effect is outside the equivalence range. To ensure that the null +hypothesis is falsely rejected at most $\alpha \times 100\%$ of the time, one +either rejects it if the $(1-2\alpha)\times 100\%$ confidence interval for the +effect is contained within the equivalence range (for example, a 90\% confidence +interval for $\alpha = 5\%$), or if two one-sided tests (TOST) for the effect +being smaller/greater than $+\Delta$ and $-\Delta$ are significant at level +$\alpha$, respectively. A quantitative measure of evidence for the absence of an +effect is then given by the maximum of the two one-sided $p$-values (the TOST +$p$-value). 
Returning to the RPCB data, Figure~\ref{fig:nullfindings} shows the standardized mean difference effect estimates with \Sexpr{round(conflevel*100, 2)}\% @@ -607,13 +609,19 @@ that the parameters of the procedure (the equivalence margin and the prior distribution) are specified independently of the data, ideally before the studies are conducted. Typically, however, the original studies were designed to find evidence for the presence of an effect, and the goal of replicating the -``null finding'' was formulated only after failure to do so. Such a change in -the inferential objective of the study is particularly problematic for the -frequentist equivalence testing approach, as it affects the error rates of the -procedure \citep{CPMP2001}. Since error rate control is no longer ensured in -this case, the TOST $p$-values should not be used as dichotomous decision tools, -but rather as descriptive measures of compatibility between the data and effects -outside the equivalence region \citep{Amrhein2019, Rafi2020, Greenland2023}. +``null result'' was formulated only after failure to do so. \citet{Campbell2021} +discuss various approaches to post-hoc specification of equivalence margins, +such as motivating it using data from previous studies or using field +conventions. \citet{Hauck1986} propose a sensitivity analysis approach in the +form of plotting the TOST $p$-value against a range of possible margins +(``equivalence curves''). Post-hoc specification of a prior distribution for a +Bayes factor may likewise be based on historical data, field conventions, or +assessed visually with sensitivity analyses. +% As error rate control may no longer be ensured in this case, the TOST +% $p$-values should not be used as dichotomous decision tools, but rather as +% descriptive measures of compatibility between the data and effects outside the +% equivalence region \citep{Amrhein2019, Rafi2020, Greenland2023}.
+ While the equivalence test and the Bayes factor are two principled methods for analyzing original and replication studies with null results, they are not the @@ -649,7 +657,7 @@ We declare no conflict of interest. The data from the RPCB were obtained by downloading the files from \url{https://github.com/mayamathur/rpcb} (commit a1e0c63) and executing the R script \texttt{Code/data\_prep.R} with the line 632 commented out so that also -original studies with null finding are included. This then produced the file +original studies with null results are included. This then produced the file \texttt{prepped\_outcome\_level\_data.csv} which was used for the subsequent analyses. The effect estimates and standard errors on SMD scale provided in this data set differ in some cases from those in the data set available at -- GitLab