diff --git a/bibliography.bib b/bibliography.bib
index b469b35cd3b9856c35d200c29f880c6e2f321d92..8cb90ba871e456d871c8e172dd896f96f7f5e0b5 100644
--- a/bibliography.bib
+++ b/bibliography.bib
@@ -1,3 +1,44 @@
+@article{Shan2017,
+  doi = {10.7554/elife.25306},
+  year = {2017},
+  volume = {6},
+  author = {Xiaochuan Shan and Juan Jose Fung and Alan Kosaka and Gwenn Danet-Desnoyers},
+  title = {Replication Study: Inhibition of {BET} recruitment to chromatin as an effective treatment for {MLL}-fusion leukaemia},
+  journal = {{eLife}}
+}
+
+@Article{Rufibach2009,
+    title = {{reporttools}: {R} Functions to Generate {\LaTeX} Tables of Descriptive Statistics},
+    author = {Kaspar Rufibach},
+    journal = {Journal of Statistical Software, Code Snippets},
+    year = {2009},
+    volume = {31},
+    number = {1},
+    doi = {10.18637/jss.v031.c01},
+}
+
+@article{Greenland2012,
+  doi = {10.1016/j.annepidem.2012.02.007},
+  year = {2012},
+  volume = {22},
+  number = {5},
+  pages = {364--368},
+  author = {Sander Greenland},
+  title = {Nonsignificance Plus High Power Does Not Imply Support for the Null Over the Alternative},
+  journal = {Annals of Epidemiology}
+}
+
+@article{Hoenig2001,
+  doi = {10.1198/000313001300339897},
+  year = {2001},
+  volume = {55},
+  number = {1},
+  pages = {19--24},
+  author = {John M Hoenig and Dennis M Heisey},
+  title = {The Abuse of Power: The Pervasive Fallacy of Power Calculations for Data Analysis},
+  journal = {The American Statistician}
+}
+
 @article{Morey2011,
   doi = {10.1037/a0024377},
   year = {2011},
diff --git a/rsAbsence.Rnw b/rsAbsence.Rnw
index 2f2a1f7e8b766613a26cdf494a90f2ab3563a92b..30c9e43a64c1ea5be6b4bd0902af39c452fec533 100755
--- a/rsAbsence.Rnw
+++ b/rsAbsence.Rnw
@@ -26,7 +26,11 @@
 
 \title{\vspace{-4em}
 \textbf{Meta-research:\\Replication studies of original ``null results'' -- \\ Absence of evidence or evidence of absence?}}
-\author{{\bf Rachel Heyard, Samuel Pawel, Charlotte Micheloud, Leonhard Held} \\
+\author{{\bf Samuel Pawel\textsuperscript{*},
+    Rachel Heyard\textsuperscript{*},
+    Charlotte Micheloud,
+    Leonhard Held} \\
+  * contributed equally \\
   Epidemiology, Biostatistics and Prevention Institute \\
   Center for Reproducible Science \\
   University of Zurich}
@@ -72,6 +76,7 @@ Reproducibility <- TRUE
 ## packages
 library(ggplot2) # plotting
 library(dplyr) # data manipulation
+library(reporttools) # reporting of p-values
 
 ## the replication Bayes factor under normality
 BFr <- function(to, tr, so, sr) {
@@ -131,8 +136,8 @@ BF01 <- function(estimate, se, null = 0, unitvar = 4) {
         projects, non-significant results in both the original and the
         replication study have been interpreted as a ``replication success''.
         Here we discuss the logical problems with this approach. It does not
-        ensure that the studies provide evidence for the absence of an
-        effect, and
+        ensure that the studies provide evidence for the absence of an effect
+        and
         % Because the null hypothesis of the statistical tests in both studies
         % is misaligned,
         ``replication success'' can virtually always be achieved if the sample
@@ -156,24 +161,29 @@ BF01 <- function(estimate, se, null = 0, unitvar = 4) {
 % data as not showing evidence for a meaningful relationship or impact of an
 % intervention.
 
+
+
 \section{Introduction}
 
 The misconception that a statistically non-significant result indicates evidence
 for the absence of an effect is unfortunately widespread \citep{Altman1995}.
-Whether or not such a ``null result'' -- typically characterized by a $p$-value
-of $p > 5\%$ for the null hypothesis of an absent \mbox{effect --} provides
-evidence for the absence of an effect depends on the statistical power of the
-study.
-\todo{CM: previous sentence might be misleading, let's discuss it.}
-For example, if the sample size of the study is chosen to detect an
-effect with a power of 80\%, null results will occur incorrectly 20\% of the
-time when there is indeed a true effect. Conversely, if the power of the study
-is lower, null results will occur more often. In general, the lower the power of
-a study, the greater the ambiguity of a null result. To put a null result in
-context, it is therefore critical to know whether the study was adequately
-powered. Furthermore, if the goal of a study is to quantify the evidence for the
-absence of an effect, more appropriate methods designed for this task, such as
-equivalence testing or Bayes factors, should be used.
+% Whether or not such a ``null result'' -- typically characterized by a $p$-value
+% of $p > 5\%$ for the null hypothesis of an absent \mbox{effect --} provides
+% evidence for the absence of an effect depends on the statistical power of the
+% study.
+Such a ``null result'' -- typically characterized by a $p$-value of $p > 5\%$
+for the null hypothesis of an absent effect -- may also occur if an effect is
+actually present. For example, if the sample size of a study is chosen to detect
+an assumed effect with a power of 80\%, null results will incorrectly occur 20\%
+of the time when the assumed effect is actually present. Conversely, if the
+power of the study is lower, null results will occur more often. In general, the
+lower the power of a study, the greater the ambiguity of a null result. To put a
+null result in context, it is therefore critical to know whether the study was
+adequately powered and under what assumed effect the power was calculated
+\citep{Hoenig2001, Greenland2012}. However, if the goal of a study is to
+explicitly quantify the evidence for the absence of an effect, more appropriate
+methods designed for this task, such as equivalence testing or Bayes factors,
+should be used from the outset.
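+
+For illustration, the probability of obtaining a null result when an effect is
+actually present can be computed with a standard power calculation; the
+following sketch assumes a two-sample $t$-test and a hypothetical standardized
+mean difference of 0.5.
+<< "power-null-result-sketch", eval = FALSE, echo = TRUE >>=
+## probability of a null result (p > 5%) when the assumed effect is present,
+## assuming a two-sample t-test and a hypothetical SMD of 0.5 (illustration)
+n80 <- power.t.test(delta = 0.5, sd = 1, sig.level = 0.05, power = 0.8)$n
+## with this sample size, null results still occur 1 - 0.8 = 20% of the time
+## with half the sample size, power drops and null results become more frequent
+1 - power.t.test(n = n80/2, delta = 0.5, sd = 1, sig.level = 0.05)$power
+@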
 
 % two systematic reviews that I found which show that animal studies are very
 % much underpowered on average \citep{Jennions2003,Carneiro2018}
@@ -196,9 +206,9 @@ equivalence testing or Bayes factors, should be used.
 The contextualization of null results becomes even more complicated in the
 setting of replication studies. In a replication study, researchers attempt to
 repeat an original study as closely as possible in order to assess whether
-similar results can be obtained with new data \citep{NSF2019}. There have been various
-large-scale replication projects in the biomedical and social sciences in the
-last decade \citep[among
+similar results can be obtained with new data \citep{NSF2019}. There have been
+various large-scale replication projects in the biomedical and social sciences
+in the last decade \citep[among
 others]{Prinz2011,Begley2012,Klein2014,Opensc2015,Camerer2016,Camerer2018,Klein2018,Cova2018,Errington2021}.
 Most of these projects suggested alarmingly low replicability rates across a
 broad spectrum of criteria for quantifying replicability. While most of these
@@ -222,27 +232,26 @@ the absence of an effect. It is then unclear what exactly the goal of the
 replication should be -- to replicate the inconclusiveness of the original
 result? On the other hand, if the original study was adequately powered, a
 non-significant result may indeed provide some evidence for the absence of an
-effect, so that the goal of the replication is clearer.
-\todo{CM: maybe add that additional analyses are required?}
-However, the criterion
-does not distinguish between these two cases. Second, with this criterion
-researchers can virtually always achieve replication success by conducting two
-studies with very small sample sizes, such that the $p$-values are
-non-significant and the result is inconclusive. This is because the null
-hypothesis under which the $p$-values are computed is misaligned with the goal
-of inference, which is to quantify the evidence for the absence of an effect. We
-will discuss methods that are better aligned with this inferential goal in
-Section~\ref{sec:methods}. Third, the criterion does not control the error of
-falsely claiming the absence of an effect at some predetermined rate. This is in
-contrast to the standard replication success criterion of requiring significance
-from both studies \citep[also known as the two-trials rule, see chapter 12.2.8
-in][]{Senn2008}, which ensures that the error of falsley claiming the presence
-of an effect is controlled at a rate equal to the squared significance level
-(for example, $5\% \times 5\% = 0.25\%$ for a $5\%$ significance level). The
-non-significance criterion may be intended to complement the two-trials rule for
-null results, but it fails to do so in this respect, which may be important to
-regulators, funders, and researchers. We will now demonstrate these issues and
-potential solutions using the null results from the RPCB.
+effect when analyzed with appropriate methods, so that the goal of the
+replication is clearer. However, the criterion does not distinguish between
+these two cases. Second, with this criterion researchers can virtually always
+achieve replication success by conducting two studies with very small sample
+sizes, such that the $p$-values are non-significant and the result is
+inconclusive. This is because the null hypothesis under which the $p$-values are
+computed is misaligned with the goal of inference, which is to quantify the
+evidence for the absence of an effect. We will discuss methods that are better
+aligned with this inferential goal in Section~\ref{sec:methods}. Third, the
+criterion does not control the error of falsely claiming the absence of an
+effect at some predetermined rate. This is in contrast to the standard
+replication success criterion of requiring significance from both studies
+\citep[also known as the two-trials rule, see chapter 12.2.8 in][]{Senn2008},
+which ensures that the error of falsely claiming the presence of an effect is
+controlled at a rate equal to the squared significance level (for example,
+$5\% \times 5\% = 0.25\%$ for a $5\%$ significance level). The non-significance
+criterion may be intended to complement the two-trials rule for null results,
+but it fails to do so in this respect, which may be important to regulators,
+funders, and researchers. We will now demonstrate these issues and potential
+solutions using the null results from the RPCB.
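+
+The second issue can be illustrated with a small simulation sketch: assuming
+normally distributed effect estimates and a hypothetical true standardized mean
+difference of 0.5, two studies with only three observations per group meet the
+non-significance criterion most of the time, even though the effect is present.
+<< "non-significance-criterion-sketch", eval = FALSE, echo = TRUE >>=
+## simulate "replication success" by the non-significance criterion when a
+## hypothetical true SMD of 0.5 is present and both studies are very small
+set.seed(42)
+nsim <- 10000
+smd <- 0.5 # hypothetical true standardized mean difference
+n <- 3 # observations per group in original and replication study
+se <- sqrt(2/n) # approximate standard error of an SMD estimate
+po <- 2*(1 - pnorm(abs(rnorm(nsim, mean = smd, sd = se))/se)) # original p-values
+pr <- 2*(1 - pnorm(abs(rnorm(nsim, mean = smd, sd = se))/se)) # replication p-values
+mean(po > 0.05 & pr > 0.05) # proportion of "successes" despite the present effect
+@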
 
 
 
@@ -423,12 +432,10 @@ with confidence intervals from two RPCB study pairs. Both are ``null results''
 and meet the non-significance criterion for replication success (the two-sided
 $p$-values are greater than 5\% in both the original and the replication study),
 but intuition would suggest that these two pairs are very much different.
-\todo[inline]{RH: this data is really a mess. turns out for Dawson n represents the group size (n = 6 in https://osf.io/8acw4) while in Goetz it is the sample size of the whole experiment (n = 34 and 61 in https://osf.io/acg8s).}
 \begin{figure}[ht]
 << "2-example-studies", fig.height = 3.25 >>=
-## some evidence for absence of effect (when a really genereous margin Delta = 1
-## of a lenient BF = 3 threshold)
-## https://doi.org/10.7554/eLife.45120 I can't find the replication effect like reported in the data set :(
+## some evidence for absence of effect
+## https://doi.org/10.7554/eLife.45120 the replication effect as reported in
+## the data set could not be located in the publication, so we take it at face
+## value
 ## https://iiif.elifesciences.org/lax/45120%2Felife-45120-fig4-v1.tif/full/1500,/0/default.jpg
 study1 <- "(20, 1, 1, 1)"
 ## absence of evidence
@@ -439,6 +446,13 @@ study2 <- "(29, 2, 2, 1)"
 plotDF1 <- rpcbNull %>%
     filter(id %in% c(study1, study2)) %>%
     mutate(label = ifelse(id == study1, "Goetz et al. (2011)\nEvidence of absence", "Dawson et al. (2011)\nAbsence of evidence"))
+## RH: note on the sample sizes in the data set. For Dawson, n represents the
+## group size (n = 6 in https://osf.io/8acw4), while for Goetz it is the sample
+## size of the whole experiment (n = 34 and 61 in https://osf.io/acg8s). For
+## study 2 (Dawson), we therefore multiply by 2 to obtain the total sample
+## size, see Figure 5A https://doi.org/10.7554/eLife.25306.012
+plotDF1$no[plotDF1$id == study2] <- plotDF1$no[plotDF1$id == study2]*2
+plotDF1$nr[plotDF1$id == study2] <- plotDF1$nr[plotDF1$id == study2]*2
 conflevel <- 0.95
 ggplot(data = plotDF1) +
     facet_wrap(~ label) +
@@ -456,10 +470,10 @@ ggplot(data = plotDF1) +
                   label = paste("italic(n) ==", nr)), col = "darkblue",
               parse = TRUE, size = 3.8, hjust = 0) +
     geom_text(aes(x = 1.05, y = 3,
-                  label = paste("italic(p) ==", biostatUZH::formatPval(po))), col = "darkblue",
+                  label = paste("italic(p) ==", formatPval(po))), col = "darkblue",
               parse = TRUE, size = 3.8, hjust = 0) +
     geom_text(aes(x = 2.05, y = 3,
-                  label = paste("italic(p) ==", biostatUZH::formatPval(pr))), col = "darkblue",
+                  label = paste("italic(p) ==", formatPval(pr))), col = "darkblue",
               parse = TRUE, size = 3.8, hjust = 0) +
     labs(x = "", y = "Standardized mean difference (SMD)") +
     theme_bw() +
@@ -473,7 +487,8 @@ ggplot(data = plotDF1) +
   pairs which meet the non-significance replication success criterion from the
   Reproducibility Project: Cancer Biology \citep{Errington2021}. Shown are
   standardized mean difference effect estimates with \Sexpr{round(conflevel*100,
-    2)}\% confidence intervals.}
+    2)}\% confidence intervals, total sample size, and $p$-values for the null
+  hypothesis that the standardized mean difference is zero.}
 \end{figure}
 
 The original study from \citet{Dawson2011} and its replication both show large
@@ -545,13 +560,13 @@ ggplot(data = rpcbNull) + ## filter(rpcbNull, effectType %in% estypes)) +
     geom_text(aes(x = 0.46, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                   label = paste("italic(p)['TOST']",
                                 ifelse(ptosto < 0.0001, "", "=="),
-                                biostatUZH::formatPval(ptosto))),
+                                formatPval(ptosto))),
               col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
               vjust = 0.5) +
     geom_text(aes(x = 1.51, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
                   label = paste("italic(p)['TOST']",
                                 ifelse(ptostr < 0.0001, "", "=="),
-                                biostatUZH::formatPval(ptostr))),
+                                formatPval(ptostr))),
               col = "darkblue", parse = TRUE, size = 2.3, hjust = 0,
               vjust = 0.5) +
     geom_text(aes(x = 0.54, y = pmax(smdo + 2.5*so, smdr + 2.5*sr, 1.1*margin),
@@ -628,32 +643,29 @@ confidence interval for the effect is contained within the equivalence range
 one-sided tests (TOST) for the effect being smaller/greater than $+\Delta$
 and $-\Delta$ are significant at level $\alpha$, respectively.
 A quantitative measure of evidence for the absence of an effect is then given
-by the maximum of the two one-sided $p$-values.
+by the maximum of the two one-sided $p$-values (the TOST $p$-value).
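+
+Assuming approximately normal effect estimates, the TOST $p$-value can be
+computed as in the following sketch (the function and the numbers in the
+example are illustrative and not part of our analysis code).
+<< "tost-p-value-sketch", eval = FALSE, echo = TRUE >>=
+## TOST p-value for an effect estimate with standard error se and margin Delta,
+## assuming an approximately normal effect estimate (illustrative sketch)
+pTOST <- function(estimate, se, margin) {
+    p1 <- pnorm((estimate - margin)/se) # H0: effect >= +margin
+    p2 <- pnorm((estimate + margin)/se, lower.tail = FALSE) # H0: effect <= -margin
+    pmax(p1, p2)
+}
+pTOST(estimate = 0.2, se = 0.3, margin = 1) # hypothetical numbers
+@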
 
-\todo{CM: maybe more logical to first discuss margin and then mention the
-TOST $p$-values in Fig~\ref{fig:nullfindings}.}
 Returning to the RPCB data, Figure~\ref{fig:nullfindings} shows the standardized
 mean difference effect estimates with \Sexpr{round(conflevel*100, 2)}\%
-confidence intervals along with the TOST $p$-values for the 20 study pairs with
-quantitative null results in the original study ($p > 5\%$). The dotted red
-lines represent an equivalence range for the margin $\Delta = \Sexpr{margin}$.
-This margin is rather lax compared to the margins typically used in clinical
-research; we chose it primarily for illustrative purposes and because effect
-sizes in preclinical research are typically much larger than in clinical
-research. In practice, the margin should be determined on a case-by-case basis
-by researchers who are familiar with the subject matter. However, even with this
-generous margin, only four of the twenty study pairs -- one of them being the
-previously discussed example from \citet{Goetz2011} -- are able to establish
-equivalence at the 5\% level in the sense that both the original and the
-replication 90\% confidence interval fall within the equivalence range or both
-TOST $p$-values are smaller than $5\%$. For the remaining 16 studies -- for
-instance, the previously discussed example from \citet{Dawson2011} -- the
-situation remains inconclusive and there is neither evidence for the absence or
-presence of the effect.
+confidence intervals for the 20 study pairs with quantitative null results in
+the original study ($p > 5\%$). The dotted red lines represent an equivalence
+range for the margin $\Delta = \Sexpr{margin}$, for which the shown TOST
+$p$-values are computed. This margin is rather lax compared to the margins
+typically used in clinical research; we chose it primarily for illustrative
+purposes and because effect sizes in preclinical research are typically much
+larger than in clinical research. In practice, the margin should be determined
+on a case-by-case basis by researchers who are familiar with the subject matter.
+However, even with this generous margin, only four of the twenty study pairs --
+one of them being the previously discussed example from \citet{Goetz2011} -- are
+able to establish equivalence at the 5\% level in the sense that both the
+original and the replication 90\% confidence interval fall within the
+equivalence range (or equivalently that their TOST $p$-values are smaller than
+$5\%$). For the remaining sixteen studies -- for instance, the previously discussed
+example from \citet{Dawson2011} -- the situation remains inconclusive and there
+is neither evidence for the absence nor the presence of the effect.
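+
+The correspondence between the confidence interval and the TOST $p$-value
+formulation of the equivalence test can be checked directly; the following
+sketch uses hypothetical numbers.
+<< "equivalence-correspondence-sketch", eval = FALSE, echo = TRUE >>=
+## a 90% CI within (-margin, margin) corresponds to a TOST p-value below 5%
+## (hypothetical numbers, assuming an approximately normal effect estimate)
+estimate <- 0.2; se <- 0.3; margin <- 1; alpha <- 0.05
+ci <- estimate + c(-1, 1)*qnorm(p = 1 - alpha)*se # 90% confidence interval
+ci[1] > -margin & ci[2] < margin
+max(pnorm((estimate - margin)/se),
+    pnorm((estimate + margin)/se, lower.tail = FALSE)) < alpha # same conclusion
+@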
 
 
 \subsection{Bayesian hypothesis testing}
-\todo{CM: section a bit long?}
 The distinction between absence of evidence and evidence of absence is naturally
 built into the Bayesian approach to hypothesis testing. The central measure of
 evidence is the Bayes factor \citep{Kass1995}, which is the updating factor of
@@ -708,26 +720,28 @@ $H_{1}$. There are several more advanced prior distributions that could be used
 here, and they should ideally be specified for each effect individually based on
 domain knowledge. The normal unit-information prior (with a standard deviation
 of 2 for SMDs) is only a reasonable default choice, as it implies that small to
-large effects are plausible under the alternative. We see that in most cases
-there is no substantial evidence for either the absence or the presence of an
-effect, as with the equivalence tests. The Bayes factors for the two previously
-discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are consistent
-with our intuititons -- there is indeed some evidence for the absence of an
-effect in \citet{Goetz2011}, while there is even slightly more evidence for the
-presence of an effect in \citet{Dawson2011}, though the Bayes factor is very
-close to one due to the small sample sizes. If we use a lenient Bayes factor
-threshold of $\BF_{01} > 3$ to define evidence for the absence of the effect,
-only one of the twenty study pairs meets this criteiron in both the original and
-replication study. There is one interesting case -- the rightmost plot in the
-fourth row (48, 2, 4, 1) -- where the Bayes factor is qualitatively different
-from the equivalence test, revealing a fundamental difference between the two
-approaches. The Bayes factor is concerned with testing whether the effect is
-\emph{exactly zero}, whereas the equivalence test is concerned with whether the
-effect is within an \emph{interval around zero}. Due to the very large sample
-size in this replication study, the data are incompatible with an exactly zero
-effect, but compatible with effects within the equivalence range. Apart from
-this example, however, the approaches lead to the same qualitative conclusion --
-most RPCB null results are highly ambiguous.
+large effects are plausible under the alternative. We see that in most
+cases there is no substantial evidence for either the absence or the presence of
+an effect, as with the equivalence tests. The Bayes factors for the two
+previously discussed examples from \citet{Goetz2011} and \citet{Dawson2011} are
+consistent with our intuitions -- there is indeed some evidence for the absence
+of an effect in \citet{Goetz2011}, while there is even slightly more evidence
+for the presence of an effect in \citet{Dawson2011}, though the Bayes factor is
+very close to one due to the small sample sizes. If we use a lenient Bayes
+factor threshold of $\BF_{01} > 3$ to define evidence for the absence of the
+effect, only one of the twenty study pairs meets this criterion in both the
+original and replication study.
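+
+Under these assumptions, the Bayes factor $\BF_{01}$ contrasting
+$H_{0}\colon \mathrm{SMD} = 0$ with $H_{1}\colon \mathrm{SMD} \sim \mathrm{N}(0, 2^{2})$
+reduces to a ratio of two normal densities, as in the following sketch
+(hypothetical numbers; details may differ from the \texttt{BF01} function in
+our analysis code).
+<< "bf01-sketch", eval = FALSE, echo = TRUE >>=
+## Bayes factor BF01 for H0: SMD = 0 against H1: SMD ~ N(0, 4), i.e. a normal
+## unit-information prior with standard deviation 2, assuming an approximately
+## normal effect estimate with standard error se (illustrative sketch)
+BF01sketch <- function(estimate, se, unitvar = 4) {
+    dnorm(x = estimate, mean = 0, sd = se) /
+        dnorm(x = estimate, mean = 0, sd = sqrt(se^2 + unitvar))
+}
+BF01sketch(estimate = 0.1, se = 0.5) # hypothetical numbers
+@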
+
+Among the twenty RPCB null results, there is one interesting case (the rightmost
+plot in the fourth row (48, 2, 4, 1)) where the Bayes factor is qualitatively
+different from the equivalence test, revealing a fundamental difference between
+the two approaches. The Bayes factor is concerned with testing whether the
+effect is \emph{exactly zero}, whereas the equivalence test is concerned with
+whether the effect is within an \emph{interval around zero}. Due to the very
+large sample size in this replication study, the data are incompatible with an
+exactly zero effect, but compatible with effects within the equivalence range.
+Apart from this example, however, the approaches lead to the same qualitative
+conclusion -- most RPCB null results are highly ambiguous.
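+
+A sketch with hypothetical numbers illustrates this difference: for a small but
+precisely estimated effect, the Bayes factor indicates evidence against an
+exactly zero effect, whereas the TOST $p$-value indicates evidence for
+equivalence.
+<< "point-null-vs-equivalence-sketch", eval = FALSE, echo = TRUE >>=
+## a small but precisely estimated effect (hypothetical numbers)
+estimate <- 0.1; se <- 0.02; margin <- 1; unitvar <- 4
+## point-null Bayes factor: data are incompatible with an exactly zero effect
+dnorm(estimate, mean = 0, sd = se) /
+    dnorm(estimate, mean = 0, sd = sqrt(se^2 + unitvar)) # BF01 much smaller than 1
+## TOST p-value: data are compatible with effects inside the equivalence range
+max(pnorm((estimate - margin)/se),
+    pnorm((estimate + margin)/se, lower.tail = FALSE)) # much smaller than 0.05
+@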
 % regarding the presence or absence of an effect.
 
 
@@ -747,12 +761,15 @@ While the equivalence test and the Bayes factor are two principled methods for
 analyzing original and replication studies with null results, they are not the
 only possible methods for doing so. For instance, the reverse-Bayes approach
 from \citet{Micheloud2022} specifically tailored to equivalence testing in the
-replication setting may lead to more appropriate inferences as it also take into
-account the compatibility of the effect estimates from original and replication
-studies. In addition, there are various more advanced Bayesian hypothesis
-testing procedures specifically designed to quantify the evidence for the
-absence of an effect \citep{Johnson2010, Morey2011} that could potentially
-improve the efficiency of the Bayes factor approach. Finally, the design of
+replication setting may lead to more appropriate inferences as it also takes
+into account the compatibility of the effect estimates from original and
+replication studies. In addition, there are various more advanced Bayes factor
+procedures specifically designed to quantify the evidence for the absence of an
+effect \citep{Johnson2010, Morey2011} that could potentially improve the
+efficiency of the Bayes factor approach. For both equivalence testing and Bayes
+factor approaches, it is important that the parameters of the procedures -- the
+equivalence margin and the prior distribution -- are specified independently of
+the data, so ideally before the studies are conducted. Finally, the design of
 replication studies should align with the planned analysis \citep{Anderson2017,
   Anderson2022, Micheloud2020,
   Pawel2022c}. % The RPCB determined the sample size
@@ -762,14 +779,14 @@ If the goal of study is to find evidence for the absence of an effect, the
 replication sample size should also be determined so that the study has adequate
 power to make conclusive inferences regarding the absence of the effect.
 
-\todo{CM: mention that margin + prior distribution should be chosen
-before first/second study is conducted?}
+% \todo{CM: mention that margin + prior distribution should be chosen
+% before first/second study is conducted?}
 
 \section*{Acknowledgements}
 We thank the contributors of the RPCB for their tremendous efforts and for
 making their data publicly available. We thank Maya Mathur for helpful advice
 with the data preparation. This work was supported by the Swiss National Science
-Foundation (grants \#189295 and \#XXXXXX). We declare no conflict of interest.
+Foundation (grants \#189295 and \#XXXXXX).
 
 \section*{Conflict of interest}
 We declare no conflict of interest.
@@ -777,7 +794,7 @@ We declare no conflict of interest.
 
 \section*{Data and software}
 The data from the RPCB were obtained by downloading the files from
-\url{https://github.com/mayamathur/rpcb} commit a1e0c63 and executing the R
+\url{https://github.com/mayamathur/rpcb} (commit a1e0c63) and executing the R
 script \texttt{Code/data\_prep.R} with the line 632 commented out so that also
 original studies with null finding are included. This then produced the file
 \texttt{prepped\_outcome\_level\_data.csv} which was used for the subsequent
@@ -786,6 +803,11 @@ data set differ in some cases from those in the data set available at
 \url{https://doi.org/10.17605/osf.io/e5nvr}, which is cited in
 \citet{Errington2021}. We used this particular version of the data set because
 it was recommended to us by the RPCB statistician (Maya Mathur) upon request.
+For the original study from \citet{Dawson2011} and its replication
+\citep{Shan2017}, the sample sizes $n = 3$ in the data set correspond to the
+group sample sizes (see Figure 5A in the replication study,
+\url{https://doi.org/10.7554/eLife.25306.012}), which is why we report the
+total sample sizes of $n = 6$ in Figure~\ref{fig:2examples}.
 
 The code and data to reproduce our analyses is openly available at
 \url{https://gitlab.uzh.ch/samuel.pawel/rsAbsence}. A snapshot of the repository
@@ -793,8 +815,9 @@ at the time of writing is available at
 \url{https://doi.org/10.5281/zenodo.XXXXXX}. We used the statistical programming
 language R version \Sexpr{paste(version$major, version$minor, sep = ".")}
 \citep{R} for analyses. The R packages \texttt{ggplot2} \citep{Wickham2016},
-\texttt{dplyr} \citep{Wickham2022}, and \texttt{knitr} \citep{Xie2022} were used
-for plotting, data preparation, and dynamic reporting, respectively.
+\texttt{dplyr} \citep{Wickham2022}, \texttt{knitr} \citep{Xie2022}, and
+\texttt{reporttools} \citep{Rufibach2009} were used for plotting, data
+preparation, dynamic reporting, and formatting of $p$-values, respectively.