Skip to content
Snippets Groups Projects
preprocess-rpcb-data.R 5.69 KiB
## libraries
library(dplyr) # for data wrangling
library(ggplot2) # to reproduce plots from publication

## RPCB data with SMD transformed effect sizes but before aggregating internal replications
## https://github.com/mayamathur/rpcb/tree/master/Prepped%20data/Intermediate%20work
## dat <- read.csv("https://raw.githubusercontent.com/mayamathur/rpcb/master/Prepped%20data/Intermediate%20work/intermediate_dataset_step3.csv")
dat <- read.csv(file = "intermediate_dataset_step3.csv")
dat2 <- dat %>%
    ## only take studies with quantitative effect information available
    filter(!is.na(origDirection),
           quantPair == TRUE)

## organize data set for reanalysis
datClean <- dat2 %>%
    mutate(
        ## define identifier for paper, experiment, effect, internal replication
        id = paste0("(", pID, ", ", eID, ", ", oID, ", ", internalID, ")")
    ) %>%
    select(
        osf = OSF.project.link,
        id,
        paper = pID,
        experiment = eID,
        effect = oID,
        internalReplication = internalID,
        resulto = origDirection,
        resultr = repDirection,
        ## effect sizes and standard errors on SMD scale
        smdo = origES3,
        so = origSE3,
        smdr = repES3,
        sr = repSE3,
        ## effect sizes, standard errors, p-values, confidence intervals on original scale
        effectType = Effect.size.type,
        ESo = Original.effect.size,
        seESo = Original.standard.error,
        lowerESo = Original.lower.CI,
        upperESo = Original.upper.CI,
        po = origPval,
        ESr = Replication.effect.size,
        seESr = Replication.standard.error,
        lowerESr = Replication.lower.CI,
        upperESr = Replication.upper.CI,
        pr = repPval,
        ## original and replication sample size
        ## (not clear whether group or full sample size)
        no = origN,
        nr = repN
    )

## should give 20 original null effects and 10 "successful" null replications
## (see null results in Table 1)
datClean %>%
    summarise(nulls = sum(resulto == "Null"),
              successes = sum(resulto == "Null" &
                              resultr %in% c("Null-positive", "Null-negative",
                                             "Null")))

## should give 112 original positive effects and 44 successful replications
## (see positive results in Table 1)
datClean %>%
    summarise(positives = sum(resulto == "Positive"),
              successes = sum(resulto == "Positive" &
                              resultr == "Positive" &
                              sign(smdo) == sign(smdr)))

## save
write.csv(datClean, "rpcb-outcome-level.csv", row.names = FALSE)
## aggregate internal replications with fixed-effect meta-analysis to get
## "effect-level data"
dat3 <- datClean %>%
    mutate(id2 = paste0("(", paper, ", ", experiment, ", ", effect, ")"))
datClean2 <- lapply(unique(dat3$id2), FUN = function(id2) {
    ## pool the internal replications with fixed-effect meta-analysis
    datr <- dat3[dat3$id2 == id2,]
    nreplications <- nrow(datr)
    varsmdrMA <- 1/sum(1/datr$sr^2)
    smdrMA <- sum(datr$smdr/datr$sr^2)*varsmdrMA
    ## add the sample sizes of the internal replications
    nrTotal <- sum(datr$nr)
    if (nreplications > 1) {
        ## check whether still a "null" replication based on Wald p-value
        prMA <- 2*pnorm(q = abs(smdrMA)/sqrt(varsmdrMA), lower.tail = FALSE)
        if (prMA > 0.05) {
            resultMA <- "Null"
            prNew <- prMA
        } else {
            resultMA <- "Positive"
            prNew <- prMA
        }
    } else {
        resultMA <- datr$resultr
        prNew <- datr$pr
    }
    out <- datr[1,] %>%
        mutate(smdr = smdrMA,
               sr = sqrt(varsmdrMA),
               nr = nrTotal,
               id = id2,
               resultr = resultMA,
               pr = prNew) %>%
        select(-internalReplication, -id2)
    return(out)
}) %>%
    bind_rows()

## should give 15 original null effects and 11 "successful" null replications
## (see null results in Table 1)
datClean2 %>%
    summarise(nulls = sum(resulto == "Null"),
              successes = sum(resulto == "Null" &
                              resultr %in% c("Null-positive", "Null-negative",
                                             "Null")))

## should give 97 original positive effects and 42 successful replications
## (see positive results in Table 1)
datClean2 %>%
    summarise(positives = sum(resulto == "Positive"),
              successes = sum(resulto == "Positive" &
                              resultr == "Positive" &
                              sign(smdo) == sign(smdr)))

## replicate Figure 1
## https://iiif.elifesciences.org/lax/71601%2Felife-71601-fig2-v3.tif/full/1500,/0/default.jpg
datClean2 %>%
    mutate(rsig = factor(pr <= 0.05, levels = c(FALSE, TRUE),
                         labels = c("italic(p[r]) > 0.05",
                                    "italic(p[r]) <= 0.05"))) %>%
    ggplot(aes(x = smdo, y = smdr)) +
    geom_abline(intercept = 0, slope = 1, alpha = 0.3) +
    geom_hline(yintercept = 0, lty = 2, alpha = 0.9, linewidth = 0.4) +
    geom_vline(xintercept = 0, lty = 2, alpha = 0.9, linewidth = 0.4) +
    geom_point(aes(fill = rsig), shape = 21, size = 2) +
    geom_rug(aes(color = rsig), size = 0.3, show.legend = FALSE) +
    labs(x = "Original effect estimate (SMD)",
         y = "Replication effect estimate (SMD)",
         fill = "") +
    coord_fixed(xlim = c(-5, 28), ylim = c(-5, 28)) +
    scale_x_continuous(breaks = seq(-5, 25, 5)) +
    scale_y_continuous(breaks = seq(-5, 25, 5)) +
    scale_fill_discrete(labels = scales::parse_format()) +
    theme_bw() +
    theme(panel.grid = element_blank(), legend.position = c(0.5, 0.835))

## save
write.csv(datClean2, "rpcb-effect-level.csv", row.names = FALSE)