Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simulation-based testing #14

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
35 changes: 35 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,38 @@
#' hist(prior_predictive$chain_size)
#' }
"prior_predictive"


#' High-level performance results in simulations
#'
#' Analyses of a grid (over R, k, and number of chains) of datasets.
#' Point (posterior median, maximum likelihood) and interval (95% credible interval,
#' 95% confidence interval) estimates from both Bayesian and maximum likelihood analyses.
#'
#' @docType data
#' @usage
#' data(sim_based_testing)
#' @format data.frame with true values and estimates.
#' \describe{
#' \item{index}{Simulation index for replicate simulations with shared (R, k, n_chains).}
#' \item{n_chains}{The number of chains in the simulated dataset.}
#' \item{r_true}{The true simulating value of R.}
#' \item{k_true}{The true simulating value of k.}
#' \item{r_point}{The posterior median or maximum likelihood value of R.}
#' \item{k_point}{The posterior median or maximum likelihood value of k.}
#' \item{r_low}{The lower bound of a 95% credible of confidence interval for R.}
#' \item{r_high}{The upper bound of a 95% credible of confidence interval for R.}
#' \item{k_low}{The lower bound of a 95% credible of confidence interval for k.}
#' \item{k_high}{The upper bound of a 95% credible of confidence interval for k.}
#' \item{estimator}{Type of analysis, bayes (Bayesian) or maxlik (maximum likelihood).}
#' }
#' @keywords datasets
#' @source Generated in R using nbbp.
#'
#' @examples
#' \dontrun{
#' data(sbc_quants)
#' plot(sim_based_testing$r_true, sim_based_testing$r_point)
#' abline(a = 0, b = 1)
#' }
"sim_based_testing"
10 changes: 10 additions & 0 deletions data-raw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generation of package data

## `sim-study`
⚠️ Multi-step procedure; requires `make` and `timeout`.

Simulates 100 datasets each on a predefined grid of R, k, and chain sizes.
Analyzes each with `nbbp::fit_nbbp_homogenous_bayes` and `fit_nbbp_homogenous_ml`, stores point and interval estimates into package data `sim_based_testing`.

To run the `make`-based workflow, from inside `cd data-raw/sim-study` run `make all` to perform all simulations and fits.
After running `make`, from the top level of the repo, `data-raw/sim-study/src/gather_results/R` can be run to produce the package data.
1 change: 1 addition & 0 deletions data-raw/sim-study/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.txt
54 changes: 54 additions & 0 deletions data-raw/sim-study/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
rvec := 0.1 0.25 0.5 0.75 0.95 1.0 1.05 1.25 2.0
kvec := 0.1111111 0.3333333 1.0000000 3.0000000
nvec := 10 20 40
ivec := 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99

.SILENT: k.txt r.txt n.txt i.txt clean
.PHONY: simulate bayes maxlik check_timeout

all: bayes maxlik

maxlik: i.txt n.txt r.txt k.txt simulate
src/maxlik.sh

bayes: i.txt n.txt r.txt k.txt simulate
src/bayes.sh

# timeout is a downstream dependency but there's no point simulating if we can't analyze
simulate: i.txt n.txt r.txt k.txt check_timeout
src/simulate.sh

i.txt:
printf "%s\n" $(ivec) > par/i.txt

n.txt:
printf "%s\n" $(nvec) > par/n.txt

r.txt:
printf "%s\n" $(rvec) > par/r.txt

k.txt:
printf "%s\n" $(kvec) > par/k.txt

# Makes sure timeout command exists
check_timeout:
@which timeout > which_timeout.txt

clean:
rm which_timeout.txt
rm -f i.txt
rm -f n.txt
rm -f r.txt
rm -f k.txt
rm -rf par
rm -rf data
rm -rf bayes
rm -rf maxlik
mkdir par
mkdir data
mkdir bayes
mkdir maxlik
cat <<EOF > par/.placeholder
cat <<EOF > data/.placeholder
cat <<EOF > bayes/.placeholder
cat <<EOF > maxlik/.placeholder
Empty file.
Empty file.
Empty file.
Empty file.
20 changes: 20 additions & 0 deletions data-raw/sim-study/src/bayes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

IFS1=$'\n' read -d '' -r -a ivec < par/i.txt
IFS2=$'\n' read -d '' -r -a nvec < par/n.txt
IFS3=$'\n' read -d '' -r -a rvec < par/r.txt
IFS4=$'\n' read -d '' -r -a kvec < par/k.txt

seed=0
for r in "${rvec[@]}"; do
for k in "${kvec[@]}"; do
offset=0
for i in "${ivec[@]}"; do
for n in "${nvec[@]}"; do
timeout 10s Rscript --vanilla -e "source(\"src/run_analysis.R\"); fit_bayes(${offset}, ${n}, \"data/r${r}_k${k}.txt\", \"bayes/i${i}_n${n}_r${r}_k${k}.txt\", ${seed})"
seed=$((seed+1))
offset=$((offset+n))
done
done
done
done
40 changes: 40 additions & 0 deletions data-raw/sim-study/src/gather_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
parse_par <- function(fp) {
true_par <- gsub(".txt", "", basename(fp), fixed = TRUE)
true_par <- strsplit(true_par, "_", fixed = TRUE)[[1]]
true_par <- gsub("[[:alpha:]]", "", true_par)
return(data.frame(
index = as.integer(true_par[1]),
n_chains = as.integer(true_par[2]),
r_true = as.numeric(true_par[3]),
k_true = as.numeric(true_par[4])
))
}

bayes <- lapply(
list.files("data-raw/sim-study/bayes", full.names = TRUE),
function(fp) {
true_par <- parse_par(fp)
est <- read.table(fp, header = FALSE, stringsAsFactors = FALSE, row.names = 1) |> t()
res <- cbind(true_par, est, data.frame(estimator = "bayes"))
row.names(res) <- NULL
return(res)
}
)

ml <- lapply(
list.files("data-raw/sim-study/maxlik", full.names = TRUE),
function(fp) {
true_par <- parse_par(fp)
est <- read.table(fp, header = FALSE, stringsAsFactors = FALSE, row.names = 1) |> t()
res <- cbind(true_par, est, data.frame(estimator = "maxlik"))
row.names(res) <- NULL
return(res)
}
)

sim_based_testing <- rbind(do.call(rbind, bayes), do.call(rbind, ml))

sim_unique <- sim_based_testing |> dplyr::distinct()
stopifnot("Simulation duplicates found" = dim(sim_unique) == dim(sim_based_testing))

usethis::use_data(sim_based_testing, overwrite = TRUE)
20 changes: 20 additions & 0 deletions data-raw/sim-study/src/maxlik.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

IFS1=$'\n' read -d '' -r -a ivec < par/i.txt
IFS2=$'\n' read -d '' -r -a nvec < par/n.txt
IFS3=$'\n' read -d '' -r -a rvec < par/r.txt
IFS4=$'\n' read -d '' -r -a kvec < par/k.txt

seed=0
for r in "${rvec[@]}"; do
for k in "${kvec[@]}"; do
offset=0
for i in "${ivec[@]}"; do
for n in "${nvec[@]}"; do
timeout 30s Rscript --vanilla -e "source(\"src/run_analysis.R\"); fit_ml(${offset}, ${n}, \"data/r${r}_k${k}.txt\", \"maxlik/i${i}_n${n}_r${r}_k${k}.txt\", ${seed})"
seed=$((seed+1))
offset=$((offset+n))
done
done
done
done
84 changes: 84 additions & 0 deletions data-raw/sim-study/src/run_analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
parse_chains <- function(datapath, offset, nchains) {
all_chains <- scan(datapath, what = numeric())
stopifnot(offset >= 0)
stopifnot(offset + nchains <= length(all_chains))
return(all_chains[offset + (1:nchains)])
}

fit_bayes <- function(
offset, nchains,
datapath, outpath,
seed, iter = 5000, alpha = 0.05, ess_thresh = 1000, rhat_thresh = 1.005) {
q_low <- alpha / 2
q_high <- 1 - q_low

chains <- parse_chains(datapath, offset, nchains)
stopifnot("Got unexpected number of chains" = (length(chains) == nchains))
stopifnot("Got bad chains" = (all(is.numeric(chains)) && all(chains > 0)))

bayes_ests <- c(
r_point = NA, k_point = NA,
r_low = NA, r_high = NA,
k_low = NA, k_high = NA
)

bayes <- nbbp::fit_nbbp_homogenous_bayes(
all_outbreaks = chains,
iter = iter,
seed = seed
)

par <- rstan::extract(bayes, c("r_eff", "dispersion", "inv_sqrt_dispersion"), permuted = FALSE)
min_ess <- min(sapply(1:3, function(i) {
rstan::ess_bulk(par[, , i])
}))
max_rhat <- max(sapply(1:3, function(i) {
rstan::Rhat(par[, , i])
}))

if (min_ess > ess_thresh && max_rhat < rhat_thresh) {
par <- rstan::extract(bayes, c("r_eff", "dispersion", "inv_sqrt_dispersion"), permuted = TRUE)
bayes_ests <- c(
r_point = unname(median(par$r_eff)),
k_point = unname(median(par$dispersion)),
r_low = unname(quantile(par$r_eff, q_low)),
r_high = unname(quantile(par$r_eff, q_high)),
k_low = unname(quantile(par$dispersion, q_low)),
k_high = unname(quantile(par$dispersion, q_high))
)
}

write.table(as.data.frame(bayes_ests), file = outpath, col.names = FALSE, quote = FALSE)
}

fit_ml <- function(offset, nchains, datapath, outpath, seed, nboot = 1000, alpha = 0.05) {
q_low <- alpha / 2
q_high <- 1 - q_low

chains <- parse_chains(datapath, offset, nchains)
stopifnot("Got bad chains" = (all(is.numeric(chains)) && all(chains > 0)))
stopifnot("Got unexpected number of chains" = (length(chains) == nchains))

maxlik_ests <- c(
r_point = NA, k_point = NA,
r_low = NA, r_high = NA,
k_low = NA, k_high = NA
)
maxlik <- nbbp::fit_nbbp_homogenous_ml(
all_outbreaks = chains,
nboot = nboot,
seed = seed
)
if (maxlik$return_code == 0) {
maxlik_ests <- c(
r_point = unname(maxlik$par["r_eff"]),
k_point = unname(maxlik$par["dispersion"]),
r_low = unname(maxlik$ci["r_eff", 1]),
r_high = unname(maxlik$ci["r_eff", 2]),
k_low = unname(maxlik$ci["dispersion", 1]),
k_high = unname(maxlik$ci["dispersion", 2])
)
}

write.table(as.data.frame(maxlik_ests), file = outpath, col.names = FALSE, quote = FALSE)
}
21 changes: 21 additions & 0 deletions data-raw/sim-study/src/simulate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

IFS1=$'\n' read -d '' -r -a ivec < par/i.txt
IFS2=$'\n' read -d '' -r -a nvec < par/n.txt
IFS3=$'\n' read -d '' -r -a rvec < par/r.txt
IFS4=$'\n' read -d '' -r -a kvec < par/k.txt

nchains=0
for i in "${ivec[@]}"; do
for n in "${nvec[@]}"; do
nchains=$((nchains+n))
done
done

seed=0
for r in "${rvec[@]}"; do
for k in "${kvec[@]}"; do
Rscript --vanilla -e "set.seed(${seed}); nbbp::rnbbp(${nchains}, $r, $k) |> cat(file=\"data/r${r}_k${k}.txt\")"
seed=$((seed+1))
done
done
Binary file added data/sim_based_testing.rda
Binary file not shown.
41 changes: 41 additions & 0 deletions man/sim_based_testing.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading