For a simple illustration, we generate a synthetic allele count
matrix with minor allele frequency maf greater than 5%. The
function simulate_traits() requires the input genotype data
to be stored in the PLINK format (.bim, .bed,
and .fam files). Here we use the package genio
to write the allele count matrix to a PLINK file.
plink_file <- tempfile()
n_samples <- 3000
n_snps <- 5000
maf <- 0.05 + 0.45 * runif(n_snps)
random_minor_allele_counts <- (runif(n_samples * n_snps) < maf) +
(runif(n_samples * n_snps) < maf)
allele_count_matrix <- matrix(random_minor_allele_counts,
nrow = n_samples,
ncol = n_snps,
byrow = TRUE,
)
# Create .fam and .bim data frames
fam <- data.frame(
fam = sprintf("F%d", 1:n_samples), # Family ID
id = sprintf("I%d", 1:n_samples), # Individual ID
pat = 0, # Paternal ID
mat = 0, # Maternal ID
sex = sample(1:2, n_samples, replace = TRUE),
pheno = -9
)
bim <- data.frame(
chr = 1, # Chromosome number
id = sprintf("rs%d", 1:n_snps), # SNP name
posg = 0, # Genetic distance (cM)
pos = 1:n_snps, # Base-pair position
ref = sample(c("A", "C", "G", "T"), n_snps, replace = T), # Minor allele
alt = sample(c("A", "C", "G", "T"), n_snps, replace = T) # Major allele
)
# Write to .bed, .fam, and .bim files
write_plink(
file = plink_file,
X = t(allele_count_matrix),
fam = fam,
bim = bim
)To simulate traits, we need to specify the genetic architecture. E.g., the function requires us to specify
The simulated trait is written to file in the PLINK phenotype format.
pheno_file <- tempfile()
additive_heritability <- 0.3
gxg_heritability <- 0.25
n_snps_gxg_group <- 5
n_snps_additive <- 100
additive_snps <- sort(sample(1:n_snps, n_snps_additive, replace = F))
gxg_group_1 <- sort(sample(additive_snps, n_snps_gxg_group, replace = F))
gxg_group_2 <- sort(sample(setdiff(additive_snps, gxg_group_1),
n_snps_gxg_group, replace = F))
simulate_traits(
plink_file,
pheno_file,
additive_heritability,
gxg_heritability,
additive_snps,
gxg_group_1,
gxg_group_2
)The variance components of the traits are constructed to match the input. See Crawford et al. (2017) and mvMAPIT Documentation: Simulate Traits for a full description of the simulation scheme.
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.2
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
#>
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: Europe/Berlin
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] genio_1.1.2 knitr_1.49 tidyr_1.3.1
#> [4] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1 IRanges_2.38.1
#> [7] S4Vectors_0.42.1 BiocGenerics_0.50.0 ggplot2_3.5.1
#> [10] dplyr_1.1.4 smer_0.0.1
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.9 generics_0.1.3 digest_0.6.37
#> [4] magrittr_2.0.3 evaluate_1.0.3 grid_4.4.2
#> [7] iterators_1.0.14 CompQuadForm_1.4.3 mvtnorm_1.3-3
#> [10] fastmap_1.2.0 foreach_1.5.2 jsonlite_1.8.9
#> [13] backports_1.5.0 httr_1.4.7 purrr_1.0.2
#> [16] UCSC.utils_1.0.0 scales_1.3.0 codetools_0.2-20
#> [19] jquerylib_0.1.4 cli_3.6.3 rlang_1.1.4
#> [22] XVector_0.44.0 munsell_0.5.1 withr_3.0.2
#> [25] cachem_1.1.0 yaml_2.3.10 FMStable_0.1-4
#> [28] tools_4.4.2 parallel_4.4.2 checkmate_2.3.2
#> [31] colorspace_2.1-1 GenomeInfoDbData_1.2.12 vctrs_0.6.5
#> [34] R6_2.5.1 lifecycle_1.0.4 zlibbioc_1.50.0
#> [37] mvMAPIT_2.0.3 pkgconfig_2.0.3 pillar_1.10.1
#> [40] bslib_0.8.0 gtable_0.3.6 glue_1.8.0
#> [43] Rcpp_1.0.14 harmonicmeanp_3.0.1 xfun_0.50
#> [46] tibble_3.2.1 tidyselect_1.2.1 farver_2.1.2
#> [49] htmltools_0.5.8.1 rmarkdown_2.29 labeling_0.4.3
#> [52] compiler_4.4.2