Full R-Code for:
Maier, M. J. (2014). DirichletReg: Dirichlet Regression for Compositional Data in R. Research Report Series/Department of Statistics and Mathematics, 125. WU Vienna University of Economics and Business, Vienna. https://epub.wu.ac.at/4077/
library("DirichletReg")
head(ArcticLake)
## sand silt clay depth
## 1 0.775 0.195 0.030 10.4
## 2 0.719 0.249 0.032 11.7
## 3 0.507 0.361 0.132 12.8
## 4 0.522 0.409 0.066 13.0
## 5 0.700 0.265 0.035 15.7
## 6 0.665 0.322 0.013 16.3
DR_data(ArcticLake[, 1:3])
AL <-## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced
1:6, ]
AL[## sand silt clay
## 1 0.7750000 0.1950000 0.0300000
## 2 0.7190000 0.2490000 0.0320000
## 3 0.5070000 0.3610000 0.1320000
## 4 0.5235707 0.4102307 0.0661986
## 5 0.7000000 0.2650000 0.0350000
## 6 0.6650000 0.3220000 0.0130000
Code for Fig. 1 (left):
plot(AL, cex = 0.5, a2d = list(colored = FALSE, c.grid = FALSE))
Code for Fig. 1 (right):
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875",
"#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1)
DirichReg(AL ~ depth, ArcticLake)
lake1 <-
lake1## Call:
## DirichReg(formula = AL ~ depth, data = ArcticLake)
## using the common parametrization
##
## Log-likelihood: 101.4 on 6 df (104 BFGS + 1 NR Iterations)
##
## -----------------------------------------
## Coefficients for variable no. 1: sand
## (Intercept) depth
## 0.11662 0.02335
## -----------------------------------------
## Coefficients for variable no. 2: silt
## (Intercept) depth
## -0.31060 0.05557
## -----------------------------------------
## Coefficients for variable no. 3: clay
## (Intercept) depth
## -1.1520 0.0643
## -----------------------------------------
coef(lake1)
## $sand
## (Intercept) depth
## 0.11662480 0.02335114
##
## $silt
## (Intercept) depth
## -0.31059591 0.05556745
##
## $clay
## (Intercept) depth
## -1.15195642 0.06430175
update(lake1, . ~ . + I(depth^2) | . + I(depth^2) | . + I(depth^2))
lake2 <-anova(lake1, lake2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = AL ~ depth, data = ArcticLake)
## Model 2: DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) |
## depth + I(depth^2), data = ArcticLake)
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -202.74 6
## Model 2 -217.99 9 15.254 3 0.001612 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lake2)
## Call:
## DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) | depth +
## I(depth^2), data = ArcticLake)
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## sand -1.7647 -0.7080 -0.1786 0.9598 3.0460
## silt -1.1379 -0.5330 -0.1546 0.2788 1.5604
## clay -1.7661 -0.6583 -0.0454 0.6584 2.0152
##
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 1: sand
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4361967 0.8026814 1.789 0.0736 .
## depth -0.0072382 0.0329433 -0.220 0.8261
## I(depth^2) 0.0001324 0.0002761 0.480 0.6315
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 2: silt
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0259705 0.7598827 -0.034 0.9727
## depth 0.0717450 0.0343089 2.091 0.0365 *
## I(depth^2) -0.0002679 0.0003088 -0.867 0.3857
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 3: clay
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7931487 0.7362293 -2.436 0.01487 *
## depth 0.1107906 0.0357705 3.097 0.00195 **
## I(depth^2) -0.0004872 0.0003308 -1.473 0.14079
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 109 on 9 df (162 BFGS + 2 NR Iterations)
## AIC: -200, BIC: -185
## Number of Observations: 39
## Link: Log
## Parametrization: common
Code for Fig. 2:
par(mar = c(4, 4, 4, 4) + 0.1)
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875",
"#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1,
main = "Sediment Composition in an Arctic Lake")
data.frame(depth = seq(min(ArcticLake$depth), max(ArcticLake$depth), length.out = 100))
Xnew <-for (i in 1:3) lines(cbind(Xnew, predict(lake2, Xnew)[, i]), col = c("#E495A5", "#86B875",
"#7DB0DD")[i], lwd = 2)
legend("topleft", legend = c("Sand", "Silt", "Clay"), lwd = 2, col = c("#E495A5",
"#86B875", "#7DB0DD"), pt.bg = c("#E495A5", "#86B875", "#7DB0DD"), pch = 21,
bty = "n")
par(new = TRUE)
plot(cbind(Xnew, predict(lake2, Xnew, F, F, T)), lty = "24", type = "l", ylim = c(0,
max(predict(lake2, Xnew, F, F, T))), axes = F, ann = F, lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("top", legend = c(expression(hat(mu[c] == hat(alpha)[c]/hat(alpha)[0])), expression(hat(phi) ==
hat(alpha)[0])), lty = c(1, 2), lwd = c(3, 2), bty = "n")
ArcticLake
AL <-$AL <- DR_data(ArcticLake[, 1:3])
AL## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced
range(ArcticLake$depth)
dd <- data.frame(depth = seq(dd[1], dd[2], length.out = 200))
X <-
predict(DirichReg(AL ~ depth + I(depth^2), AL), X) pp <-
Code for Fig. 3:
plot(AL$AL, cex = 0.1, reset_par = FALSE)
points(toSimplex(AL$AL), pch = 16, cex = 0.5, col = gray(0.5))
lines(toSimplex(pp), lwd = 3, col = c("#6E1D34", "#004E42")[2])
log(cbind(ArcticLake[, 2]/ArcticLake[, 1], ArcticLake[, 3]/ArcticLake[, 1]))
Dols <-
lm(Dols ~ depth + I(depth^2), ArcticLake)
ols <- predict(ols, X)
p2 <- exp(cbind(0, p2[, 1], p2[, 2]))/rowSums(exp(cbind(0, p2[, 1], p2[, 2])))
p2m <-
lines(toSimplex(p2m), lwd = 3, col = c("#6E1D34", "#004E42")[1], lty = "21")
BloodSamples
Bld <-$Smp <- DR_data(Bld[, 1:4])
Bld## Warning in DR_data(Bld[, 1:4]): not all rows sum up to 1 => normalization forced
DirichReg(Smp ~ Disease | 1, Bld, model = "alternative", base = 3)
blood1 <- DirichReg(Smp ~ Disease | Disease, Bld, model = "alternative", base = 3)
blood2 <-anova(blood1, blood2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = Smp ~ Disease | 1, data = Bld, model =
## "alternative", base = 3)
## Model 2: DirichReg(formula = Smp ~ Disease | Disease, data = Bld, model =
## "alternative", base = 3)
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -303.86 7
## Model 2 -304.61 8 0.7587 1 0.3837
summary(blood1)
## Call:
## DirichReg(formula = Smp ~ Disease | 1, data = Bld, model = "alternative", base
## = 3)
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## Albumin -2.1310 -0.9307 -0.1234 0.8149 2.8429
## Pre.Albumin -1.0687 -0.4054 -0.0789 0.1947 1.5691
## Globulin.A -2.0503 -1.0392 0.1938 0.7927 2.2393
## Globulin.B -1.8176 -0.5347 0.1488 0.5115 1.3284
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: Albumin
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.11639 0.09935 11.237 <2e-16 ***
## DiseaseB -0.07002 0.13604 -0.515 0.607
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Pre.Albumin
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5490 0.1082 5.076 3.86e-07 ***
## DiseaseB -0.1276 0.1493 -0.855 0.393
## ------------------------------------------------------------------
## Coefficients for variable no. 3: Globulin.A
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Globulin.B
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4863 0.1094 4.445 8.8e-06 ***
## DiseaseB 0.1819 0.1472 1.236 0.216
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.2227 0.1475 28.64 <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 151.9 on 7 df (44 BFGS + 1 NR Iterations)
## AIC: -289.9, BIC: -280
## Number of Observations: 30
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
Code for Fig.~:
par(mfrow = c(1, 4), mar = c(4, 4, 4, 2) + 0.25)
for (i in 1:4) {
boxplot(Bld$Smp[, i] ~ Bld$Disease, ylim = range(Bld$Smp[, 1:4]), main = paste(names(Bld)[i]),
xlab = "Disease Type", ylab = "Proportion")
segments(c(-5, 1.5), unique(fitted(blood2)[, i]), c(1.5, 5), unique(fitted(blood2)[,
lwd = 2, lty = 2)
i]), }
predict(blood2, data.frame(Disease = factor(c("A", "B"))), F, T, F)
alpha <- sapply(1:2, function(i) ddirichlet(DR_data(Bld[31:36, 1:4]), unlist(alpha[i,
L <-
])))## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced
## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced
L/rowSums(L)
LP <-dimnames(LP) <- list(paste("C", 1:6), c("A", "B"))
print(data.frame(round(LP * 100, 1), pred. = as.factor(ifelse(LP[, 1] > LP[, 2],
"==> A", "==> B"))), print.gap = 2)
## A B pred.
## C 1 59.4 40.6 ==> A
## C 2 43.2 56.8 ==> B
## C 3 38.4 61.6 ==> B
## C 4 43.8 56.2 ==> B
## C 5 36.6 63.4 ==> B
## C 6 70.2 29.8 ==> A
Code for Fig.~:
DR_data(BloodSamples[, c(1, 2, 4)])
B2 <-## Warning in DR_data(BloodSamples[, c(1, 2, 4)]): not all rows sum up to 1 =>
## normalization forced
plot(B2, cex = 0.001, reset_par = FALSE)
colorRampPalette(c("#023FA5", "#c0c0c0", "#8E063B"))(100)
div.col <-
# expected values
(alpha/rowSums(alpha))[, c(1, 2, 4)]
temp <-points(toSimplex(temp/rowSums(temp)), pch = 22, bg = div.col[c(1, 100)], cex = 2,
lwd = 0.25)
# known values
B2[1:30, ]
temp <-points(toSimplex(temp/rowSums(temp)), pch = 21, bg = (div.col[c(1, 100)])[BloodSamples$Disease[1:30]],
cex = 0.5, lwd = 0.25)
# unclassified
B2[31:36, ]
temp <-points(toSimplex(temp/rowSums(temp)), pch = 21, bg = div.col[round(100 * LP[, 2],
0)], cex = 1, lwd = 0.5)
legend("topright", bty = "n", legend = c("Disease A", "Disease B", NA, "Expected Values"),
pch = c(21, 21, NA, 22), pt.bg = c(div.col[c(1, 100)], NA, "white"))
ReadingSkills
RS <-$acc <- DR_data(RS$accuracy)
RS## only one variable in [0, 1] supplied - beta-distribution assumed.
## check this assumption.
$dyslexia <- C(RS$dyslexia, treatment)
RS DirichReg(acc ~ dyslexia * iq | dyslexia * iq, RS, model = "alternative")
rs1 <- DirichReg(acc ~ dyslexia * iq | dyslexia + iq, RS, model = "alternative")
rs2 <-anova(rs1, rs2)
## Analysis of Deviance Table
##
## Model 1: DirichReg(formula = acc ~ dyslexia * iq | dyslexia * iq, data = RS,
## model = "alternative")
## Model 2: DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS,
## model = "alternative")
##
## Deviance N. par Difference df Pr(>Chi)
## Model 1 -133.47 8
## Model 2 -131.80 7 1.6645 1 0.197
Code for Fig.~:
as.numeric(RS$dyslexia)
g.ind <- g.ind == 1 # normal
g1 <- g.ind != 1 # dyslexia
g2 <-par(mar = c(4, 4, 4, 4) + 0.25)
plot(accuracy ~ iq, RS, pch = 21, bg = c("#E495A5", "#39BEB1")[3 - g.ind], cex = 1.5,
main = "Dyslexic (Red) vs. Control (Green) Group", xlab = "IQ Score", ylab = "Reading Accuracy",
xlim = range(ReadingSkills$iq))
seq(min(RS$iq[g1]), max(RS$iq[g1]), length.out = 200)
x1 <- seq(min(RS$iq[g2]), max(RS$iq[g2]), length.out = 200)
x2 <- length(x1)
n <- data.frame(dyslexia = factor(rep(0:1, each = n), levels = 0:1, labels = c("no",
X <-"yes")), iq = c(x1, x2))
predict(rs2, X, TRUE, TRUE, TRUE)
pv <-
lines(x1, pv$mu[1:n, 2], col = c("#E495A5", "#39BEB1")[2], lwd = 3)
lines(x2, pv$mu[(n + 1):(2 * n), 2], col = c("#E495A5", "#39BEB1")[1], lwd = 3)
RS$accuracy
a <- log(a/(1 - a))
logRa_a <- lm(logRa_a ~ dyslexia * iq, RS)
rlr <- 1/(1 + exp(-predict(rlr, X)))
ols <-lines(x1, ols[1:n], col = c("#AD6071", "#00897D")[2], lwd = 3, lty = 2)
lines(x2, ols[(n + 1):(2 * n)], col = c("#AD6071", "#00897D")[1], lwd = 3, lty = 2)
### precision plot
par(new = TRUE)
plot(x1, pv$phi[1:n], col = c("#6E1D34", "#004E42")[2], lty = "11", type = "l", ylim = c(0,
max(pv$phi)), axes = F, ann = F, lwd = 2, xlim = range(RS$iq))
lines(x2, pv$phi[(n + 1):(2 * n)], col = c("#6E1D34", "#004E42")[1], lty = "11",
type = "l", lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("topleft", legend = c(expression(hat(mu)), expression(hat(phi)), "OLS"), lty = c(1,
3, 2), lwd = c(3, 2, 3), bty = "n")
RS$accuracy
a <- log(a/(1 - a))
logRa_a <- lm(logRa_a ~ dyslexia * iq, RS)
rlr <-summary(rlr)
##
## Call:
## lm(formula = logRa_a ~ dyslexia * iq, data = RS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.66405 -0.37966 0.03687 0.40887 2.50345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8067 0.2822 9.944 2.27e-12 ***
## dyslexiayes -2.4113 0.4517 -5.338 4.01e-06 ***
## iq 0.7823 0.2992 2.615 0.0125 *
## dyslexiayes:iq -0.8457 0.4510 -1.875 0.0681 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 40 degrees of freedom
## Multiple R-squared: 0.6151, Adjusted R-squared: 0.5862
## F-statistic: 21.31 on 3 and 40 DF, p-value: 2.083e-08
summary(rs2)
## Call:
## DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS, model =
## "alternative")
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## 1 - accuracy -1.5661 -0.8204 -0.5112 0.5211 3.4334
## accuracy -3.4334 -0.5211 0.5112 0.8204 1.5661
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: 1 - accuracy
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: accuracy
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8649 0.2991 6.235 4.52e-10 ***
## dyslexiayes -1.4833 0.3029 -4.897 9.74e-07 ***
## iq 1.0676 0.3359 3.178 0.001482 **
## dyslexiayes:iq -1.1625 0.3452 -3.368 0.000757 ***
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5579 0.3336 4.670 3.01e-06 ***
## dyslexiayes 3.4931 0.5880 5.941 2.83e-09 ***
## iq 1.2291 0.4596 2.674 0.00749 **
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 65.9 on 7 df (56 BFGS + 2 NR Iterations)
## AIC: -117.8, BIC: -105.3
## Number of Observations: 44
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
confint(rs2)
##
## 95% Confidence Intervals (original form)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
## variable omitted
##
## Variable: accuracy
## 2.5% Est. 97.5%
## (Intercept) 1.279 1.86 2.451
## dyslexiayes -2.077 -1.48 -0.890
## iq 0.409 1.07 1.726
## dyslexiayes:iq -1.839 -1.16 -0.486
##
## - Gamma-Parameters
## 2.5% Est. 97.5%
## (Intercept) 0.904 1.56 2.21
## dyslexiayes 2.341 3.49 4.65
## iq 0.328 1.23 2.13
confint(rs2, exp = TRUE)
##
## 95% Confidence Intervals (exponentiated)
##
## - Beta-Parameters:
## Variable: 1 - accuracy
## variable omitted
##
## Variable: accuracy
## 2.5% exp(Est.) 97.5%
## (Intercept) 3.592 6.455 11.601
## dyslexiayes 0.125 0.227 0.411
## iq 1.506 2.908 5.618
## dyslexiayes:iq 0.159 0.313 0.615
##
## - Gamma-Parameters
## 2.5% exp(Est.) 97.5%
## (Intercept) 2.47 4.75 9.13
## dyslexiayes 10.39 32.89 104.12
## iq 1.39 3.42 8.41
Code for Fig.~:
c("#E495A5", "#39BEB1")[3 - as.numeric(RS$dyslexia)]
gcol <- c(-3, 3)
tmt <-
par(mfrow = c(3, 2), cex = 0.8)
qqnorm(residuals(rlr, "pearson"), ylim = tmt, xlim = tmt, pch = 21, bg = gcol, main = "Normal Q-Q-Plot: OLS Residuals",
cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rlr, "pearson"), lty = 2)
qqnorm(residuals(rs2, "standardized")[, 2], ylim = tmt, xlim = tmt, pch = 21, bg = gcol,
main = "Normal Q-Q-Plot: DirichReg Residuals", cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rs2, "standardized")[, 2], lty = 2)
plot(ReadingSkills$iq, residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3,
3), main = "OLS Residuals", xlab = "IQ", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(ReadingSkills$iq, residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "IQ", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rlr), residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3, 3),
main = "OLS Residuals", xlab = "Fitted", ylab = "Pearson Residuals", cex = 0.75,
lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rs2)[, 2], residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol,
ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "Fitted", ylab = "Standardized Residuals",
cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)