DirichletReg – Article Sources

Marco J. Maier

2021-05-18

Full R-Code for:

Maier, M. J. (2014). DirichletReg: Dirichlet Regression for Compositional Data in R. Research Report Series/Department of Statistics and Mathematics, 125. WU Vienna University of Economics and Business, Vienna. https://epub.wu.ac.at/4077/

4. Application examples

4.1. The Arctic lake (common parametrization)

library("DirichletReg")
head(ArcticLake)
##    sand  silt  clay depth
## 1 0.775 0.195 0.030  10.4
## 2 0.719 0.249 0.032  11.7
## 3 0.507 0.361 0.132  12.8
## 4 0.522 0.409 0.066  13.0
## 5 0.700 0.265 0.035  15.7
## 6 0.665 0.322 0.013  16.3

AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced

AL[1:6, ]
##        sand      silt      clay
## 1 0.7750000 0.1950000 0.0300000
## 2 0.7190000 0.2490000 0.0320000
## 3 0.5070000 0.3610000 0.1320000
## 4 0.5235707 0.4102307 0.0661986
## 5 0.7000000 0.2650000 0.0350000
## 6 0.6650000 0.3220000 0.0130000
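
The normalization warning appears because some rows of the raw data do not sum exactly to 1 (see row 4 above, which DR_data() has rescaled). A quick check of the row sums, not part of the original sources:

summary(rowSums(ArcticLake[, 1:3]))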

Code for Fig. 1 (left):

plot(AL, cex = 0.5, a2d = list(colored = FALSE, c.grid = FALSE))

Figure 1 (left): Arctic lake: ternary plot and depth vs. composition.

Code for Fig. 1 (right):

plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875", 
    "#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1)

Figure 1 (right): Arctic lake: ternary plot and depth vs. composition.

lake1 <- DirichReg(AL ~ depth, ArcticLake)
lake1
## Call:
## DirichReg(formula = AL ~ depth, data = ArcticLake)
## using the common parametrization
## 
## Log-likelihood: 101.4 on 6 df (104 BFGS + 1 NR Iterations)
## 
## -----------------------------------------
## Coefficients for variable no. 1: sand
## (Intercept)        depth  
##     0.11662      0.02335  
## -----------------------------------------
## Coefficients for variable no. 2: silt
## (Intercept)        depth  
##    -0.31060      0.05557  
## -----------------------------------------
## Coefficients for variable no. 3: clay
## (Intercept)        depth  
##     -1.1520       0.0643  
## -----------------------------------------

coef(lake1)
## $sand
## (Intercept)       depth 
##  0.11662480  0.02335114 
## 
## $silt
## (Intercept)       depth 
## -0.31059591  0.05556745 
## 
## $clay
## (Intercept)       depth 
## -1.15195642  0.06430175
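
Under the common parametrization with a log link, each Dirichlet parameter is obtained as alpha_c = exp(X beta_c), and the expected proportions are alpha_c/alpha_0, where alpha_0 is the sum of the alphas. A small illustration, not part of the original sources (the depth value is arbitrary):

b <- coef(lake1)
d <- 24.4  # arbitrary depth in metres
a <- sapply(b, function(cf) exp(cf[1] + cf[2] * d))  # alphas for sand, silt, clay
a / sum(a)  # expected proportions at this depth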

lake2 <- update(lake1, . ~ . + I(depth^2) | . + I(depth^2) | . + I(depth^2))
anova(lake1, lake2)
## Analysis of Deviance Table
## 
## Model 1: DirichReg(formula = AL ~ depth, data = ArcticLake)
## Model 2: DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) |
##   depth + I(depth^2), data = ArcticLake)
## 
##         Deviance N. par Difference df Pr(>Chi)   
## Model 1  -202.74      6                          
## Model 2  -217.99      9     15.254  3 0.001612 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
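
The reported p-value is the upper tail of a chi-squared distribution with 3 degrees of freedom evaluated at the deviance difference, and can be reproduced directly:

pchisq(15.254, df = 3, lower.tail = FALSE)  # ~ 0.0016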

summary(lake2)
## Call:
## DirichReg(formula = AL ~ depth + I(depth^2) | depth + I(depth^2) | depth +
## I(depth^2), data = ArcticLake)
## 
## Standardized Residuals:
##           Min       1Q   Median      3Q     Max
## sand  -1.7647  -0.7080  -0.1786  0.9598  3.0460
## silt  -1.1379  -0.5330  -0.1546  0.2788  1.5604
## clay  -1.7661  -0.6583  -0.0454  0.6584  2.0152
## 
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 1: sand
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  1.4361967  0.8026814   1.789   0.0736 .
## depth       -0.0072382  0.0329433  -0.220   0.8261  
## I(depth^2)   0.0001324  0.0002761   0.480   0.6315  
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 2: silt
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.0259705  0.7598827  -0.034   0.9727  
## depth        0.0717450  0.0343089   2.091   0.0365 *
## I(depth^2)  -0.0002679  0.0003088  -0.867   0.3857  
## ------------------------------------------------------------------
## Beta-Coefficients for variable no. 3: clay
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.7931487  0.7362293  -2.436  0.01487 * 
## depth        0.1107906  0.0357705   3.097  0.00195 **
## I(depth^2)  -0.0004872  0.0003308  -1.473  0.14079   
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 109 on 9 df (162 BFGS + 2 NR Iterations)
## AIC: -200, BIC: -185
## Number of Observations: 39
## Link: Log
## Parametrization: common
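
The reported AIC and BIC follow from the log-likelihood and the number of parameters: AIC = -2 * 109 + 2 * 9 = -200 and BIC = -2 * 109 + log(39) * 9 = -185 (up to rounding). Assuming the usual stats generics dispatch on the package's logLik() method, they can also be extracted with:

AIC(lake2)
BIC(lake2)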

Code for Fig. 2:

par(mar = c(4, 4, 4, 4) + 0.1)
plot(rep(ArcticLake$depth, 3), as.numeric(AL), pch = 21, bg = rep(c("#E495A5", "#86B875", 
    "#7DB0DD"), each = 39), xlab = "Depth (m)", ylab = "Proportion", ylim = 0:1, 
    main = "Sediment Composition in an Arctic Lake")

Xnew <- data.frame(depth = seq(min(ArcticLake$depth), max(ArcticLake$depth), length.out = 100))
for (i in 1:3) lines(cbind(Xnew, predict(lake2, Xnew)[, i]), col = c("#E495A5", "#86B875", 
    "#7DB0DD")[i], lwd = 2)
legend("topleft", legend = c("Sand", "Silt", "Clay"), lwd = 2, col = c("#E495A5", 
    "#86B875", "#7DB0DD"), pt.bg = c("#E495A5", "#86B875", "#7DB0DD"), pch = 21, 
    bty = "n")

par(new = TRUE)  # overlay the precision curve on the same plotting region
# predict(lake2, Xnew, F, F, T) requests the precision only (mu = FALSE, alpha = FALSE, phi = TRUE)
plot(cbind(Xnew, predict(lake2, Xnew, F, F, T)), lty = "24", type = "l", ylim = c(0, 
    max(predict(lake2, Xnew, F, F, T))), axes = F, ann = F, lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("top", legend = c(expression(hat(mu[c] == hat(alpha)[c]/hat(alpha)[0])), expression(hat(phi) == 
    hat(alpha)[0])), lty = c(1, 2), lwd = c(3, 2), bty = "n")

Figure 2: Arctic lake: Fitted values of the quadratic model.

AL <- ArcticLake  # note: AL is reused as a data frame; the DR_data object goes into AL$AL
AL$AL <- DR_data(ArcticLake[, 1:3])
## Warning in DR_data(ArcticLake[, 1:3]): not all rows sum up to 1 => normalization
## forced

dd <- range(ArcticLake$depth)
X <- data.frame(depth = seq(dd[1], dd[2], length.out = 200))

pp <- predict(DirichReg(AL ~ depth + I(depth^2), AL), X)

Code for Fig. 3:

plot(AL$AL, cex = 0.1, reset_par = FALSE)
points(toSimplex(AL$AL), pch = 16, cex = 0.5, col = gray(0.5))

lines(toSimplex(pp), lwd = 3, col = c("#6E1D34", "#004E42")[2])

# additive log-ratio (alr) transform with sand (column 1) as the reference component
Dols <- log(cbind(ArcticLake[, 2]/ArcticLake[, 1], ArcticLake[, 3]/ArcticLake[, 1]))

# OLS fit on the log-ratios; predictions are mapped back onto the simplex (inverse alr)
ols <- lm(Dols ~ depth + I(depth^2), ArcticLake)
p2 <- predict(ols, X)
p2m <- exp(cbind(0, p2[, 1], p2[, 2]))/rowSums(exp(cbind(0, p2[, 1], p2[, 2])))

lines(toSimplex(p2m), lwd = 3, col = c("#6E1D34", "#004E42")[1], lty = "21")

Figure 3: Arctic lake: OLS (dashed) vs. Dirichlet regression (solid) predictions.

4.2. Blood samples (alternative parametrization)

Bld <- BloodSamples
Bld$Smp <- DR_data(Bld[, 1:4])
## Warning in DR_data(Bld[, 1:4]): not all rows sum up to 1 => normalization forced

blood1 <- DirichReg(Smp ~ Disease | 1, Bld, model = "alternative", base = 3)
blood2 <- DirichReg(Smp ~ Disease | Disease, Bld, model = "alternative", base = 3)
anova(blood1, blood2)
## Analysis of Deviance Table
## 
## Model 1: DirichReg(formula = Smp ~ Disease | 1, data = Bld, model =
##   "alternative", base = 3)
## Model 2: DirichReg(formula = Smp ~ Disease | Disease, data = Bld, model =
##   "alternative", base = 3)
## 
##         Deviance N. par Difference df Pr(>Chi)
## Model 1  -303.86      7                       
## Model 2  -304.61      8     0.7587  1   0.3837

summary(blood1)
## Call:
## DirichReg(formula = Smp ~ Disease | 1, data = Bld, model = "alternative", base
## = 3)
## 
## Standardized Residuals:
##                  Min       1Q   Median      3Q     Max
## Albumin      -2.1310  -0.9307  -0.1234  0.8149  2.8429
## Pre.Albumin  -1.0687  -0.4054  -0.0789  0.1947  1.5691
## Globulin.A   -2.0503  -1.0392   0.1938  0.7927  2.2393
## Globulin.B   -1.8176  -0.5347   0.1488  0.5115  1.3284
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: Albumin
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.11639    0.09935  11.237   <2e-16 ***
## DiseaseB    -0.07002    0.13604  -0.515    0.607    
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Pre.Albumin
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5490     0.1082   5.076 3.86e-07 ***
## DiseaseB     -0.1276     0.1493  -0.855    0.393    
## ------------------------------------------------------------------
## Coefficients for variable no. 3: Globulin.A
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Globulin.B
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.4863     0.1094   4.445  8.8e-06 ***
## DiseaseB      0.1819     0.1472   1.236    0.216    
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.2227     0.1475   28.64   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 151.9 on 7 df (44 BFGS + 1 NR Iterations)
## AIC: -289.9, BIC: -280
## Number of Observations: 30
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
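
With the multinomial-logit link used for the means, the fitted expected proportions for disease group A follow from the intercepts alone, the base category (Globulin.A) being fixed at zero. An illustration, not part of the original sources:

eta <- c(Albumin = 1.11639, Pre.Albumin = 0.5490, Globulin.A = 0, Globulin.B = 0.4863)
exp(eta)/sum(exp(eta))  # expected proportions for disease group A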

Code for Fig. 4:

par(mfrow = c(1, 4), mar = c(4, 4, 4, 2) + 0.25)
for (i in 1:4) {
    boxplot(Bld$Smp[, i] ~ Bld$Disease, ylim = range(Bld$Smp[, 1:4]), main = paste(names(Bld)[i]), 
        xlab = "Disease Type", ylab = "Proportion")
    segments(c(-5, 1.5), unique(fitted(blood2)[, i]), c(1.5, 5), unique(fitted(blood2)[, 
        i]), lwd = 2, lty = 2)
}

Figure 4: Blood samples: box plots and fitted values (dashed lines indicate the fitted values for each group).

# alpha parameters (mu = FALSE, alpha = TRUE, phi = FALSE) for both disease groups
alpha <- predict(blood2, data.frame(Disease = factor(c("A", "B"))), F, T, F)
# Dirichlet densities of the six unclassified samples under each group's alphas
L <- sapply(1:2, function(i) ddirichlet(DR_data(Bld[31:36, 1:4]), unlist(alpha[i, 
    ])))
## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced

## Warning in DR_data(Bld[31:36, 1:4]): not all rows sum up to 1 => normalization
## forced
LP <- L/rowSums(L)  # row-normalize the likelihoods to get membership probabilities
dimnames(LP) <- list(paste("C", 1:6), c("A", "B"))
print(data.frame(round(LP * 100, 1), pred. = as.factor(ifelse(LP[, 1] > LP[, 2], 
    "==> A", "==> B"))), print.gap = 2)
##         A     B  pred.
## C 1  59.4  40.6  ==> A
## C 2  43.2  56.8  ==> B
## C 3  38.4  61.6  ==> B
## C 4  43.8  56.2  ==> B
## C 5  36.6  63.4  ==> B
## C 6  70.2  29.8  ==> A
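
Row-normalizing the two likelihoods corresponds to Bayes' rule with equal prior weight for both diseases. As a sketch of how unequal priors would enter (prior is a hypothetical quantity, not part of the original sources):

prior <- c(A = 0.5, B = 0.5)  # equal priors reproduce LP above
Lp <- sweep(L, 2, prior, "*")  # weight each group's likelihood by its prior
LPp <- Lp/rowSums(Lp)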

Code for Fig. 5:

B2 <- DR_data(BloodSamples[, c(1, 2, 4)])
## Warning in DR_data(BloodSamples[, c(1, 2, 4)]): not all rows sum up to 1 =>
## normalization forced
plot(B2, cex = 0.001, reset_par = FALSE)

div.col <- colorRampPalette(c("#023FA5", "#c0c0c0", "#8E063B"))(100)

# expected values
temp <- (alpha/rowSums(alpha))[, c(1, 2, 4)]
points(toSimplex(temp/rowSums(temp)), pch = 22, bg = div.col[c(1, 100)], cex = 2, 
    lwd = 0.25)

# known values
temp <- B2[1:30, ]
points(toSimplex(temp/rowSums(temp)), pch = 21, bg = (div.col[c(1, 100)])[BloodSamples$Disease[1:30]], 
    cex = 0.5, lwd = 0.25)

# unclassified
temp <- B2[31:36, ]
points(toSimplex(temp/rowSums(temp)), pch = 21, bg = div.col[round(100 * LP[, 2], 
    0)], cex = 1, lwd = 0.5)

legend("topright", bty = "n", legend = c("Disease A", "Disease B", NA, "Expected Values"), 
    pch = c(21, 21, NA, 22), pt.bg = c(div.col[c(1, 100)], NA, "white"))

Figure 5: Blood samples: observed values and predictions.

4.3. Reading skills data (alternative parametrization)

RS <- ReadingSkills
RS$acc <- DR_data(RS$accuracy)
## only one variable in [0, 1] supplied - beta-distribution assumed.
## check this assumption.
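
The message above indicates that DR_data() has expanded the single variable into the two-column composition (1 - accuracy, accuracy), so the Dirichlet model reduces to a beta regression here. A quick check, not part of the original sources:

RS$acc[1:6, ]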

RS$dyslexia <- C(RS$dyslexia, treatment)  # set treatment contrasts for the factor
rs1 <- DirichReg(acc ~ dyslexia * iq | dyslexia * iq, RS, model = "alternative")
rs2 <- DirichReg(acc ~ dyslexia * iq | dyslexia + iq, RS, model = "alternative")
anova(rs1, rs2)
## Analysis of Deviance Table
## 
## Model 1: DirichReg(formula = acc ~ dyslexia * iq | dyslexia * iq, data = RS,
##   model = "alternative")
## Model 2: DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS,
##   model = "alternative")
## 
##         Deviance N. par Difference df Pr(>Chi)
## Model 1  -133.47      8                       
## Model 2  -131.80      7     1.6645  1    0.197

Code for Fig. 6:

g.ind <- as.numeric(RS$dyslexia)
g1 <- g.ind == 1  # normal
g2 <- g.ind != 1  # dyslexia
par(mar = c(4, 4, 4, 4) + 0.25)
plot(accuracy ~ iq, RS, pch = 21, bg = c("#E495A5", "#39BEB1")[3 - g.ind], cex = 1.5, 
    main = "Dyslexic (Red) vs. Control (Green) Group", xlab = "IQ Score", ylab = "Reading Accuracy", 
    xlim = range(ReadingSkills$iq))

x1 <- seq(min(RS$iq[g1]), max(RS$iq[g1]), length.out = 200)
x2 <- seq(min(RS$iq[g2]), max(RS$iq[g2]), length.out = 200)
n <- length(x1)
X <- data.frame(dyslexia = factor(rep(0:1, each = n), levels = 0:1, labels = c("no", 
    "yes")), iq = c(x1, x2))
pv <- predict(rs2, X, TRUE, TRUE, TRUE)  # returns a list with elements $mu, $alpha, and $phi

lines(x1, pv$mu[1:n, 2], col = c("#E495A5", "#39BEB1")[2], lwd = 3)
lines(x2, pv$mu[(n + 1):(2 * n), 2], col = c("#E495A5", "#39BEB1")[1], lwd = 3)

# OLS benchmark: logit-transform the accuracy scores, fit a linear model,
# and map the predictions back with the inverse logit
a <- RS$accuracy
logRa_a <- log(a/(1 - a))
rlr <- lm(logRa_a ~ dyslexia * iq, RS)
ols <- 1/(1 + exp(-predict(rlr, X)))
lines(x1, ols[1:n], col = c("#AD6071", "#00897D")[2], lwd = 3, lty = 2)
lines(x2, ols[(n + 1):(2 * n)], col = c("#AD6071", "#00897D")[1], lwd = 3, lty = 2)

### precision plot
par(new = TRUE)
plot(x1, pv$phi[1:n], col = c("#6E1D34", "#004E42")[2], lty = "11", type = "l", ylim = c(0, 
    max(pv$phi)), axes = F, ann = F, lwd = 2, xlim = range(RS$iq))
lines(x2, pv$phi[(n + 1):(2 * n)], col = c("#6E1D34", "#004E42")[1], lty = "11", 
    type = "l", lwd = 2)
axis(4)
mtext(expression(paste("Precision (", phi, ")", sep = "")), 4, line = 3)
legend("topleft", legend = c(expression(hat(mu)), expression(hat(phi)), "OLS"), lty = c(1, 
    3, 2), lwd = c(3, 2, 3), bty = "n")

Figure 6: Reading skills: predicted values of Dirichlet regression and OLS regression.

# the OLS log-ratio model from the figure code, refitted here for its summary
a <- RS$accuracy
logRa_a <- log(a/(1 - a))
rlr <- lm(logRa_a ~ dyslexia * iq, RS)
summary(rlr)
## 
## Call:
## lm(formula = logRa_a ~ dyslexia * iq, data = RS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.66405 -0.37966  0.03687  0.40887  2.50345 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.8067     0.2822   9.944 2.27e-12 ***
## dyslexiayes     -2.4113     0.4517  -5.338 4.01e-06 ***
## iq               0.7823     0.2992   2.615   0.0125 *  
## dyslexiayes:iq  -0.8457     0.4510  -1.875   0.0681 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 40 degrees of freedom
## Multiple R-squared:  0.6151, Adjusted R-squared:  0.5862 
## F-statistic: 21.31 on 3 and 40 DF,  p-value: 2.083e-08

summary(rs2)
## Call:
## DirichReg(formula = acc ~ dyslexia * iq | dyslexia + iq, data = RS, model =
## "alternative")
## 
## Standardized Residuals:
##                   Min       1Q   Median      3Q     Max
## 1 - accuracy  -1.5661  -0.8204  -0.5112  0.5211  3.4334
## accuracy      -3.4334  -0.5211   0.5112  0.8204  1.5661
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: 1 - accuracy
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: accuracy
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.8649     0.2991   6.235 4.52e-10 ***
## dyslexiayes     -1.4833     0.3029  -4.897 9.74e-07 ***
## iq               1.0676     0.3359   3.178 0.001482 ** 
## dyslexiayes:iq  -1.1625     0.3452  -3.368 0.000757 ***
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.5579     0.3336   4.670 3.01e-06 ***
## dyslexiayes   3.4931     0.5880   5.941 2.83e-09 ***
## iq            1.2291     0.4596   2.674  0.00749 ** 
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 65.9 on 7 df (56 BFGS + 2 NR Iterations)
## AIC: -117.8, BIC: -105.3
## Number of Observations: 44
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
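
Note that the dyslexiayes:iq interaction, which is only borderline in the OLS fit (p = 0.0681), is clearly significant in the Dirichlet model (p = 0.000757): unlike OLS, the Dirichlet model accommodates the group-specific heteroscedasticity through its precision submodel.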

confint(rs2)
## 
## 95% Confidence Intervals (original form)
## 
## - Beta-Parameters:
## Variable: 1 - accuracy
##   variable omitted
## 
## Variable: accuracy
##                   2.5%   Est.   97.5%
## (Intercept)      1.279   1.86   2.451
## dyslexiayes     -2.077  -1.48  -0.890
## iq               0.409   1.07   1.726
## dyslexiayes:iq  -1.839  -1.16  -0.486
## 
## - Gamma-Parameters
##               2.5%  Est.  97.5%
## (Intercept)  0.904  1.56   2.21
## dyslexiayes  2.341  3.49   4.65
## iq           0.328  1.23   2.13

confint(rs2, exp = TRUE)
## 
## 95% Confidence Intervals (exponentiated)
## 
## - Beta-Parameters:
## Variable: 1 - accuracy
##   variable omitted
## 
## Variable: accuracy
##                  2.5%  exp(Est.)   97.5%
## (Intercept)     3.592      6.455  11.601
## dyslexiayes     0.125      0.227   0.411
## iq              1.506      2.908   5.618
## dyslexiayes:iq  0.159      0.313   0.615
## 
## - Gamma-Parameters
##               2.5%  exp(Est.)   97.5%
## (Intercept)   2.47       4.75    9.13
## dyslexiayes  10.39      32.89  104.12
## iq            1.39       3.42    8.41
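
Since the precision uses a log link, the exponentiated gamma estimates act multiplicatively on phi. The point estimates above can be reproduced from the printed coefficients:

exp(c(1.5579, 3.4931, 1.2291))  # 4.75, 32.89, 3.42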

Code for Fig. 7:

gcol <- c("#E495A5", "#39BEB1")[3 - as.numeric(RS$dyslexia)]
tmt <- c(-3, 3)

par(mfrow = c(3, 2), cex = 0.8)

qqnorm(residuals(rlr, "pearson"), ylim = tmt, xlim = tmt, pch = 21, bg = gcol, main = "Normal Q-Q-Plot: OLS Residuals", 
    cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rlr, "pearson"), lty = 2)

qqnorm(residuals(rs2, "standardized")[, 2], ylim = tmt, xlim = tmt, pch = 21, bg = gcol, 
    main = "Normal Q-Q-Plot: DirichReg Residuals", cex = 0.75, lwd = 0.5)
abline(0, 1, lwd = 2)
qqline(residuals(rs2, "standardized")[, 2], lty = 2)

plot(ReadingSkills$iq, residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3, 
    3), main = "OLS Residuals", xlab = "IQ", ylab = "Pearson Residuals", cex = 0.75, 
    lwd = 0.5)
abline(h = 0, lty = 2)
plot(ReadingSkills$iq, residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol, 
    ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "IQ", ylab = "Standardized Residuals", 
    cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)

plot(fitted(rlr), residuals(rlr, "pearson"), pch = 21, bg = gcol, ylim = c(-3, 3), 
    main = "OLS Residuals", xlab = "Fitted", ylab = "Pearson Residuals", cex = 0.75, 
    lwd = 0.5)
abline(h = 0, lty = 2)
plot(fitted(rs2)[, 2], residuals(rs2, "standardized")[, 2], pch = 21, bg = gcol, 
    ylim = c(-3, 3), main = "DirichReg Residuals", xlab = "Fitted", ylab = "Standardized Residuals", 
    cex = 0.75, lwd = 0.5)
abline(h = 0, lty = 2)

Figure 7: Reading skills: residual plots of OLS and Dirichlet regression models.