# Statistical Matching using Optimal Transport

## Introduction

In this vignette we explain how some functions of the package are used to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented below are explained in the manuscript by Raphaël Jauslin and Yves Tillé (2021) <arXiv:2105.08379>.

## Contingency table

We will estimate the contingency table obtained when the factor variable representing the economic status, pl030, is crossed with a discretized version of the equivalized household income, eqIncome. To discretize the equivalized income, we compute the percentiles (0.15, 0.30, 0.45, 0.60, 0.75, 0.90) of the variable and define the categories as the intervals between these values.

# Load the survey data (laeken), the sampling-design utilities (sampling)
# and the matching functions used in this vignette (StratifiedSampling).
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Le chargement a nécessité le package : Matrix

# EU-SILC survey sample; rows with missing values are dropped so the
# matching variables used below are complete. N is the population size.
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)

# Xm are the matching variables and id are the identities of the units.
# The categorical matching variables (columns 2, 4, 5: db040, rb090,
# pb220a) are recoded in disjunctive (dummy) form.
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030

# categorical income, obtained by splitting eqIncome at the percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q[7] )] <- "(90,100]"

# variables of interest: economic status (Y) and income category (Z)
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id

# Population-level contingency table: the ground truth that the matching
# procedures below try to estimate from two separate samples.
# NOTE(review): the printed output includes Sum margins — presumably the
# table was displayed with addmargins(); verify against the original Rmd.
YZ <- table(cbind(Y,Z))
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

## Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected by simple random sampling without replacement, and the weights are equal to the inverse of the inclusion probabilities.


# sizes of the two samples
n1 <- 1000
n2 <- 500

# samples: srswor returns a 0/1 indicator vector of length N
# (simple random sampling without replacement)
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)

# extract matching units (matching variables observed in both samples)
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]

# extract the variables of interest: Y is observed only in sample 1,
# Z only in sample 2 (the statistical matching setting)
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)

# extract the identities of the sampled units
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]

# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2

# design weights: inverse of the inclusion probabilities (here n/N)
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)

# disjunctive (dummy) form of the variables of interest
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))

Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

## Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If the true population totals happen to be known, they can be used instead of the estimates computed inside the function.



# Harmonization: reweight both samples so that their weighted totals of
# the matching variables Xm agree with each other.
re <- harmonize(X1,d1,id1,X2,d2,id2)

# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

# harmonized weights of each sample
w1 <- re$w1
w2 <- re$w2

# check: both sets of harmonized weights reproduce the population totals
colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915

## Optimal transport matching

The statistical matching is done using the otmatch function. The contingency table is then estimated by extracting the id1 units (respectively id2 units) and applying the function tapply with the correct weights.


# Optimal transport matching: each row of the result links a unit of
# sample 1 (id1) to a unit of sample 2 (id2) with a positive weight.
# A unit of sample 1 may be linked to several units of sample 2
# (e.g. id1 == 2403 below appears twice).
object <- otmatch(X1,id1,X2,id2,w1,w2)
#>         id1    id2     weight
#> 1      2002   2002 12.3693242
#> 2403   2403  18503  6.6549822
#> 2403.1 2403 391904  7.0634999
#> 2404   2404 256903  0.7315227
#> 2404.1 2404 286302 12.9999855
#> 2505   2505 361005 12.9288819

# Attach the observed Y (sample 1) and Z (sample 2) values to each
# matched pair, then sum the transport weights over the crossed
# categories to estimate the contingency table.
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot),3)
#>        (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1     590.712  842.961  691.578  913.959  642.791  763.596  579.974 5025.572
#> 2      92.120  172.447  177.192  178.544  156.887  130.758   94.599 1002.547
#> 3      44.705   49.082   72.319   99.973   59.875   98.070   77.787  501.812
#> 4     157.860  141.832  135.632  145.374  147.062   98.797   38.622  865.179
#> 5     490.223  391.945  514.436  493.816  489.049  484.409  491.583 3355.461
#> 6      21.357   68.943   20.831   10.583   23.219    9.848   21.952  176.734
#> 7     139.597  210.281  226.080  256.982  184.846   74.058   87.854 1179.696
#> Sum  1536.574 1877.491 1838.068 2099.231 1703.730 1659.536 1392.370 12107.000

## Balanced sampling

As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.

# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#>         id1    id2     weight
#> 1      2002   2002 12.3693242
#> 2403   2403  18503  6.6549822
#> 2404   2404 256903  0.7315227
#> 2505   2505 361005 12.9288819
#> 3901   3901 474502 13.1517446
#> 4101.1 4101 138501  1.6895103

# Same estimation as before, but on the one-to-one matches selected by
# bsmatch; weights are divided by the selection probabilities BS$q.
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)

# transform NA into 0
YZ_bs[is.na(YZ_bs)] <- 0

# result
round(addmargins(YZ_bs),3)
#>        (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1     621.099  800.005  660.437  889.889  647.441  788.650  618.051 5025.572
#> 2     100.353  158.434  172.335  184.415  166.926  119.849  100.235 1002.547
#> 3      61.437   62.601   86.312   93.052   37.187   99.268   61.955  501.812
#> 4     149.533  114.650  138.389  165.151  123.416  108.776   65.264  865.179
#> 5     494.525  355.216  494.427  490.951  542.947  493.790  483.605 3355.461
#> 6      23.058   68.806   25.641   12.413   24.311   10.763   11.743  176.734
#> 7     145.609  200.086  237.068  267.377  188.187   60.880   80.490 1179.696
#> Sum  1595.614 1759.797 1814.609 2103.248 1730.415 1681.976 1421.342 12107.000

# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    651.249  862.018  647.975  885.562  679.848  772.688  526.232  5025.572
#> 2     87.458  170.447  182.069  171.690  157.381  120.034  113.468  1002.547
#> 3     48.814   24.211   72.143  107.549   61.131  101.476   86.488   501.812
#> 4    137.241  164.263  137.980  139.333  125.073  136.224   25.065   865.179
#> 5    455.864  388.036  536.370  525.841  471.763  469.060  508.527  3355.461
#> 6     23.058   68.806   25.641    0.000   23.053    0.000   36.176   176.734
#> 7    133.574  212.685  230.697  267.377  193.994   60.880   80.490  1179.696
#> Sum 1537.257 1890.466 1832.874 2097.351 1712.243 1660.362 1376.446 12107.000

## Prediction


# split the weights by id1: one vector of transport weights per unit
# of sample 1
q_l <- split(object$weight,f = object$id1)
# normalize within each id1 so the weights sum to 1 (conditional
# distribution of Z given the unit of sample 1)
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))

# predicted disjunctive matrix of Z for the units of sample 1:
# each row is the estimated category distribution of c_income
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
#> [6,]      0 0.8656330       0 0.0000000 0.134367       0        0