CRSO-vignette

This vignette presents an example workflow for CRSO. If a parallel backend is registered, CRSO will make use of all available registered workers. If not, it can still be run sequentially.

library(crso)
#> Loading required package: foreach
### Load example dataset consisting of TCGA melanoma (SKCM) patients.
data(skcm)
list2env(skcm.list,envir=globalenv())
#> <environment: R_GlobalEnv>
names(skcm.list) ### load D, P and cnv.dictionary
#> [1] "D"              "P"              "cnv.dictionary"
Q <- log10(P) ### Q is the penalty matrix derived from P

Specify parameters

The parameter values used are not the default values that are recommended in the CRSO paper, because the computation time may be excessive depending on the parallel availability. Instead, these were chosen to demonstrate how to use all of the functions in the package. The recommended parameter values are indicated in parentheses.

rule.thresh <- .05 # Minimum percentage of samples covered by each rule in the rule library. (Default = .03)
spr <- 4 # Phase 1 random sets per rule (default = 40, recommend at least 40) 
trn <- 40 # Phase 1 stop criteria (default = 16, recommend at most 24)
k.max <- 6 # Maximum RS size (default is 12)
max.stored <- 1000 # Max RS stored per K in phases 2 and 3 (no default, recommend at least 10^4)
max.nrs.ee <- 1000 # Phase 2 max rs per K (default is 10^5, recommend at least 10^5)
max.compute <- 10*max.nrs.ee # Max raw im size for phase 2 (default is 10^6, recommend at least 10*max.nrs.ee)
max.nrs.borrow <- 1000 # Phase 3 max rs per K (default is 10^5, recommend at least 10^5)
filter.thresh <- .03 # minimum assignment threshold per rule set (default is .03)
### Generalized core parameters:
num.gc.iter <- 10 # Number GC iterations (default is 100)
num.gc.eval <- 100 # Rulesets evaluated per K per GC iter (default is 1000)

Phase 1

set.seed(100)
rm.full <- buildRuleLibrary(D,rule.thresh) # build rule library
rm.ordered <- makePhaseOneOrderedRM(D,rm.full,spr,Q,trn,shouldPrint = TRUE) # run phase 1
#> [1] "Starting rules = 60"
#> Warning: executing %dopar% sequentially: no parallel backend registered
#> [1] "Remaining rules = 54"
#> [1] "Remaining rules = 49"
#> [1] "Remaining rules = 45"
#> [1] "Remaining rules = 41"
#> [1] "Remaining rules = 40"
#> Time difference of 0.4587963 mins

Phase 2

pool.sizes <- getPoolSizes(rm.ordered,k.max,max.nrs.ee,max.compute) 
### The pool size for each K is the number of rules considered for exhaustive evaluation in phase 2.
til.p2 <- makePhaseTwoImList(D,Q,rm.ordered,k.max,pool.sizes,max.stored,shouldPrint = TRUE) # Run phase 2
#> [1] "Starting Phase 2: Exhaustive Evaluation"
#> [1] "Evaluation time for k = 2: 0.02526 min"
#> [1] "Evaluation time for k = 3: 0.02557 min"
#> [1] "Evaluation time for k = 4: 0.02728 min"
#> [1] "Evaluation time for k = 5: 0.02103 min"
#> [1] "Evaluation time for k = 6: 0.02239 min"
#> [1] "Total Phase 2 Time: 0.127 min"
### TIL stands for top index list. The output of phase two is a list of top index matrices for each k.  Each index matrix contains the rule sets ordered by performance. For example the best performing rule set of size 3 will be the first row of the K.3 index matrix. For K=1 the index matrix is actually a vector.

Phase 3 and Filtering

til.p3 <- makePhaseThreeImList(D,Q,rm.ordered,til.p2,pool.sizes,max.stored,max.nrs.borrow,shouldPrint = TRUE)
#> [1] "Starting Phase 3: neighbor expansion"
#> [1] "Updated top im for K = 3, time = 0.09706 min"
#> [1] "Updated top im for K = 4, time = 0.1444 min"
#> [1] "Updated top im for K = 5, time = 0.1455 min"
#> [1] "Updated top im for K = 6, time = 0.1626 min"
#> [1] "Total Phase 3 Time: 0.5496 min"
### Make TIL for phase 3 by updating phase two til to consider neighbor rule sets.

til.filtered <- makeFilteredImList(D,Q,rm.ordered,til.p3,filter.thresh)
### Filter the phase 3 results to only include rule sets for which every rule is assigned to a minimum percentage of samples, default is 3%
tpl.filtered <- evaluateListOfIMs(D,Q,rm.ordered,til.filtered) 
### Get top performance list (TPL), which contains the objective function score of all of the rule sets in til.filtered

Extract best rule sets and core rule set

best.rs.list <- getBestRsList(rm.ordered,tpl.filtered,til.filtered)
### This is a list of the best rule sets for all K

core.K <- getCoreK(D,rm.ordered,tpl.filtered,til.filtered) # Determine core K
core.ruleset <- getCoreRS(D,rm.ordered,tpl.filtered,til.filtered) # Extract core rule set
print(core.ruleset)
#>                      r1                      r2                      r3 
#>      "BRAF-M.CDKN2A-MD"      "CDKN2A-MD.NRAS-M"        "BRAF-M.PTEN-MD" 
#>                     r29                     r13                     r32 
#>         "NRAS-M.TP53-M" "BRAF-M.HIPK2/TBXAS1-A"    "BRAF-M.RN7SKP254-A"

Make generalized core results

list.subset.cores <- makeSubCoreList(D,Q,rm.ordered,til.filtered,num.gc.iter,num.gc.eval)
#> [1] "Subset Core Iteration = 1"
#> [1] "Subset Core Iteration = 2"
#> [1] "Subset Core Iteration = 3"
#> [1] "Subset Core Iteration = 4"
#> [1] "Subset Core Iteration = 5"
#> [1] "Subset Core Iteration = 6"
#> [1] "Subset Core Iteration = 7"
#> [1] "Subset Core Iteration = 8"
#> [1] "Subset Core Iteration = 9"
#> [1] "Subset Core Iteration = 10"
#> Time difference of 10.31857 secs
### list.subset.cores is a list of core rule set derived from subsampling iterations

gcr.df <- getGCRs(list.subset.cores) # Generalized core rules
print(gcr.df)
#>                     GCR Confidence Confidence.Level
#> 8         NRAS-M.TP53-M        100             High
#> 7      CDKN2A-MD.NRAS-M        100             High
#> 6      BRAF-M.CDKN2A-MD        100             High
#> 5        BRAF-M.PTEN-MD         90             High
#> 4 BRAF-M.HIPK2/TBXAS1-A         60           Medium
#> 3    BRAF-M.RN7SKP254-A         40           Medium
#> 2         BRAF-M.TP53-M         30              Low
#> 1        ARID2-M.NRAS-M         10              Low
gcd.df <- getGCDs(list.subset.cores) # Generalized core duos
print(gcd.df)
#>                     GCD Confidence Confidence.Level
#> 8         NRAS-M.TP53-M        100             High
#> 7      CDKN2A-MD.NRAS-M        100             High
#> 6      BRAF-M.CDKN2A-MD        100             High
#> 5        BRAF-M.PTEN-MD         90             High
#> 4 BRAF-M.HIPK2/TBXAS1-A         60           Medium
#> 3    BRAF-M.RN7SKP254-A         40           Medium
#> 2         BRAF-M.TP53-M         30              Low
#> 1        ARID2-M.NRAS-M         10              Low
gce.df <- getGCEs(list.subset.cores) # Generalized core events
print(gce.df)
#>              GCE Confidence Confidence.Level
#> 8         TP53-M        100             High
#> 7         NRAS-M        100             High
#> 6      CDKN2A-MD        100             High
#> 5         BRAF-M        100             High
#> 4        PTEN-MD         90             High
#> 3 HIPK2/TBXAS1-A         60           Medium
#> 2    RN7SKP254-A         40           Medium
#> 1        ARID2-M         10              Low