Testing the new function valid

Date: 02/04/2014

I have updated the whole valid function to remove the selection bias that was previously included in the sPLS-DA and the sPLS functions. Thank you to Amrit to pinpoint this bug, as well as other mixOmics users (J. Labus)

Major changes include:

Data: liver.toxicity

library(knitr)  # to generate this document
library(mixOmics)  # latest version on CRAN 5.0-1
## Loading required package: MASS
## Loading required package: lattice
data(liver.toxicity)

Source the new set of functions

source("valid_full_version_updated.R")

PLS

X <- liver.toxicity$gene
Y <- liver.toxicity$clinic

res.pls = pls(X, Y, ncomp = 3, mode = "regression")
valid.pls = valid.pls.updated(res.pls, validation = "Mfold", folds = 5, criterion = "all", 
    progressBar = FALSE)
valid.pls$Q2.total
## [1] 0.26666 0.03406 0.06376
valid.pls$Q2
##                    ncomp 1  ncomp 2   ncomp 3
## BUN.mg.dL.         0.50221  0.01509 -0.013839
## Creat.mg.dL.       0.12360 -0.06334 -0.024768
## TP.g.dL.           0.01626  0.08512  0.231063
## ALB.g.dL.          0.03632  0.16316  0.287502
## ALT.IU.L.          0.46902  0.10929  0.012289
## SDH.IU.L.          0.09432 -0.06386  0.056458
## AST.IU.L.          0.47786  0.13643  0.002404
## ALP.IU.L.          0.20431 -0.02978 -0.050536
## TBA.umol.L.        0.57496 -0.03207  0.085775
## Cholesterol.mg.dL. 0.16775  0.02732 -0.019525
valid.pls$R2
##                     ncomp 1  ncomp 2 ncomp 3
## BUN.mg.dL.         0.535141 0.573951 0.57662
## Creat.mg.dL.       0.057982 0.023179 0.03022
## TP.g.dL.           0.029391 0.035825 0.29676
## ALB.g.dL.          0.001032 0.133384 0.44700
## ALT.IU.L.          0.635903 0.739456 0.74189
## SDH.IU.L.          0.031214 0.002037 0.08218
## AST.IU.L.          0.601682 0.716459 0.73606
## ALP.IU.L.          0.150172 0.129972 0.09423
## TBA.umol.L.        0.591480 0.570781 0.73485
## Cholesterol.mg.dL. 0.219515 0.326594 0.31898

sPLS

X <- liver.toxicity$gene
Y <- liver.toxicity$clinic

res.spls = spls(X, Y, ncomp = 3, keepX = c(10, 10, 10), keepY = c(5, 5, 5), 
    mode = "regression")
valid.spls = valid.spls.updated(res.spls, validation = "Mfold", folds = 5, criterion = "all", 
    progressBar = FALSE)
valid.spls$Q2.total
## [1]  0.28520  0.11227 -0.03314
valid.spls$Q2
##                    ncomp 1   ncomp 2   ncomp 3
## BUN.mg.dL.         0.37336  0.000000  0.046096
## Creat.mg.dL.       0.06349 -0.003505 -0.045557
## TP.g.dL.           0.06349  0.257093 -0.099956
## ALB.g.dL.          0.06349  0.349555 -0.063206
## ALT.IU.L.          0.67267  0.000000  0.000000
## SDH.IU.L.          0.06349  0.132681 -0.000726
## AST.IU.L.          0.66346  0.000000  0.012613
## ALP.IU.L.          0.06349  0.000000 -0.089248
## TBA.umol.L.        0.54465  0.272203  0.000000
## Cholesterol.mg.dL. 0.28039 -0.014659 -0.011517
valid.spls$R2
##                     ncomp 1   ncomp 2  ncomp 3
## BUN.mg.dL.         0.497622 0.4976219 0.493780
## Creat.mg.dL.       0.055881 0.0004329 0.004725
## TP.g.dL.           0.061015 0.2739985 0.205509
## ALB.g.dL.          0.004819 0.4056349 0.354558
## ALT.IU.L.          0.851450 0.8514501 0.851450
## SDH.IU.L.          0.015530 0.0376482 0.038165
## AST.IU.L.          0.816152 0.8161524 0.820803
## ALP.IU.L.          0.009611 0.0096109 0.001480
## TBA.umol.L.        0.688953 0.8006679 0.800668
## Cholesterol.mg.dL. 0.443735 0.4452083 0.313441

This new output outputs the frequency of the selected variables across the folds

valid.spls$feature$stable.X
## [[1]]
## A_42_P620915  A_43_P14131  A_43_P22616  A_43_P23376 A_42_P705413 
##          1.0          1.0          1.0          1.0          0.8 
##  A_43_P10606  A_43_P17415 A_42_P675890 A_42_P802628 A_42_P840776 
##          0.8          0.8          0.6          0.6          0.4 
##  A_43_P10003  A_43_P11724 A_42_P550264 A_42_P681650 A_42_P758454 
##          0.4          0.4          0.2          0.2          0.2 
## A_42_P825290  A_43_P10006  A_43_P16842 
##          0.2          0.2          0.2 
## 
## [[2]]
## A_42_P505480 A_42_P576823 A_42_P591665 A_42_P678904 A_42_P843603 
##          1.0          1.0          1.0          1.0          1.0 
##  A_43_P11285 A_42_P794613 A_42_P531684  A_43_P10088  A_43_P12448 
##          1.0          0.6          0.4          0.4          0.4 
##  A_43_P15363  A_43_P19039 A_42_P477643 A_42_P618538 A_42_P637641 
##          0.4          0.4          0.2          0.2          0.2 
## A_42_P773054 A_42_P792017 A_42_P836392  A_43_P12413 
##          0.2          0.2          0.2          0.2 
## 
## [[3]]
## A_42_P467372 A_42_P656513 A_42_P693789  A_43_P22375 A_42_P473074 
##          0.4          0.4          0.4          0.4          0.2 
## A_42_P474794 A_42_P504507 A_42_P505298 A_42_P512802 A_42_P519373 
##          0.2          0.2          0.2          0.2          0.2 
## A_42_P524633 A_42_P525370 A_42_P563285 A_42_P603705 A_42_P606133 
##          0.2          0.2          0.2          0.2          0.2 
## A_42_P611127 A_42_P618055 A_42_P622652 A_42_P637583 A_42_P648819 
##          0.2          0.2          0.2          0.2          0.2 
## A_42_P668600 A_42_P677172 A_42_P704869 A_42_P721911 A_42_P754353 
##          0.2          0.2          0.2          0.2          0.2 
## A_42_P754654 A_42_P755687 A_42_P756431 A_42_P772157 A_42_P786715 
##          0.2          0.2          0.2          0.2          0.2 
## A_42_P790250 A_42_P793123  A_43_P10485  A_43_P10703  A_43_P10754 
##          0.2          0.2          0.2          0.2          0.2 
##  A_43_P11417  A_43_P13147  A_43_P13739  A_43_P16637  A_43_P17387 
##          0.2          0.2          0.2          0.2          0.2 
##  A_43_P17889  A_43_P18498  A_43_P20149  A_43_P20329  A_43_P20814 
##          0.2          0.2          0.2          0.2          0.2 
##  A_43_P21538 
##          0.2

PLS-DA

X <- liver.toxicity$gene
Y <- as.factor(liver.toxicity$treatment[, "Dose.Group"])  # take a factor here

res.plsda = plsda(X, Y, ncomp = 3)
valid.plsda = valid.plsda.updated(res.plsda, method = "all", validation = "Mfold", 
    folds = 5, progressBar = FALSE)
valid.plsda
##         max.dist centroids.dist mahalanobis.dist
## ncomp 1   0.6397         0.5167           0.5167
## ncomp 2   0.4231         0.4218           0.3885
## ncomp 3   0.2654         0.2654           0.2500
## attr(,"class")
## [1] "valid"      "plsda.mthd"

sPLS-DA

X <- liver.toxicity$gene
Y <- as.factor(liver.toxicity$treatment[, "Dose.Group"])  # take a factor here

res.splsda = splsda(X, Y, ncomp = 3, keepX = c(10, 10, 10))
valid.splsda = valid.splsda.updated(res.splsda, method = "all", validation = "Mfold", 
    folds = 5, progressBar = FALSE)
valid.splsda$error.rate
##         max.dist centroids.dist mahalanobis.dist
## ncomp 1   0.6103        0.37308           0.3731
## ncomp 2   0.1705        0.09359           0.2333
## ncomp 3   0.1410        0.09487           0.1103

This new output outputs the final variable selection from the whole data set (from res.splsda)

valid.splsda$features$final
## [[1]]
##              value.var
## A_43_P12764   0.631667
## A_43_P20438   0.512896
## A_43_P18047   0.371786
## A_43_P14921  -0.258900
## A_43_P14195   0.210560
## A_42_P496622 -0.203754
## A_43_P13002   0.164840
## A_42_P493879  0.116900
## A_42_P595591  0.077066
## A_43_P13297   0.007231
## 
## [[2]]
##              value.var
## A_42_P722761   0.49840
## A_42_P730438   0.48726
## A_42_P672425   0.47922
## A_42_P537398   0.38076
## A_43_P10810    0.20839
## A_42_P781929   0.20636
## A_43_P14413    0.14156
## A_43_P14952    0.13610
## A_42_P783205   0.11613
## A_43_P14795    0.03851
## 
## [[3]]
##              value.var
## A_43_P11409   0.518118
## A_43_P10726   0.431280
## A_43_P11113  -0.426560
## A_42_P660864  0.364193
## A_43_P23061   0.307629
## A_42_P814129  0.260699
## A_43_P17342   0.227043
## A_43_P21483   0.102224
## A_43_P20754   0.079767
## A_42_P548566 -0.001211

For the fun of plotting a graphic!

matplot(valid.splsda$error.rate[, "max.dist"], type = "l", col = "blue")

plot of chunk unnamed-chunk-2