Date: 02/04/2014
I have updated the whole valid function to remove the selection bias that was previously included in the sPLS-DA and the sPLS functions. Thank you to Amrit to pinpoint this bug, as well as other mixOmics users (J. Labus)
Major changes include:
library(knitr) # to generate this document
library(mixOmics) # latest version on CRAN 5.0-1
## Loading required package: MASS
## Loading required package: lattice
data(liver.toxicity)
Source the new set of functions
source("valid_full_version_updated.R")
X <- liver.toxicity$gene
Y <- liver.toxicity$clinic
res.pls = pls(X, Y, ncomp = 3, mode = "regression")
valid.pls = valid.pls.updated(res.pls, validation = "Mfold", folds = 5, criterion = "all",
progressBar = FALSE)
valid.pls$Q2.total
## [1] 0.26666 0.03406 0.06376
valid.pls$Q2
## ncomp 1 ncomp 2 ncomp 3
## BUN.mg.dL. 0.50221 0.01509 -0.013839
## Creat.mg.dL. 0.12360 -0.06334 -0.024768
## TP.g.dL. 0.01626 0.08512 0.231063
## ALB.g.dL. 0.03632 0.16316 0.287502
## ALT.IU.L. 0.46902 0.10929 0.012289
## SDH.IU.L. 0.09432 -0.06386 0.056458
## AST.IU.L. 0.47786 0.13643 0.002404
## ALP.IU.L. 0.20431 -0.02978 -0.050536
## TBA.umol.L. 0.57496 -0.03207 0.085775
## Cholesterol.mg.dL. 0.16775 0.02732 -0.019525
valid.pls$R2
## ncomp 1 ncomp 2 ncomp 3
## BUN.mg.dL. 0.535141 0.573951 0.57662
## Creat.mg.dL. 0.057982 0.023179 0.03022
## TP.g.dL. 0.029391 0.035825 0.29676
## ALB.g.dL. 0.001032 0.133384 0.44700
## ALT.IU.L. 0.635903 0.739456 0.74189
## SDH.IU.L. 0.031214 0.002037 0.08218
## AST.IU.L. 0.601682 0.716459 0.73606
## ALP.IU.L. 0.150172 0.129972 0.09423
## TBA.umol.L. 0.591480 0.570781 0.73485
## Cholesterol.mg.dL. 0.219515 0.326594 0.31898
X <- liver.toxicity$gene
Y <- liver.toxicity$clinic
res.spls = spls(X, Y, ncomp = 3, keepX = c(10, 10, 10), keepY = c(5, 5, 5),
mode = "regression")
valid.spls = valid.spls.updated(res.spls, validation = "Mfold", folds = 5, criterion = "all",
progressBar = FALSE)
valid.spls$Q2.total
## [1] 0.28520 0.11227 -0.03314
valid.spls$Q2
## ncomp 1 ncomp 2 ncomp 3
## BUN.mg.dL. 0.37336 0.000000 0.046096
## Creat.mg.dL. 0.06349 -0.003505 -0.045557
## TP.g.dL. 0.06349 0.257093 -0.099956
## ALB.g.dL. 0.06349 0.349555 -0.063206
## ALT.IU.L. 0.67267 0.000000 0.000000
## SDH.IU.L. 0.06349 0.132681 -0.000726
## AST.IU.L. 0.66346 0.000000 0.012613
## ALP.IU.L. 0.06349 0.000000 -0.089248
## TBA.umol.L. 0.54465 0.272203 0.000000
## Cholesterol.mg.dL. 0.28039 -0.014659 -0.011517
valid.spls$R2
## ncomp 1 ncomp 2 ncomp 3
## BUN.mg.dL. 0.497622 0.4976219 0.493780
## Creat.mg.dL. 0.055881 0.0004329 0.004725
## TP.g.dL. 0.061015 0.2739985 0.205509
## ALB.g.dL. 0.004819 0.4056349 0.354558
## ALT.IU.L. 0.851450 0.8514501 0.851450
## SDH.IU.L. 0.015530 0.0376482 0.038165
## AST.IU.L. 0.816152 0.8161524 0.820803
## ALP.IU.L. 0.009611 0.0096109 0.001480
## TBA.umol.L. 0.688953 0.8006679 0.800668
## Cholesterol.mg.dL. 0.443735 0.4452083 0.313441
This new output outputs the frequency of the selected variables across the folds
valid.spls$feature$stable.X
## [[1]]
## A_42_P620915 A_43_P14131 A_43_P22616 A_43_P23376 A_42_P705413
## 1.0 1.0 1.0 1.0 0.8
## A_43_P10606 A_43_P17415 A_42_P675890 A_42_P802628 A_42_P840776
## 0.8 0.8 0.6 0.6 0.4
## A_43_P10003 A_43_P11724 A_42_P550264 A_42_P681650 A_42_P758454
## 0.4 0.4 0.2 0.2 0.2
## A_42_P825290 A_43_P10006 A_43_P16842
## 0.2 0.2 0.2
##
## [[2]]
## A_42_P505480 A_42_P576823 A_42_P591665 A_42_P678904 A_42_P843603
## 1.0 1.0 1.0 1.0 1.0
## A_43_P11285 A_42_P794613 A_42_P531684 A_43_P10088 A_43_P12448
## 1.0 0.6 0.4 0.4 0.4
## A_43_P15363 A_43_P19039 A_42_P477643 A_42_P618538 A_42_P637641
## 0.4 0.4 0.2 0.2 0.2
## A_42_P773054 A_42_P792017 A_42_P836392 A_43_P12413
## 0.2 0.2 0.2 0.2
##
## [[3]]
## A_42_P467372 A_42_P656513 A_42_P693789 A_43_P22375 A_42_P473074
## 0.4 0.4 0.4 0.4 0.2
## A_42_P474794 A_42_P504507 A_42_P505298 A_42_P512802 A_42_P519373
## 0.2 0.2 0.2 0.2 0.2
## A_42_P524633 A_42_P525370 A_42_P563285 A_42_P603705 A_42_P606133
## 0.2 0.2 0.2 0.2 0.2
## A_42_P611127 A_42_P618055 A_42_P622652 A_42_P637583 A_42_P648819
## 0.2 0.2 0.2 0.2 0.2
## A_42_P668600 A_42_P677172 A_42_P704869 A_42_P721911 A_42_P754353
## 0.2 0.2 0.2 0.2 0.2
## A_42_P754654 A_42_P755687 A_42_P756431 A_42_P772157 A_42_P786715
## 0.2 0.2 0.2 0.2 0.2
## A_42_P790250 A_42_P793123 A_43_P10485 A_43_P10703 A_43_P10754
## 0.2 0.2 0.2 0.2 0.2
## A_43_P11417 A_43_P13147 A_43_P13739 A_43_P16637 A_43_P17387
## 0.2 0.2 0.2 0.2 0.2
## A_43_P17889 A_43_P18498 A_43_P20149 A_43_P20329 A_43_P20814
## 0.2 0.2 0.2 0.2 0.2
## A_43_P21538
## 0.2
X <- liver.toxicity$gene
Y <- as.factor(liver.toxicity$treatment[, "Dose.Group"]) # take a factor here
res.plsda = plsda(X, Y, ncomp = 3)
valid.plsda = valid.plsda.updated(res.plsda, method = "all", validation = "Mfold",
folds = 5, progressBar = FALSE)
valid.plsda
## max.dist centroids.dist mahalanobis.dist
## ncomp 1 0.6397 0.5167 0.5167
## ncomp 2 0.4231 0.4218 0.3885
## ncomp 3 0.2654 0.2654 0.2500
## attr(,"class")
## [1] "valid" "plsda.mthd"
X <- liver.toxicity$gene
Y <- as.factor(liver.toxicity$treatment[, "Dose.Group"]) # take a factor here
res.splsda = splsda(X, Y, ncomp = 3, keepX = c(10, 10, 10))
valid.splsda = valid.splsda.updated(res.splsda, method = "all", validation = "Mfold",
folds = 5, progressBar = FALSE)
valid.splsda$error.rate
## max.dist centroids.dist mahalanobis.dist
## ncomp 1 0.6103 0.37308 0.3731
## ncomp 2 0.1705 0.09359 0.2333
## ncomp 3 0.1410 0.09487 0.1103
This new output outputs the final variable selection from the whole data set (from res.splsda)
valid.splsda$features$final
## [[1]]
## value.var
## A_43_P12764 0.631667
## A_43_P20438 0.512896
## A_43_P18047 0.371786
## A_43_P14921 -0.258900
## A_43_P14195 0.210560
## A_42_P496622 -0.203754
## A_43_P13002 0.164840
## A_42_P493879 0.116900
## A_42_P595591 0.077066
## A_43_P13297 0.007231
##
## [[2]]
## value.var
## A_42_P722761 0.49840
## A_42_P730438 0.48726
## A_42_P672425 0.47922
## A_42_P537398 0.38076
## A_43_P10810 0.20839
## A_42_P781929 0.20636
## A_43_P14413 0.14156
## A_43_P14952 0.13610
## A_42_P783205 0.11613
## A_43_P14795 0.03851
##
## [[3]]
## value.var
## A_43_P11409 0.518118
## A_43_P10726 0.431280
## A_43_P11113 -0.426560
## A_42_P660864 0.364193
## A_43_P23061 0.307629
## A_42_P814129 0.260699
## A_43_P17342 0.227043
## A_43_P21483 0.102224
## A_43_P20754 0.079767
## A_42_P548566 -0.001211
For the fun of plotting a graphic!
matplot(valid.splsda$error.rate[, "max.dist"], type = "l", col = "blue")