Commit a6edb51f authored by Edi Prifti's avatar Edi Prifti

- setting default evalToFit to "accuracy_" making it robust for other score...

- setting default evalToFit to "accuracy_" making it robust for other score types during evaluation process
- scorelist for models with 1 variable
- improving documentation terga2
- improving messaging during run
- advancing vignette
parent ccab5432
This diff is collapsed.
#' metal: metal searching algorithm
#' @description metal is a model search algorithm on a list of beam search approach and get the populations into GA.
#' @param sparsity: number of features in a given model. This is a vector with multiple lengths.
......@@ -17,7 +20,7 @@
#' @param verbose: print out information on the progress of the algorithm (default:TRUE)
#' @param debug: print debug information (default:FALSE)
#' @param print_ind_method: One of c("short","graphical") indicates how to print a model and subsequently a population during the run (default:"short").
#' @param parallelize.folds: parallelize folds when cross-validating (default:TRUE)
#' @param parallelize_folds: parallelize folds when cross-validating (default:TRUE)
#' @param nCores: the number of cores to execute the program. If nCores=1 then the program runs in non-parallel mode
#' @param seed: the seed to be used for reproducibility. If seed=NULL then it is not taken into account (default:NULL).
#' @param experiment.id: The id of the experiment that is to be used in the plots and comparative analyses (default is the learner's name, when not specified)
......@@ -25,7 +28,7 @@
#' @param experiment.save: Data from an experiment can be saved with different levels of completeness, with options to be selected from c("nothing", "minimal", "full"), default is "minimal"
#' @param list.clfs: list of Genetor and Unificator
#' @param unificator.method: the default unificator is terga2. Specifying any other method will stop the program.
#' @param unificator.evolveMethod: the default evolve method used by the unificator which is by default a terga2.
#' @param unificator.evolver: the default evolve method used by the unificator which is by default a terga2.
#' @return an object containing a list of parameters for this classifier
#' @export
metal <- function(sparsity = 1:10, max.nb.features = 1000,
......@@ -39,13 +42,13 @@ metal <- function(sparsity = 1:10, max.nb.features = 1000,
# evaluation options
objective = "auc", k_penalty = 0, evalToFit = "accuracy_", estimate_coefs = FALSE, intercept = "NULL", testAllSigns = FALSE,
# output options
plot = FALSE, verbose = TRUE, debug = FALSE, print_ind_method = "short", parallelize.folds = TRUE,
plot = FALSE, verbose = TRUE, debug = FALSE, print_ind_method = "short", parallelize_folds = TRUE,
# computing options
nCores = 10, seed = "NULL", #maxTime = Inf,
# experiment options
experiment.id = "NULL", experiment.description = "NULL", experiment.save = "nothing",
# list of clfs
list.clfs="NULL", unificator.method = "terga2", unificator.evolveMethod = "v2m_new")
list.clfs="NULL", unificator.method = "terga2", unificator.evolver = "v2m_new")
{
# standard means that we use the standard heuristics
clf <- list()
......@@ -69,7 +72,7 @@ metal <- function(sparsity = 1:10, max.nb.features = 1000,
# Computing options
clf$params$nCores <- nCores # parallel computing
clf$params$parallel <- nCores > 1 # parallel computing
clf$params$parallelize.folds <- parallelize.folds
clf$params$parallelize_folds <- parallelize_folds
clf$params$parallel.local <- FALSE
clf$params$seed <- seed # fix the seed to be able to reproduce results
......@@ -107,7 +110,7 @@ metal <- function(sparsity = 1:10, max.nb.features = 1000,
# unificator information
clf$params$unificator.method <- unificator.method
clf$params$unificator.evolveMethod<- unificator.evolveMethod
clf$params$unificator.evolver <- unificator.evolver
# add here the list of classifiers
if(is.null(list.clfs))
......@@ -160,7 +163,7 @@ metal_fit <- function(X, y, clf)
if(clf$params$verbose) printClassifier(obj = clf)
# parallel switch
#parfold <- FALSE
# if(!clf$params$parallelize.folds & clf$params$parallel)
# if(!clf$params$parallelize_folds & clf$params$parallel)
# {
# parfold <- TRUE
# }
......@@ -192,7 +195,7 @@ metal_fit <- function(X, y, clf)
{
g.clf <- list.clfs[[i]]
g.clf$params$cluster <- clf$params$cluster
g.clf$params$parallelize.folds <- FALSE
g.clf$params$parallelize_folds <- FALSE
# initiate the current sparsity
g.clf$params$current_seed <- clf$params$current_seed
g.clf$params$objective <- clf$params$objective
......
......@@ -90,7 +90,7 @@ generator_metal <- function(mat, clf = NULL)
clf.g <- terda(language = l)
},
terga2={
clf.g <- terga2(language = l, nb_gen = 1)
clf.g <- terga2(language = l, nb_generations = 1)
}
)
generator[[length(generator)+1]] <- clf.g
......@@ -153,7 +153,7 @@ generator_metal <- function(mat, clf = NULL)
clf.g <- terga2(language = l,
sparsity = clf$params$sparsity,
nCores = clf$params$nCores,
nb_gen = 1, # random generation without evolution
nb_generations = 1, # random generation without evolution
evalToFit = clf$params$evalToFit,
seed = clf$params$seed)
}
......@@ -174,9 +174,9 @@ generator_metal <- function(mat, clf = NULL)
}
generator[[length(generator)+1]] <- terga2(sparsity = clf$params$sparsity,
evolveMethod = clf$params$unificator.evolveMethod,
evolver = clf$params$unificator.evolver,
language = l,
nb_gen = 50,
nb_generations = 50,
size_pop_random = 0,
size_pop = 500,
evalToFit = clf$params$evalToFit,
......
This diff is collapsed.
......@@ -226,7 +226,7 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
#cl <- makeCluster(clf$params$nCores, type = "FORK",outfile='LOG.TXT')
#registerDoParallel(cl)
clf$params$cluster <- cl
if(clf$params$parallelize.folds)
if(clf$params$parallelize_folds)
{
print("Starting cross validation in parallel")
# execute each crossVal in //
......@@ -270,7 +270,7 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
res.all[[i]] <- runClassifier(X = x_train, y = y_train, clf = clf)
} # end of folds loop (for)
} # end else parallelize.folds
} # end else parallelize_folds
# Dispatch the results in the custom output structure
......
......@@ -43,7 +43,7 @@
#' @param debug: print debug information (default:FALSE)
#' @param print_ind_method: One of c("short","graphical") indicates how to print a model and subsequently a population during the run (default:"short").
#' @param nCores: the number of cores to execute the program. If nCores=1 then the program runs in non-parallel mode
#' @param parallelize.folds: parallelize folds when cross-validating (default:TRUE)
#' @param parallelize_folds: parallelize folds when cross-validating (default:TRUE)
#' @param seed: the seed to be used for reproducibility. If seed=NULL then it is not taken into account (default:NULL).
#### TODO check
#' @param experiment.id: The id of the experiment that is to be used in the plots and comparative analyses (default is the learner's name, when not specified)
......@@ -69,7 +69,7 @@ terBeam <- function(sparsity = 1:5, max.nb.features = 1000,
# evaluation options
objective = "auc", k_penalty=0, evalToFit = 'auc_', estimate_coefs=FALSE, intercept = "NULL", testAllSigns = FALSE,
# output options
plot = FALSE, verbose = TRUE, debug=FALSE, print_ind_method = "short", parallelize.folds=TRUE,
plot = FALSE, verbose = TRUE, debug=FALSE, print_ind_method = "short", parallelize_folds = TRUE,
# computing options
nCores = 4, seed = "NULL", #maxTime = Inf,
# experiment options
......@@ -104,7 +104,7 @@ terBeam <- function(sparsity = 1:5, max.nb.features = 1000,
# Computing options
clf$params$nCores <- nCores # parallel computing
clf$params$parallel <- nCores > 1 # parallel computing
clf$params$parallelize.folds <- parallelize.folds
clf$params$parallelize_folds <- parallelize_folds
clf$params$parallel.local <- FALSE
clf$params$seed <- seed # fix the seed to be able to reproduce results
......
......@@ -45,7 +45,7 @@
#' @param verbose: print out information on the progress of the algorithm (default:TRUE)
#' @param debug: print out debug information when activated (default: FALSE)
#' @param print_ind_method: One of c("short","graphical") indicates how to print a model and subsequently a population during the run (default:"short").
#' @param parallelize.folds: parallelize folds when cross-validating (default:TRUE)
#' @param parallelize_folds: parallelize folds when cross-validating (default:TRUE)
#' @param nCores: the number of cores to execute the program. If nCores=1 then the program runs in non-parallel mode
#' @param seed: the seed to be used for reproducibility. If seed=NULL then it is not taken into account (default:NULL).
#### TODO check
......@@ -67,7 +67,7 @@ terda <- function(sparsity = 5, nIterations = 5, max.nb.features = 1000, kBest =
# population options
popSaveFile = "NULL", final.pop.perc = 100, alpha = 0.5,
# output options
plot = FALSE, verbose = TRUE, debug = FALSE, print_ind_method = "short", parallelize.folds = TRUE,
plot = FALSE, verbose = TRUE, debug = FALSE, print_ind_method = "short", parallelize_folds = TRUE,
# computing options
nCores = 4, seed = "NULL", #maxTime = Inf,
# experiment options
......@@ -105,7 +105,7 @@ terda <- function(sparsity = 5, nIterations = 5, max.nb.features = 1000, kBest =
# Computing options
clf$params$nCores <- nCores # parallel computing
clf$params$parallel <- (nCores > 1) # parallel computing
clf$params$parallelize.folds <- parallelize.folds
clf$params$parallelize_folds <- parallelize_folds
clf$params$parallel.local <- FALSE
clf$params$seed <- seed # fix the seed to be able to reproduce results
......
......@@ -47,12 +47,12 @@
#### TOCHECK is this still needed for tergaV2
#' @param convergence: should the algorithm converge when the best individual is not improving (default:TRUE).
#' @param convergence_steps: the number of generations after which we consider convergence (default:10).
#' @param evolve.k1: weather or not to evaluate exhaustively the features for k_sparse=1. This will take a lot of time if the dataset is large, thus the possibility to evolve this using the GA. (default:TRUE)
#' @param evolve_k1: whether or not to evolve the features for k_sparse=1 using the GA instead of evaluating them exhaustively. Exhaustive evaluation will take a lot of time if the dataset is large, thus the possibility to evolve this using the GA. (default:TRUE)
#' @param verbose: print out information on the progress of the algorithm (default:TRUE)
#' @param debug: print debug information (default:FALSE)
#' @param print_ind_method: One of c("short","graphical") indicates how to print a model and subsequently a population during the run (default:"short").
#' @param parallelize.folds: parallelize folds when cross-validating (default:TRUE)
#' @param nb_gen: maximum number of generations to evolve the population.
#' @param parallelize_folds: parallelize folds when cross-validating (default:TRUE)
#' @param nb_generations: maximum number of generations to evolve the population.
#' @param nCores: the number of cores to execute the program. If nCores=1 then the program runs in non-parallel mode
#' @param seed: the seed to be used for reproducibility. If seed=NULL then it is not taken into account (default:NULL).
#### TODO check
......@@ -77,9 +77,9 @@ terga1 <- function(sparsity = c(1:10),
# mutation options
mutate_size = 70, mutate_rate = 50,
# evolution options
nb_gen = 100, convergence = TRUE, convergence_steps = 10, evolve.k1 = TRUE,
nb_generations = 100, convergence = TRUE, convergence_steps = 10, evolve_k1 = TRUE,
# output options
plot = FALSE, verbose = TRUE, debug=FALSE, print_ind_method = "short", parallelize.folds=TRUE,
plot = FALSE, verbose = TRUE, debug=FALSE, print_ind_method = "short", parallelize_folds = TRUE,
# computing options
nCores = 4, seed = "NULL",
# experiment options
......@@ -96,7 +96,7 @@ terga1 <- function(sparsity = c(1:10),
clf$params$size_pop <- size_pop # how many models in the population to evolve
clf$params$size_world <- size_world # total number of variables
clf$params$max.nb.features <- max.nb.features
clf$params$nb_gen <- nb_gen # number of generation to evolve
clf$params$nb_generations <- nb_generations # number of generation to evolve
clf$params$unique_vars <- unique_vars # weather in a model we can have one variable more than once
clf$params$popSourceFile <- popSourceFile
clf$params$popSaveFile <- popSaveFile
......@@ -115,7 +115,7 @@ terga1 <- function(sparsity = c(1:10),
# CONVERGENCE
clf$params$convergence <- convergence # what should the simulation stop when convergence ?
clf$params$convergence_steps <- convergence_steps # after how many steps without improvement do we consider convergence?
clf$params$evolve.k1 <- evolve.k1 # weather to evolve models with k_1 or to search them exhaustively.
clf$params$evolve_k1 <- evolve_k1 # weather to evolve models with k_1 or to search them exhaustively.
# print out intermediary results
clf$params$plot <- plot # plot results?
clf$params$verbose <- verbose # print out logs.
......@@ -125,7 +125,7 @@ terga1 <- function(sparsity = c(1:10),
# Computing options
clf$params$nCores <- nCores # parallel computing
clf$params$parallel <- nCores > 1 # parallel computing
clf$params$parallelize.folds <- parallelize.folds
# Store the flag under the underscore name: the cross-validation code reads
# clf$params$parallelize_folds, and `$` cannot partial-match the old dotted
# key, so keeping "parallelize.folds" here would leave that lookup NULL.
clf$params$parallelize_folds <- parallelize_folds
clf$params$parallel.local <- FALSE
clf$params$seed <- seed
......@@ -260,7 +260,7 @@ terga1_fit <- function(X, y, clf) {
# test for
if (clf$params$current_sparsity == 1 & !clf$params$evolve.k1) # if we want to evolve features for k_sparse=1 we create a normal population
if (clf$params$current_sparsity == 1 & !clf$params$evolve_k1) # if we want to evolve features for k_sparse=1 we create a normal population
{
pop_last <- as.list(1:nrow(X)) # create population with k_sparse = 1
......
......@@ -511,11 +511,11 @@ evolve <- function(X, y, clf, pop, seed = NULL)
{
# store information on the evolution
evolved_pop <- pop
trace_evolution <- rep(NA, clf$params$nb_gen)
trace_evolution <- rep(NA, clf$params$nb_generations)
if(length(pop[[1]]) < nrow(X) - 1) # testing zize TODO verify why
{
for (i in 1:clf$params$nb_gen) # For all the generations
for (i in 1:clf$params$nb_generations) # For all the generations
{
# 1. Evaluate the current population
# transform to a population of model objects
......
This diff is collapsed.
This diff is collapsed.
# =====================================================================================
# create the databases
# =====================================================================================
# Builds the packaged example datasets. For every source .rda file the feature
# table is converted to a data.frame and stored, together with the class
# vector, in a list(X, y) that is saved as "<name>.rda" in the working
# directory.
#
# Everything runs inside local() and each source file is loaded into its own
# private environment, so running this script neither pollutes nor wipes the
# caller's workspace (the previous version called rm(list = ls()) after every
# dataset).
local({
  # Folder holding the source .rda files
  data_dir <- "/data/projects/predomics_testing/data/segata_2017/data.bugs"

  # One entry per dataset:
  #   rda : source file inside data_dir
  #   x   : name of the feature table object stored in that file
  #   y   : name of the class vector object stored in that file
  #   out : name of the saved object (and of the output file "<out>.rda")
  # The trailing dimensions are the dim(X) values noted in the original script.
  specs <- list(
    # (1) cirrhosis stage 1 -- 1045 181
    list(rda = "qinn_bug_stage1.rda",
         x = "data.qinn.bug.stage1.freq.species",
         y = "data.qinn.bug.y",
         out = "cir_train"),
    # (2) cirrhosis stage 2 -- 1045 56
    list(rda = "qinn_bug_stage2.rda",
         x = "data.qinn.bug.stage2.freq.species",
         y = "data.qinn.bug.y",
         out = "cir_test"),
    # (3) ibd -- 1045 396
    list(rda = "nielsen_bug.rda",
         x = "data.nielsen.bug.freq.species",
         y = "data.nielsen.bug.y",
         out = "ibd"),
    # (4) obesity -- 1045 292
    list(rda = "lechat_bug.rda",
         x = "data.lechat.bug.freq.species",
         y = "data.lechat.bug.y",
         out = "obesity"),
    # (5) t2d -- 1045 344
    list(rda = "qinj_bug.rda",
         x = "data.qinj.bug.freq.species",
         y = "data.qinj.bug.y",
         out = "t2d"),
    # (6) t2dw -- 1045 344
    list(rda = "karlsson_bug.rda",
         x = "data.karlsson.bug.freq.species",
         y = "data.karlsson.bug.y",
         out = "t2dw")
  )

  for (spec in specs) {
    # Load the source objects into a throw-away environment
    src <- new.env()
    load(file.path(data_dir, spec$rda), envir = src)

    dataset <- list()
    # inherits = FALSE: fail loudly if the object is missing from the file
    # instead of silently picking up a same-named object from elsewhere
    dataset$X <- as.data.frame(get(spec$x, envir = src, inherits = FALSE))
    dataset$y <- get(spec$y, envir = src, inherits = FALSE)
    message(spec$out, ": ", paste(dim(dataset$X), collapse = " "))

    # save() stores objects by name, so bind the dataset to its final name
    # in a dedicated environment before saving
    out <- new.env()
    assign(spec$out, dataset, envir = out)
    save(list = spec$out, envir = out, file = paste0(spec$out, ".rda"),
         compress = TRUE, compression_level = 9)
  }
})
# # =====================================================================================
# # create the databases
# # =====================================================================================
#
# # (1) cirrhosis stage 1
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/qinn_bug_stage1.rda")
# cir_train <- list()
# cir_train$X <- as.data.frame(data.qinn.bug.stage1.freq.species); dim(cir_train$X) # 1045 181
# cir_train$y <- data.qinn.bug.y
# save(cir_train, file="cir_train.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (2) cirrhosis stage 2
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/qinn_bug_stage2.rda")
# cir_test <- list()
# cir_test$X <- as.data.frame(data.qinn.bug.stage2.freq.species); dim(cir_test$X) # 1045 56
# cir_test$y <- data.qinn.bug.y
# save(cir_test, file="cir_test.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (3) ibd
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/nielsen_bug.rda")
# ibd <- list()
# ibd$X <- as.data.frame(data.nielsen.bug.freq.species); dim(ibd$X) # 1045 396
# ibd$y <- data.nielsen.bug.y
# save(ibd, file="ibd.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (4) obesity
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/lechat_bug.rda")
# obesity <- list()
# obesity$X <- as.data.frame(data.lechat.bug.freq.species); dim(obesity$X) # 1045 292
# obesity$y <- data.lechat.bug.y
# save(obesity, file="obesity.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (5) t2d
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/qinj_bug.rda")
# t2d <- list()
# t2d$X <- as.data.frame(data.qinj.bug.freq.species); dim(t2d$X) # 1045 344
# t2d$y <- data.qinj.bug.y
# save(t2d, file="t2d.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (6) t2dw
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/karlsson_bug.rda")
# t2dw <- list()
# t2dw$X <- as.data.frame(data.karlsson.bug.freq.species); dim(t2dw$X) # 1045 344
# t2dw$y <- data.karlsson.bug.y
# save(t2dw, file="t2dw.rda", compress = TRUE, compression_level = 9)
# rm(list=ls()); gc()
#
# # (7) cirrhosis stage 1 counts
# load("/data/projects/predomics_testing/data/segata_2017/data.bugs/qinn_bug_stage1.rda")
# cir_train_count <- list()
......
......@@ -48,57 +48,93 @@ A *predomics* model is coded in R as a S3 object, which contains a certain numbe
# Getting started
\
# Installing and loading `predomics`
\
`predomics` package is available on Integromics' GitLab and can be installed on your computer through the following instructions:
## Installing and loading `predomics`
The `predomics` package is available on Integromics' GitLab and can be installed on your computer through the following instructions. Installation requires the `devtools` package, which must be installed first if it is not already available. Finally, the package is loaded so that it can be used.
```{r, warning=FALSE, echo=T, message=F}
# Load "devtools" library (install it if needed)
ip <- installed.packages()
if(!("devtools" %in% rownames(ip)))
{
install.packages("devtools")
}
library(devtools)
if(!("predomics" %in% rownames(ip)))
{
#creds = git2r::cred_user_pass("eprifti", getPass::getPass())
creds = git2r::cred_ssh_key("/home/eprifti/.ssh/id_rsa.pub",
"/home/eprifti/.ssh/id_rsa")
# Install "predomics" from the gitlab
install_git(url = "https://git.integromics.fr/Predomics/predomics",
credentials = creds
)
rm(creds)
}
# # Load "devtools" library (install it if needed)
# ip <- installed.packages()
# library(devtools)
# if(!("predomics" %in% rownames(ip)))
# {
# #creds = git2r::cred_user_pass("eprifti", getPass::getPass())
# creds = git2r::cred_ssh_key("/home/eprifti/.ssh/id_rsa.pub",
# "/home/eprifti/.ssh/id_rsa")
#
# # Install "predomics" from the gitlab
# install_git(url = "https://git.integromics.fr/Predomics/predomics",
# credentials = creds
# )
# rm(creds)
# }
# rm(ip)
# load the library
library(predomics)
rm(ip)
```
## Setting the environment
In this example we will use data from the package
## Preparing the data
In this example we use one of the 6 datasets provided with the package. Each is a list containing a data matrix `X` (with features in the rows and observations in the columns) and a vector `y` with information on the classes to predict. The dataset we will use here is `cir_train`, which comes from the liver cirrhosis study (`?cir_train` for more information).
```{r}
data(package = "predomics")
In `predomics` we provide several functions that allow filtering the initial data in order to avoid learning from variables with little information. The most commonly used function is `filterNoSignal()`, which computes the first derivative of `median(sd)/x` and finds an automatic threshold. Other filtering procedures are also implemented, for instance `filterFeaturesByPrevalence()`, which selects variables based on their minimal prevalence, or `filterfeaturesK()`, which selects the `k` features most significantly associated with the class. The reader is invited to explore the documentation for more information.
```{r message=FALSE, warning=FALSE, paged.print=FALSE}
# data(package = "predomics")
# cir_test Cirrhosis stage 2 (frequencies)
# cir_train Cirrhosis stage 1 (frequencies)
# ibd Inflammatory Bowel Disease (frequencies) from the MetaHIT study
# obesity Obesity (frequencies) from the MetaHIT study
# t2d Type 2 diabetes (frequencies) BGI
# t2dw Type 2 diabetes (frequencies) Women Sweden
# load the data
data("cir_train")
data("cir_test")
str(cir_train, max.level = 1)
# List of 2
# $ X:'data.frame': 1045 obs. of 181 variables:
# $ y: num [1:181] 1 1 1 1 1 1 1 1 1 1 ...
# # take out the lines with zero signal
#
# X <- X[rowSums(X)!=0,]; dim(X) # 556 181
# X <- filterNoSignal(X = X, side = 1, threshold = "auto", verbose = TRUE); dim(X) # 462 181
#
# save(y, X, file="db.rda")
# Filter the non informative variables
X <- cir_train$X; y <- cir_train$y
X <- X[rowSums(X)!=0,]; dim(X) # 556 181
X <- filterNoSignal(X = X, side = 1, threshold = "auto", verbose = FALSE); dim(X) # 462 181
```
## Setting the learner context
Now that the dataset is ready, we need to prepare the learner context. This is the `classifier` object mentioned previously. Each learning algorithm we have proposed here, `?terga1`, `?terga2`, `?terBeam`, `?terda` and `?metal`, has its own parameters, which can be explored through its help page (e.g. `?terga2`). In this example we will use the default parameters and set only `nCores = 1`. If `nCores > 1` the execution will run in parallel. We have optimized the parallel execution to take full advantage of the computational resources; typically each cross-validation fold runs on one CPU. A deeper parallelization would waste resources in the serialization process. Finally, we set `seed = 1` (if multiple seeds are provided, the algorithm will run multiple times). When `plot = TRUE`, graphics of the evolution process are provided in PDF format.
```{r}
# Terga2
# Build the terga2 learner context; only the three arguments below are
# overridden, every other parameter keeps its default.
clf <- terga2(nCores = 1, # if > 1, it will run in parallel.
seed = 1,
plot = TRUE
)
printy(clf) # print the object for more information
isClf(clf) # test whether the object is a classifier
class(clf) # the class of the classifier object
```
## Running the learner and exploring the results
After setting the classifier, we are now able to run the learning experiment. For this we simply run `fit()` along with `X`, `y` and `clf`.
```{r}
# Run the learner on X / y with the classifier context `clf`,
# requesting cross-validation with 10 folds
res_clf <- fit(X = X, y = y, clf = clf, cross.validate = TRUE, nfolds = 10)
# digest the results
res.dig <- digest(obj = res_clf, plot = TRUE, penalty = 0.75/100)
# print the models stored under best$models of the digested results
printPopulation(res.dig$best$models)
# Build 10 folds from y, then evaluate one model on the data left after
# dropping the columns of X (and the matching entries of y) indexed by the
# second fold.
# NOTE(review): `mod` is not defined anywhere in the visible vignette code;
# confirm which model it should refer to before knitting.
lfolds <- create.folds(y, k = 10)
populationToDataFrame(list(evaluateModel(mod, X[,-lfolds[[2]]], y[-lfolds[[2]]], clf = clf, eval.all = TRUE, force.re.evaluation = TRUE, mode = "test")))
```
## Datasets
In this experiment we will focus on the capacity to predict from different taxonomic levels characterizing the gut microbiome. Here we will use the IBD database. The taxonomic levels that we use are the following: **Species, Genus, Family, Order, Class, Phylum**, listed in decreasing order of specificity in the taxonomic tree. We also use a mixed level that combines the different taxonomic levels (named **All**).
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment