Commit 26dfdd13 authored by Edi Prifti's avatar Edi Prifti

- unify the getBestModel* methods into one (getNBestModels; see the sketch below)

- don't evaluate ter models without two sides
- increase robustness of the code
- adding more is* object-check methods
- updating version
- adding printy and object-specific print* methods
- cleaning unused code
- adding a modular testing markdown
parent 9cc43acf
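For orientation, here is a rough sketch of the unified selection call that replaces getBestModels, getTheBestModel and getNBestIndividuals. The argument names are copied from the calls visible in the diffs below; the inline comments are an interpretation of how each switch is used there, not an authoritative specification of defaults or semantics.

```r
library(predomics)

# Sketch only: `res` stands for a previous predomics result (an experiment or a
# model collection), as in the calls shown in the diffs and the testing markdown below.
best <- getNBestModels(obj = res,
                       significance = TRUE,        # keep only significant models
                       by.k.sparsity = TRUE,       # select separately for each k-sparsity level
                       k.penalty = 0,              # optional penalty on model size
                       n.best = 5,                 # number of models to keep per k
                       single.best = FALSE,        # TRUE: return one overall best model
                       single.best.cv = FALSE,     # pick that single best from cross-validation results
                       single.best.k = NULL,       # or force the best model of a given k
                       max.min.prevalence = FALSE, # prevalence-based selection (requires X)
                       X = NULL,                   # data matrix, needed when max.min.prevalence = TRUE
                       verbose = FALSE,
                       evalToOrder = "fit_",       # score used to rank models (e.g. "fit_", "unpenalized_fit_")
                       return.population = TRUE)   # TRUE: population, FALSE: model collection
```

The metal_fit and myRRSkbest diffs, as well as the modular testing markdown at the end of this commit, exercise the main combinations of these switches.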
Package: predomics
Type: Package
Title: Ternary Prediction in Large Omics Datasets
Version: 0.9.4
Date: 2018-02-21
Version: 0.9.7
Date: 2018-05-29
Author: Edi Prifti, Lucas Robin, Shasha Cui, Blaise Hanczar, Yann Chevaleyre, Jean-Daniel Zucker
Maintainer: Edi Prifti <edi.prifti@gmail.com>
Depends:
......@@ -26,12 +26,12 @@ Imports:
viridis,
kernlab,
randomForest
Description: The objective of this work is to propose new approaches that
are efficient and adopted for prediction and regression in metagenomics and
other large datasets. We use a ternary approach where it makes sense to sum or
divide features such as for instance abundances of species in an ecosystem. We
propose three different and complementary algorithms that try to solve the above
mentioned problems.
Description: The main objective of this work is to propose a framework for exploring and
finding sparse and interpretable models in very high-dimensional settings.
These models are efficient and adapted for classification and regression in metagenomics and
other large omics datasets. We propose several languages that describe links between variables
in different ways. This framework also implements different and complementary heuristics
that allow us to identify accurate models.
License: GPL (>= 2)
LazyData: TRUE
RoxygenNote: 6.0.1
......@@ -30,28 +30,31 @@ export(filterNoSignal)
export(filterfeaturesK)
export(fit)
export(generator_metal)
export(getBestModels)
export(getFeaturePrevalence)
export(getGraph)
export(getMaxMinPrevalenceModel)
export(getModelScore)
export(getNBestIndividuals)
export(getNBestModels)
export(getSign)
export(getTheBestModel)
export(individual)
export(isClf)
export(isExperiment)
export(isLearnerSota)
export(isModel)
export(isModelBTR)
export(isModelCollection)
export(isModelSota)
export(isModelSotaGLMNET)
export(isModelSotaRF)
export(isModelSotaSVM)
export(isModelTerda)
export(isPopulation)
export(listOfDenseVecToListOfModels)
export(listOfModels2ModelCollection)
export(listOfModelsToDenseCoefMatrix)
export(listOfModelsToListOfDenseVec)
export(listOfModelsToListOfSparseVec)
export(listOfSparseVecToListOfModels)
export(loadPopulation)
export(loadResults)
export(makeFeatureAnnot)
......@@ -87,9 +90,12 @@ export(population)
export(populationGet_X)
export(populationSet_X)
export(populationToDataFrame)
export(printClassifier)
export(printExperiment)
export(printModel)
export(printPop)
export(printModelCollection)
export(printPopulation)
export(printy)
export(quartileDispersion)
export(runClassifier)
export(runCrossval)
......@@ -97,7 +103,6 @@ export(savePopulation)
export(saveResults)
export(scoreRatio)
export(selectBestPopulation)
export(shuffle_y)
export(sim_inter)
export(sim_intra)
export(sortPopulation)
......
......@@ -157,7 +157,7 @@ metal_fit <- function(X, y, clf)
if(clf$params$verbose) print("... ... sanity check of the list of clf")
if(clf$params$verbose) printExperiment(clf)
if(clf$params$verbose) printClassifier(obj = clf)
# parallel switch
#parfold <- FALSE
# if(!clf$params$parallelize.folds & clf$params$parallel)
......@@ -226,7 +226,23 @@ metal_fit <- function(X, y, clf)
}
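# cap NBest at the number of models actually available in the collection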
NBest <- min(floor(size_pop / (n-2) ), length(modelCollectionToPopulation(res)))
bestIndividuals <- getNBestIndividuals(res, g.clf, NBest, equal.sparsity = TRUE)
#bestIndividuals <- getNBestIndividuals(res, g.clf, NBest, equal.sparsity = TRUE)
bestIndividuals <- getNBestModels(obj = res,
significance = FALSE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = NBest,
single.best = FALSE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = TRUE,
unique.control = TRUE
)
# when the generators did not generate anything
if(!is.null(bestIndividuals))
......
......@@ -308,7 +308,10 @@ fit <- function(X,
pop <- modelCollectionToPopulation(res.clf$classifier$models)
if(!isPopulation(pop))
{
stop("fit: adding feature importance, there is a problem with the population")
warning("fit: the population seems to be empty. No models were found")
res.clf$classifier$models <- listOfModels2ModelCollection(pop)
if(clf$params$verbose) print("... Thank you for using Predomics")
return(res.clf)
}
# for each model add a vector with feature importance
......@@ -491,8 +494,11 @@ runClassifier <- function(X, y, clf, x_test = NULL, y_test = NULL)
clf$fip <- efip.fold
}
# update the final indexes as the input X
clf$models <- updateObjectIndex(obj = clf$models, features = clf$data$features)
if(isModelCollection(clf$models))
{
# update the final indexes as the input X
clf$models <- updateObjectIndex(obj = clf$models, features = clf$data$features)
}
clf$execTime <- as.numeric(Sys.time() - startingTime, units = "mins")
return(clf)
......@@ -567,8 +573,8 @@ runCrossval <- function(X, y, clf, lfolds = NULL, nfolds = 10, return.all = FALS
res.crossval$nfold <- list()
res.crossval$k <- list()
res.crossval$scores <- list()
res.crossval$scores$empirical.auc <- as.data.frame(matrix(nrow=length(clf$params$sparsity), ncol=(nfolds-1)))
rownames(res.crossval$scores$empirical.auc) <- c(paste("k",clf$params$sparsity,sep="_"))
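# allocate one row per sparsity level k from 1 to max(sparsity), rather than one row per explored sparsity value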
res.crossval$scores$empirical.auc <- as.data.frame(matrix(nrow=max(clf$params$sparsity), ncol=(nfolds-1)))
rownames(res.crossval$scores$empirical.auc) <- c(paste("k",c(1:max(clf$params$sparsity)), sep="_"))
colnames(res.crossval$scores$empirical.auc) <- paste("fold",1:(nfolds-1),sep="_") # -1 since the whole-dataset fold is not taken into account
# add others using the same model
......@@ -753,11 +759,11 @@ runCrossval <- function(X, y, clf, lfolds = NULL, nfolds = 10, return.all = FALS
}else
{
# digest
res_train.digest <- digestModelCollection(X = x_train, obj = res_train, clf = clf)
res_train.digest <- digestModelCollection(obj = res_train, X = x_train, clf = clf)
}
}else # is a crossval result
{
res_train.digest <- digestModelCollection(X = x_train, obj = res_train$models, clf = clf)
res_train.digest <- digestModelCollection(obj = res_train$models, X = x_train, clf = clf)
} # end if/else models exist
}else # return nothing
{
......
......@@ -235,7 +235,7 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
res.all <- foreach (i = 1:nfolds) %dorng%
{
#printExperiment(clf)
#printClassifier(obj = clf)
# if (clf$params$verbose) {cat("===> k-fold\t",i,"\n")}
# prepare the datasets
......@@ -288,8 +288,8 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
res_train <- res.all[[i]]
# res_train <- runClassifier(X = x_train, y = y_train, clf = clf)
res_train.digest <- digestModelCollection(obj = res_train, clf, X=x_train, mmprev=FALSE)
res_train.digest_2 <- digestModelCollection(obj = res_train, clf, X=x_train, mmprev=TRUE)
res_train.digest <- digestModelCollection(obj = res_train, X = x_train, clf, mmprev=FALSE)
res_train.digest_2 <- digestModelCollection(obj = res_train, X = x_train, clf, mmprev=TRUE)
if(!is.null(res_train.digest))
{
# for all the best models of each k-sparse (create empty matrix) for auc
......
......@@ -216,7 +216,7 @@ terBeam_fit <- function(X, y, clf)
if(clf$params$verbose) print(paste("... ... parameters are checked and set"))
# Print the experiment configuration
if(clf$params$verbose) printExperiment(clf)
if(clf$params$verbose) printClassifier(obj = clf)
# Rprof("Profiling_terbeam", line.profiling = TRUE)
# Rprof(NULL)
......@@ -235,7 +235,7 @@ terBeam_fit <- function(X, y, clf)
if(!is.null(clf$feature.cor))
{
# for the ratio, ter and terinter languages, force the same number of negative and positive features to be selected (these models need features on both sides)
if(clf$params$language == "ratio" & clf$params$objective == "auc")
if((clf$params$language == "ratio" | clf$params$language == "ter" | clf$params$language == "terinter") & clf$params$objective == "auc")
{
# select the best features here no need to compute all
nb.selected.features <- min(nrow(clf$feature.cor),clf$params$maxNbOfModels)
......@@ -278,7 +278,7 @@ terBeam_fit <- function(X, y, clf)
# for the ratio, ter and terinter languages, force the same number of negative and positive features to be selected (these models need features on both sides)
if(clf$params$language == "ratio" & clf$params$objective == "auc")
if((clf$params$language == "ratio" | clf$params$language == "ter" | clf$params$language == "terinter") & clf$params$objective == "auc")
{
# select the best features here no need to compute all
nb.selected.features <- max(which(nbCombinaisons < clf$params$maxNbOfModels))
......
......@@ -227,7 +227,7 @@ terda_fit <- function(X, y, clf)
)
# Print the experiment configuration
if(clf$params$verbose) printExperiment(clf)
if(clf$params$verbose) printClassifier(obj = clf)
# if(clf$params$max.nb.features < nrow(X))
# {
......
......@@ -320,7 +320,8 @@ glmnetRR <- function(clf, X, y)
#Error in if ((!remove.zero.vec) | sum(abs(wrr)) > 0) { :
#  missing value where TRUE/FALSE needed
myRRSkbest <- function(clf, X, y) {
myRRSkbest <- function(clf, X, y)
{
p <- clf$params
check.X_y_w(X,y) #sanity check
tX = as.data.frame(t(X))
......@@ -361,7 +362,9 @@ myRRSkbest <- function(clf, X, y) {
# print(paste("solver did not find a model in iteration =",i))
# }
res <- multipleRR(clf = clf, X = X, y = y, w = wb, n = p$nRR) # calls the rounding nRR times
# TODO (shouldn't we get the bestmodels for all the modelCollection instead ?) When we explore a range of k_sparse the best of the collection will tend to be a high number. By selecting the best per k_sparse we get a larger distribution (more models)
# TODO (shouldn't we get the bestmodels for all the modelCollection instead ?)
# When we explore a range of k_sparse the best of the collection will tend to be a high number.
# By selecting the best per k_sparse we get a larger distribution (more models)
#UNIQUE
# mod <- getTheBestModel(res)
# #getBestModels()
......@@ -369,7 +372,21 @@ myRRSkbest <- function(clf, X, y) {
# best_dense_vecs[[length(best_dense_vecs)+1]] <- ter_wb
# ALL the best per each k_sparse /// Update Edi
best.models <- getBestModels(res) # all the best models for each k_sparse
#best.models <- getBestModels(res) # all the best models for each k_sparse
best.models <- getNBestModels(obj = res,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 1,
single.best = FALSE,
single.best.cv = FALSE,
single.best.k = NULL,
max.min.prevalence = TRUE,
X = NULL,
verbose = FALSE,
evalToOrder = "unpenalized_fit_",
return.population = TRUE # MC
)
ter_wbs <- listOfModelsToListOfDenseVec(clf = clf, X = X, y = y, list.models = best.models)
the.best.model <- getTheBestModel(res) # the best model
......
......@@ -248,7 +248,7 @@ terga1_fit <- function(X, y, clf) {
clf$params$size_world <- nrow(X)
# Print the experiment configuration
if(clf$params$verbose) printExperiment(clf)
if(clf$params$verbose) printClassifier(obj = clf)
res <- list() # the final object containing the evolved models
for(i in clf$params$sparsity) # sparsity is = k, i.e. the number of features in a model
......
......@@ -47,7 +47,8 @@ selectMixed <- function(evaluation, percentage_elite, percentage_tournoi)
{
perc <- ceiling((percentage_elite + percentage_tournoi) * length(evaluation)/100)
perc_elite <- ceiling(percentage_elite * length(evaluation)/100)
perc_tournoi <- floor(percentage_tournoi * length(evaluation)/100)
#perc_tournoi <- floor(percentage_tournoi * length(evaluation)/100)
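# take the tournament share as the remainder, so that perc_elite + perc_tournoi always equals perc (separate rounding with floor() could break this)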
perc_tournoi <- perc - perc_elite
res <- rep(NA, perc)
# Elite
......
......@@ -299,7 +299,7 @@ terga2_fit <- function(X, y, clf)
clf$params$size_world <- nrow(X)
# Print the experiment configuration
if(clf$params$verbose) printExperiment(clf)
if(clf$params$verbose) printClassifier(obj = clf)
featEval <-rep(NA, length(rownames(X)))
names(featEval) <- rownames(X)
......
---
title: "modular_testing"
author: "Edi Prifti"
date: "5/1/2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
In this document, we test different functions to make sure they work properly.
```{r load package}
library(predomics)
print(paste("The current version of the package is",packageVersion("predomics")))
```
## Load a dataset and experiment
For these tests, we will be using the cirrhosis dataset at the bug_species level.
```{r load_data, echo=FALSE}
load("/data/projects/predomics_testing/analyses/2.db_segata/2.db_cirrhose_stage1/bug_species/db.rda")
load("/data/projects/predomics_testing/analyses/2.db_segata/2.db_cirrhose_stage1/bug_species/results/results.metal.all._spar_1_to_30.rda")
```
## getNBestModels
In the current version, this function is generic and replaces several other functions that each performed part of this task.
```{r getNBestModels}
#?getNBestModels
print("GET THE BEST 5")
# enter an experiment
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = FALSE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # MC
)
# return an MC
printy(res)
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = FALSE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = TRUE # population
)
# return a population
printy(res)
print("GET THE BEST FOR K")
# Control for significance
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 1,
single.best = FALSE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # MC
)
# return best per k
printy(res)
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 1,
single.best = FALSE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = TRUE # population
)
# return a population
printy(res)
print("GET THE POP MINMAXPREV")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = FALSE,
single.best.cv = FALSE,
single.best.k = NULL,
max.min.prevalence = TRUE, # use the max.min.prevalence selection
X = X, # this is needed when this parameter is active
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
# Get the best CV
print("GET THE BEST CV")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = TRUE,
single.best.cv = TRUE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # MC
)
# return best per k
printy(res)
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = FALSE,
single.best.cv = FALSE,
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
print("GET THE BEST CV")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = TRUE, # give best
single.best.cv = TRUE, # based on CV
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
print("GET THE BEST CV penalty")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0.75/100,
n.best = 5,
single.best = TRUE, # give best
single.best.cv = TRUE, # based on CV
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
print("GET THE BEST no CV")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = TRUE, # give best
single.best.cv = FALSE, # not CV
single.best.k = NULL,
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
print("GET THE BEST K")
res <- getNBestModels(obj = res.metal.all.,
significance = TRUE,
by.k.sparsity = TRUE,
k.penalty = 0,
n.best = 5,
single.best = TRUE,
single.best.cv = FALSE,
single.best.k = 3, # decide the best k to return
max.min.prevalence = FALSE,
X = NULL,
verbose = FALSE,
evalToOrder = "fit_",
return.population = FALSE # population
)
# return a population
printy(res)
```
## digest
In the current version, this function extracts from an experiment object the key information needed by many other functions.
```{r digest}
dig <- digest(obj = res.metal.all.,
penalty = 0.5/100,
best.cv = TRUE,
best.k = NULL,
plot = FALSE)
printy(dig$best$model)
```
## digestModelCollection
Testing digestModelCollection
```{r digestModelCollection}
dig <- digestModelCollection(obj = res.metal.all.$classifier$models, X = NULL, clf = clf.metal.all., k.penalty = 0.75/100, mmprev = TRUE)
printy(dig$best$models)
printy(dig$best$model)
dig <- digestModelCollection(obj = res.metal.all.$classifier$models, X = NULL, clf = clf.metal.all., k.penalty = 0.75/100, mmprev = FALSE)
printy(dig$best$models)
printy(dig$best$model)
```