Commit 9d42a5cb authored by Edi Prifti

- updating the package version

- improving regression analyses by adding more attributes such as R2 and SER (standard error of the regression); the default evalToFit is now rsq_ (an illustrative sketch of these measures follows this list)
- fixing various bugs related to regression analyses, as the code had substantially evolved since it was last tested
- adding the evalToFit attribute to models
- improving overall code quality and robustness
- controlling color and shape in comparative plots
- setting the same objective in metal generators and unificators
- adding the family parameter for glmnet in terda for regression mode
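As an illustration of the new regression attributes mentioned above, the sketch below shows how R2 and SER can be computed from observed and predicted values. The function name and the ser_ label are hypothetical; only rsq_ is named in this commit as the default evalToFit.

# Minimal sketch (not the package code): R2 and SER for a regression model,
# given observed y, predictions y_hat and k fitted coefficients.
evaluateRegression <- function(y, y_hat, k)
{
  rss <- sum((y - y_hat)^2)               # residual sum of squares
  tss <- sum((y - mean(y))^2)             # total sum of squares
  rsq <- 1 - rss / tss                    # R2, corresponding to the default evalToFit "rsq_"
  ser <- sqrt(rss / (length(y) - k - 1))  # standard error of the regression
  list(rsq_ = rsq, ser_ = ser)            # "ser_" is a hypothetical attribute name
}
# example call: evaluateRegression(y = c(1, 2, 3, 4), y_hat = c(1.1, 1.9, 3.2, 3.8), k = 1)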
parent 26dfdd13
......@@ -8,6 +8,5 @@ test.txt
Profiling_*
*.yml
testsdb3.RData
modified_comparison_results_visu_allMethods_1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.18.19.20.21.22.23.24.25.26.27.28.29.30.pdf
man/
.Rhistory
Package: predomics
Type: Package
Title: Ternary Prediction in Large Omics Datasets
Version: 0.9.7
Date: 2018-05-29
Version: 0.9.8
Date: 2018-07-07
Author: Edi Prifti, Lucas Robin, Shasha Cui, Blaise Hanczar, Yann Chevaleyre, Jean-Daniel Zucker
Maintainer: Edi Prifti <edi.prifti@gmail.com>
Depends:
......@@ -26,12 +26,11 @@ Imports:
viridis,
kernlab,
randomForest
Description: The main objective of this work is to propose a framework for exploring and
    finding sparse and interpretable models in very large dimensionality.
    These models are efficient and adapted for classification and regression in metagenomics and
    other large omics datasets. We propose several languages that describe links between variables
    in different ways. In this framework are also implemented different and complementary heuristics
    that allow us to identify accurate models.
Description: This package offers access to a novel framework implementing several heuristics that allow
    finding sparse and interpretable models in very high-dimensional data. These models are efficient and
    adapted for classification and regression in metagenomics and other large omics datasets. We introduce
    the BTR (BIN, TER, RATIO) languages, which describe links between variables in different ways. The
    framework also implements several state-of-the-art (SOTA) methods such as RF, ENET and SVM.
License: GPL (>= 2)
LazyData: TRUE
RoxygenNote: 6.0.1
......@@ -78,7 +78,6 @@ export(plotAbundanceByCalss)
export(plotComparativeBestCV)
export(plotComparativeCV)
export(plotComparativeEmpiricalScore)
export(plotComparativeFigure)
export(plotComparativeResults)
export(plotComparativeResultsBest)
export(plotFeatureModelCoeffs)
......
......@@ -195,6 +195,7 @@ metal_fit <- function(X, y, clf)
g.clf$params$parallelize.folds <- FALSE
# initiate the current seed
g.clf$params$current_seed <- clf$params$current_seed
g.clf$params$objective <- clf$params$objective
# fix the max number of features
g.clf$params$max.nb.features <- clf$params$max.nb.features
......@@ -270,6 +271,7 @@ metal_fit <- function(X, y, clf)
u.clf$params$size_pop <- length(in_pop)
u.clf$params$in_pop <- in_pop
u.clf$params$cluster <- clf$params$cluster
u.clf$params$objective <- clf$params$objective
# fix the max number of features
u.clf$params$max.nb.features <- clf$params$max.nb.features
......
......@@ -67,7 +67,7 @@ myAssertNotNullNorNa <- function(obj, message="", stop=FALSE)
#' Tests whether two values are close
#'
#' @description Asserts the existence of an object and prints a message or stops the block
#' @description Asserts whether two vectors of the same length are close in value within a given threshold
#' @param x: first value or vector to compare
#' @param y: second value or vector to compare
#' @return TRUE when the distance between the two values is smaller than a given threshold
......@@ -76,5 +76,12 @@ isclose <- function(x, y, e=1e-10)
if(length(x)!=length(y)){
stop("x and y should have the same length")
}
return(abs(x-y) < e)
if(length(x)>1)
{
return(all(abs(x-y) < e))
}else
{
return(abs(x-y) < e)
}
}
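A brief usage sketch of the updated isclose(), illustrating both the scalar branch and the new vectorized branch (values chosen arbitrarily):

# scalar comparison: TRUE, since |0.1 + 0.2 - 0.3| is below the default tolerance of 1e-10
isclose(0.1 + 0.2, 0.3)
# vector comparison: a single TRUE/FALSE, because all() collapses the element-wise test
isclose(c(1, 2, 3), c(1, 2, 3) + 1e-12)
# vectors of different lengths stop with an error
# isclose(c(1, 2), c(1, 2, 3))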
......@@ -76,10 +76,21 @@ glmnetRR <- function(clf, X, y)
# nblambdas = 400
# lambdas = exp(-c(0:nblambdas)/(28*sqrt(nblambdas/150)))
if(clf$params$verbose) print("... ... running glmnet")
# set up the family parameter
# https://www.r-bloggers.com/how-and-when-ridge-regression-with-glmnet/
if(clf$params$objective == "cor")
{
family <- "gaussian"
}else
{
family <- "binomial"
}
system.time(glmmod <- glmnet(x = t(X),
y = y,
alpha = p$alpha,
family = "binomial",
family = family,
lower.limits = lower.limits,
upper.limits = upper.limits,
intercept = intercept,
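For context on the family switch above, here is a standalone sketch (independent of the package internals, using simulated data) of how glmnet is typically called with a gaussian family for regression and a binomial family for classification:

library(glmnet)
set.seed(1)
x <- matrix(rnorm(100 * 20), nrow = 100, ncol = 20)  # samples in rows, features in columns
# regression mode (objective "cor" above): continuous response, gaussian family
y_reg <- x[, 1] - 2 * x[, 2] + rnorm(100)
fit_reg <- glmnet(x = x, y = y_reg, family = "gaussian", alpha = 0.5)
# classification mode: binary response, binomial family
y_bin <- factor(ifelse(x[, 1] + rnorm(100) > 0, "pos", "neg"))
fit_bin <- glmnet(x = x, y = y_bin, family = "binomial", alpha = 0.5)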
......@@ -154,56 +165,59 @@ glmnetRR <- function(clf, X, y)
# Edi added this to solve a bug when SparsityInRange is NA
#if(is.na(SparsityInRange)) SparsityInRange <- FALSE
if(SparsityInRange || p$nRR==0) # if in the sparsity range or no roundings
if(myAssertNotNullNorNa(SparsityInRange))
{
if(clf$params$verbose & !clf$params$debug)
if(SparsityInRange || p$nRR==0) # if in the sparsity range or no roundings
{
cat(paste(".")) # progress
}
if(clf$params$debug) # give more information when debugging
{
cat(paste("glmnetRR...\t","j:",j,"\tlambda:",signif(lambda,3),"\n"))
}
glmmod.coefs <- as.matrix(coef(glmmod, s = lambda))
glmmod.coefs <- glmmod.coefs[2:(nrow(glmmod.coefs)),]
if(!all(glmmod.coefs==0)) # if the model is not empty
{
# Randomized Rounding
if(p$nRR >= 1)
if(clf$params$verbose & !clf$params$debug)
{
set.seed(clf$params$seed) # different seeds give different results, which is what we want here
res <- multipleRR(clf = clf, X = X, y = y, w = glmmod.coefs, n = p$nRR) # use the non-parallel version because parallelization happens at a more global level
}else
cat(paste(".")) # progress
}
if(clf$params$debug) # give more information when debugging
{
res <- list(glmmod.coefs)
cat(paste("glmnetRR...\t","j:",j,"\tlambda:",signif(lambda,3),"\n"))
}
if(clf$params$language == "logreg")
glmmod.coefs <- as.matrix(coef(glmmod, s = lambda))
glmmod.coefs <- glmmod.coefs[2:(nrow(glmmod.coefs)),]
if(!all(glmmod.coefs==0)) # if the model is not empty
{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf, obj=glmmod)
if(!is.null(mod))
# Randomized Rounding
if(p$nRR >= 1)
{
mod$lambda <- lambda
set.seed(clf$params$seed) # different seeds give different results, which is what we want here
res <- multipleRR(clf = clf, X = X, y = y, w = glmmod.coefs, n = p$nRR) # use the non-parallel version because parallelization happens at a more global level
}else
{
res <- list(glmmod.coefs)
}
if(clf$params$language == "logreg")
{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf, obj=glmmod)
if(!is.null(mod))
{
mod$lambda <- lambda
}
pop <- c(pop, list(mod)) # add it to the pop
}else{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf)
pop <- c(pop, list(mod)) # add it to the pop
}
pop <- c(pop, list(mod)) # add it to the pop
}else{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf)
pop <- c(pop, list(mod)) # add it to the pop
lamdas.zone[j] <- TRUE # to plot the zone when it is zoomed
}
lamdas.zone[j] <- TRUE # to plot the zone when it is zoomed
}
}else
{
if(clf$params$debug)
}else
{
print(paste(j, "This model has no coeffs. lambda =",signif(lambda,4)))
# models with no coefficients are excluded
if(clf$params$debug)
{
print(paste(j, "This model has no coeffs. lambda =",signif(lambda,4)))
# models with no coefficients are excluded
}
}
}
} # checking for sanity
} # end loop for all coeffs
# clean the population (NULL) models
......
......@@ -219,7 +219,7 @@ terga2_fit <- function(X, y, clf)
ter=
{
# ternary language without intercept (maximize the accuracy)
if(clf$params$verbose){print("Setting environment for the language 'ter'")}
if(clf$params$verbose){cat("... ... Setting environment for the language 'ter'\n")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
......@@ -237,7 +237,7 @@ terga2_fit <- function(X, y, clf)
terinter=
{
# ternary language with intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'terinter'")}
if(clf$params$verbose){cat("... ... Setting environment for the language 'terinter'\n")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
......@@ -246,7 +246,7 @@ terga2_fit <- function(X, y, clf)
bin=
{
# binary language without intercept (maximize the accuracy)
if(clf$params$verbose){print("Setting environment for the language 'bin'")}
if(clf$params$verbose){cat("... ... Setting environment for the language 'bin'\n")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
......@@ -264,7 +264,7 @@ terga2_fit <- function(X, y, clf)
bininter=
{
# binary language with intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'bininter'")}
if(clf$params$verbose){cat("... ... Setting environment for the language 'bininter'\n")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
......@@ -273,7 +273,7 @@ terga2_fit <- function(X, y, clf)
ratio=
{
# ratio language (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'ratio'")}
if(clf$params$verbose){cat("... ... Setting environment for the language 'ratio'\n")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
......@@ -317,12 +317,12 @@ terga2_fit <- function(X, y, clf)
{
# starting population
pop <- clf$params$in_pop
if(clf$params$verbose) print(paste("... ... using seeding population",length(pop)))
if(clf$params$verbose) cat(paste("... ... using seeding population",length(pop),"\n"))
}else
{
# otherwise start fresh
pop <- population2(X, y, clf, featEval) # We generate randomly a first population
if(clf$params$verbose) print(paste("... ... using generated population",length(pop)))
if(clf$params$verbose) cat(paste("... ... using generated population",length(pop),"\n"))
}
......
......@@ -215,12 +215,18 @@ individual_vec_v1 <- function(clf, signs = NULL) # change to dense
# Another one; these functions are here mainly as examples
individual_vec_v2 <- function(clf, signs = NULL)
{
if(clf$params$size_world == "NULL")
{
warning("individual_vec_v2: the size_world parameter is expected but is not set")
return(NULL)
}
if(clf$params$sparsity.min != clf$params$sparsity.max)
{
notNull.number <- sample(clf$params$sparsity.min:clf$params$sparsity.max, 1)
} else
{
notNull.number <- clf$params$sparsity.mean
notNull.number <- clf$params$sparsity.mean
}
notNull.ind <- sample(1:clf$params$size_world, notNull.number)
res <- rep(0, clf$params$size_world)
......@@ -616,32 +622,35 @@ evolve2m <- function(X, y, clf, pop, featEval, generation)
child <- list()
mutatedIndiv <- list()
# Apply the operations on pop[[i]]
if(pop[[i]]$selected && (pop[[i]]$mate > 0))
if(myAssertNotNullNorNa(pop[[i]]$selected) & myAssertNotNullNorNa(pop[[i]]$mate) & myAssertNotNullNorNa(pop[[i]]$toBeMutated))
{
child_1 <- clf$functions$crossingIndividual(X, y, clf, pop[[i]], pop[[pop[[i]]$mate]])
# if parents have the same language
if(pop[[i]]$language == pop[[pop[[i]]$mate]]$language)
# Apply the operations on pop[[i]]
if(pop[[i]]$selected && (pop[[i]]$mate > 0))
{
# make only one child
child[[length(child) +1]]<-evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}else
child_1 <- clf$functions$crossingIndividual(X, y, clf, pop[[i]], pop[[pop[[i]]$mate]])
# if parents have the same language
if(pop[[i]]$language == pop[[pop[[i]]$mate]]$language)
{
# make only one child
child[[length(child) +1]]<-evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}else
{
# otherwise make two identical children, each with one of the parents' languages
child_2 <-child_1
child_1$language <- pop[[i]]$language
child_2$language <- pop[[pop[[i]]$mate]]$language
child[[length(child) +1]]<-evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
child[[length(child) +1]]<-evaluateModel(mod = child_2, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
}
if(pop[[i]]$toBeMutated)
{
# otherwise make two identical children, each with one of the parents' languages
child_2 <-child_1
child_1$language <- pop[[i]]$language
child_2$language <- pop[[pop[[i]]$mate]]$language
child[[length(child) +1]]<-evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
child[[length(child) +1]]<-evaluateModel(mod = child_2, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
mutatedIndiv <- clf$functions$mutator(X, y, clf, pop, pop[[i]], 1:clf$params$size_world, featEval)
mutatedIndiv <- evaluateModel(mod = mutatedIndiv, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
}
if(pop[[i]]$toBeMutated)
{
mutatedIndiv <- clf$functions$mutator(X, y, clf, pop, pop[[i]], 1:clf$params$size_world, featEval)
mutatedIndiv <- evaluateModel(mod = mutatedIndiv, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
} # end sanity-check if
return(list(child = child, mutatedIndiv = mutatedIndiv))
}
......@@ -676,51 +685,57 @@ evolve2m <- function(X, y, clf, pop, featEval, generation)
{
newPop <- list()
# for each of the size_pop best individuals (smaller than the real population size)
#for(i in 1:clf$params$size_pop)
for(i in 1:length(pop))
{
if(pop[[i]]$selected && (pop[[i]]$mate > 0))
if(myAssertNotNullNorNa(pop[[i]]$selected) & myAssertNotNullNorNa(pop[[i]]$mate) & myAssertNotNullNorNa(pop[[i]]$toBeMutated))
{
# create the first child
child_1 <- clf$functions$crossingIndividual(X, y, clf, pop[[i]], pop[[pop[[i]]$mate]])
# if the parents' languages are the same we keep only one child
if(pop[[i]]$language == pop[[pop[[i]]$mate]]$language)
if(pop[[i]]$selected && (pop[[i]]$mate > 0))
{
child <- evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
newPop[[length(newPop) +1]] <- child
# create the first child
child_1 <- clf$functions$crossingIndividual(X, y, clf, pop[[i]], pop[[pop[[i]]$mate]])
}else # if they have different languages (we create two identical children, each with one language)
# if the parents' languages are the same we keep only one child
if(pop[[i]]$language == pop[[pop[[i]]$mate]]$language)
{
child <- evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
newPop[[length(newPop) +1]] <- child
}else # if they have different languages (we create two identical children, each with one language)
{
child_2 <- child_1
child_1$language <- pop[[i]]$language
child_2$language <- pop[[pop[[i]]$mate]]$language
newPop[[length(newPop) +1]] <- evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
newPop[[length(newPop) +1]] <- evaluateModel(mod = child_2, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
}
if(pop[[i]]$toBeMutated)
{
child_2 <- child_1
child_1$language <- pop[[i]]$language
child_2$language <- pop[[pop[[i]]$mate]]$language
newPop[[length(newPop) +1]] <- evaluateModel(mod = child_1, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
newPop[[length(newPop) +1]] <- evaluateModel(mod = child_2, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
mutatedIndiv <- clf$functions$mutator(X, y, clf, pop, individual_to_be_mutated = pop[[i]], all_genes = c(1:clf$params$size_world), featEval)
newPop[[length(newPop) +1]] <- evaluateModel(mod = mutatedIndiv, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
}
if(pop[[i]]$toBeMutated)
{
mutatedIndiv <- clf$functions$mutator(X, y, clf, pop, individual_to_be_mutated = pop[[i]], all_genes = c(1:clf$params$size_world), featEval)
newPop[[length(newPop) +1]] <- evaluateModel(mod = mutatedIndiv, X = X, y = y, clf = clf, eval.all = FALSE, force.re.evaluation = TRUE)
}
} # end for each model in the population
} # end if no parallel
pop <- sortPopulation(pop, evalToOrder = "fit_")
pop <- pop[1:round(clf$params$size_pop*2/3)] # get the 2/3 best ones
pop <- pop[1:min(length(pop),round(clf$params$size_pop*2/3))] # keep the best 2/3 of the population, or the whole population if it is smaller
# set population size bigger at the first generation
if(generation==0) { clf$params$size_pop <- round(clf$params$size_pop*3/2) }
n_rest <- clf$params$size_pop - length(pop)
pop[(length(pop)+1):(length(pop)+ n_rest)] <- sortPopulation(newPop, evalToOrder = "fit_")[1:n_rest]
# fill the remaining 1/3 of the population with the best new models
pop <- c(pop, sortPopulation(newPop, evalToOrder = "fit_")[1:n_rest])
#pop[(length(pop)+1):(length(pop)+ n_rest)] <- sortPopulation(newPop, evalToOrder = "fit_")[1:n_rest]
# new2keep <- selectIndividualToKeep(clf, newPop)
# pop[(length(pop)+1):(length(pop)+ length(new2keep))] <- new2keep
# add the new population
pop[(length(pop)+1):(length(pop)+ length(newPop))] <- newPop
# add the whole new population
#pop[(length(pop)+1):(length(pop) + length(newPop))] <- newPop
pop <- c(pop, newPop)
pop <- cleanPopulation(pop = pop, clf = clf)
pop <- unique(pop) # clean
pop <- sortPopulation(pop, evalToOrder = "fit_") # sort
pop <- resetTags(pop) # reset
......
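To make the survivor-selection arithmetic above easier to follow, here is a simplified, self-contained sketch of the "keep the best 2/3 of the parents, fill the remaining 1/3 with the best offspring" step; it uses plain fitness vectors instead of model objects, and the function name is a placeholder rather than package code.

selectSurvivors <- function(fit_pop, fit_offspring, size_pop)
{
  keep <- min(length(fit_pop), round(size_pop * 2 / 3))          # elite share, capped by the actual population size
  elites <- head(sort(fit_pop, decreasing = TRUE), keep)          # best 2/3 of the current population
  n_rest <- size_pop - keep                                       # slots left to fill
  fillers <- head(sort(fit_offspring, decreasing = TRUE), n_rest) # best offspring fill the rest
  c(elites, fillers)                                              # the real code also appends the whole offspring pool before cleaning
}
# example: length(selectSurvivors(fit_pop = runif(30), fit_offspring = runif(50), size_pop = 30)) is 30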