Commit 277a2141 authored by Edi Prifti's avatar Edi Prifti

- adding a new licence and updating description

- adding pos_score and neg_score to the model for exploration of the balance
- add feature importance on prevalence PDA
- fix order of visu in the mergeResults
- adding custom colors in mergeMeltImportanceCV
- adding plotModelScore function
- adding vignette V0
parent 464f18a1
Package: predomics
Type: Package
Title: Ternary Prediction in Large Omics Datasets
Version: 0.9.8
Date: 2018-07-07
Version: 0.9.9
Date: 2018-08-13
Author: Edi Prifti, Lucas Robin, Shasha Cui, Blaise Hanczar, Yann Chevaleyre, Jean-Daniel Zucker
Maintainer: Edi Prifti <edi.prifti@gmail.com>
Depends:
......@@ -26,11 +26,11 @@ Imports:
viridis,
kernlab,
randomForest
Description: This package offers access to a novel framework implementing several heuristics that allow to find
sparse and interpretable models in very large dimention data. These models are efficient and adopted for
classification and regression in metagenomics and other large omics datasets. We introduce the BTR (BIN, TER, RATIO)
languages that describe links between variables in different ways. In this framework are also implemented different
Description: The predomics package offers access to a novel framework implementing several heuristics that allow finding
sparse and interpretable models in large datasets. These models are efficient and adopted for
classification and regression in metagenomics and other commensurable datasets. We introduce the BTR (BIN, TER, RATIO)
languages that describe different types of links between variables. In this framework are also implemented different
state of the art methods (SOTA) such as RF, ENET and SVM.
License: GPL (>= 2)
License: Attribution-NonCommercial-NoDerivatives 4.0 International
LazyData: TRUE
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
This diff is collapsed.
......@@ -83,6 +83,7 @@ export(plotComparativeResultsBest)
export(plotFeatureModelCoeffs)
export(plotMGSQuality)
export(plotModel)
export(plotModelScore)
export(plotPopulation)
export(plotPrevalence)
export(population)
......
......@@ -35,7 +35,7 @@ debugging_terda_YC <- function() {
nCores = nCores,
nRR = terda.nbRR,
seed = seed,
useCustomLanguage = useCustomLanguage,
#useCustomLanguage = useCustomLanguage,
experiment.id = "terda",
print_ind_method = print_ind_method,
experiment.save = experiment.save,
......
This diff is collapsed.
......@@ -43,17 +43,26 @@
plotComparativeEmpiricalScore <- function(digested.results, ylim = c(0.5,1), score = "auc_", main = "")
{
nbe <- length(table(digested.results$empirical[[score]]$method))# number of elements to compare
# plotting the Empirical scores
colors <- c(brewer.pal(9,"Set1"),
brewer.pal(9, "Set3"),
brewer.pal(9, "Spectral"),
brewer.pal(8, "Dark2")
)[1:nbe]
#pchs <- c(8:25)[1:nbe]
if(!is.null(digested.results$colors))
{
colors <- digested.results$colors
}else
{
colors <- c(brewer.pal(9,"Set1"),brewer.pal(9, "Set3"), brewer.pal(9, "Spectral"),brewer.pal(8, "Dark2"))[1:nbe]
}
if(!is.null(digested.results$pch))
{
pch <- digested.results$pch
}else
{
pch <- rep(19, nbe)
}
# EMPIRICAL SCORES
# TODO ORDER BY K not alphabetically
pd <- position_dodge(0.3) # move them .05 to the left and right
# EMPIRICAL SCORES
data <- digested.results$empirical[[score]]
# g <- ggplot(na.omit(data), aes(x=variable, y=value, colour=method)) +
# #geom_errorbar(aes(ymin=value, ymax=value), width=.1, position=pd) +
......@@ -113,11 +122,10 @@ plotComparativeEmpiricalScore <- function(digested.results, ylim = c(0.5,1), sco
ylim[2] <- max(data$value, na.rm = TRUE)
}
g <- ggplot(na.omit(data), aes(x=variable, y=value, colour=method)) +
g <- ggplot(na.omit(data), aes(x=variable, y=value, colour=method, shape = method)) +
#geom_errorbar(aes(ymin=value, ymax=value), width=.1, position=pd) +
geom_line(aes(group=method), position=pd) +
geom_point(position=pd, size=2, shape=19) + # 21 is filled circle
geom_point(position=pd, size=2) + # 21 is filled circle
xlab("k sparse") +
ggtitle(paste("WD","empirical",score, main, sep="; ")) +
geom_hline(aes(yintercept=1), lty=2, col="lightgray") +
......@@ -127,6 +135,7 @@ plotComparativeEmpiricalScore <- function(digested.results, ylim = c(0.5,1), sco
#scale_y_continuous(limits = ylim) +
scale_colour_manual(values=colors, name = "") +
scale_fill_manual(values=colors) +
scale_shape_manual(values=pch, name = "") +
theme_bw() +
#theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(axis.text.x=element_text(angle=90,hjust=1, vjust=0.5)) +
......@@ -791,15 +800,17 @@ multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
#' @param plot: if TRUE this provides a plot, otherwise will return different metrics such as prevalence and enrichment statistics
#' @param col.pt: colors for the point border (-1:deepskyblue4, 1:firebrick4)
#' @param col.bg: colors for the point fill (-1:deepskyblue1, 1:firebrick1)
#' @param zero.value: the value that specifies what is zero. This can be a different than 0 in log transformed data for instance (default = 0)
#' @return a ggplot object
#' @export
plotPrevalence <- function(features, X, y, topdown = TRUE, main = "", plot = TRUE,
col.pt = c("deepskyblue4", "firebrick4"),
col.bg = c("deepskyblue1", "firebrick1"))
col.bg = c("deepskyblue1", "firebrick1"),
zero.value = 0)
{
# build object
v.prop <- getFeaturePrevalence(features = features, X = X, y = y, prop = TRUE)
v.card <- getFeaturePrevalence(features = features, X = X, y = y, prop = FALSE)
v.prop <- getFeaturePrevalence(features = features, X = X, y = y, prop = TRUE, zero.value = zero.value)
v.card <- getFeaturePrevalence(features = features, X = X, y = y, prop = FALSE, zero.value = zero.value)
v.prop.mat <- do.call(rbind, v.prop)
v.card.mat <- do.call(rbind, v.card)
# build melted version
......@@ -1189,7 +1200,7 @@ plotPopulation <- function(pop, X, y,
#' @title Plots a model or a population of model objectsas barplots of scaled coefficients.
#'
#' @description Plots an model or a population of models as a barplots, representing each feature, the length being the coefficient
#' @description Plots a model or a population of models as a barplots, representing each feature, the length being the coefficient
#' @param mod: a model to plot
#' @param X: the data matrix with variables in the rows and observations in the columns
#' @param y: the class vector
......@@ -1431,13 +1442,10 @@ plotModel <- function(mod, X, y,
} # end CV mda plot type
#return(grid.arrange(g, g2, ncol=2, widths = c(2,1)))
} # end else plot importance
if(importance)
{
return(g2)
}else
}else # end else plot importance
{
warning("plotModel: importance is not available in this model, returning model plot")
return(g1)
}
}else
......@@ -1487,6 +1495,97 @@ plotModel <- function(mod, X, y,
}
#' @title Plots the positive and negative score distributions of a model.
#' @import reshape2
#' @import ggplot2
#' @import plyr
#' @description Plots the score distributions of a model as two panels: a
#' density plot (with dashed per-group mean lines) and a boxplot with jittered
#' points, one group for the positive (pos_score_) and one for the negative
#' (neg_score_) part of the ternary score.
#' @param mod: a model to plot
#' @param col.sign: the two colors used for the negative and positive score
#' groups (default: -1 = deepskyblue1, 1 = firebrick1)
#' @param main: possibility to change the title of the plot (default: "")
#' @return a gridExtra arrangement of the two ggplot panels, or NULL when the
#' model is invalid, is a SOTA model, or lacks the score attributes
#' @export
plotModelScore <- function(mod,
                           col.sign = c("deepskyblue1", "firebrick1"),
                           main = ""
)
{
  # test model validity
  if(!isModel(obj = mod))
  {
    print("plotModelScore: The model object is not valid!")
    return(NULL)
  }
  if(length(col.sign) != 2)
  {
    print("plotModelScore: please provide 2 colors for the ternary coefficients excluding zero")
    return(NULL)
  }
  # disable this plot for SOTA models
  if(isModelSota(mod))
  {
    print("plotModelScore: This method is not available for SOTA models")
    return(NULL)
  }
  # All three score attributes are needed; bail out when any is missing.
  # NOTE: myAssertNotNullNorNa() returns TRUE when the value IS available
  # (see its other call sites), hence the negation here. The original
  # un-negated test returned NULL exactly when the scores were present.
  if(!myAssertNotNullNorNa(mod$score_) || !myAssertNotNullNorNa(mod$pos_score_) || !myAssertNotNullNorNa(mod$neg_score_))
  {
    print("plotModelScore: one of the score_, pos_score_ or neg_score_ attributes is missing. Returning empty handed.")
    return(NULL)
  }
  # reset model attributes for glmnet for proper viewing (scalar test => &&)
  if(mod$learner == "terda" && mod$language == "logreg")
  {
    mod$learner <- "sota"
    mod$language <- "glmnet"
  }
  # long format: one row per score value, tagged by its sign group
  df <- data.frame(pos = mod$pos_score_,
                   neg = mod$neg_score_)
  df.melt <- melt(df)
  df.melt$variable <- factor(df.melt$variable, levels = c("neg","pos"))
  # per-group mean, used to draw the dashed vertical lines in the density plot
  df.melt.summarized <- ddply(df.melt, "variable", summarise, grp.mean=mean(value))
  # density of the scores per sign group
  g1 <- ggplot(data = df.melt, aes(x = value, color = variable)) +
    geom_density() +
    geom_vline(data = df.melt.summarized, aes(xintercept=grp.mean, color=variable), linetype="dashed") +
    scale_color_manual(values=col.sign) +
    ggtitle(main) +
    theme_bw() +
    ylab("score distribution") +
    theme(legend.position="bottom", legend.direction="horizontal")
  # boxplot + jittered points per sign group
  g2 <- ggplot(data = df.melt, aes(x = variable, y = value, color = variable)) +
    geom_boxplot(notch = FALSE, outlier.colour = NA, position = position_dodge(width=0.9), alpha = 0.3) +
    geom_point(position=position_jitterdodge(dodge.width=0.9), size = 2, alpha = 0.5) +
    scale_color_manual(values=col.sign) +
    ylab("score distribution") +
    xlab("Ter cumulative") +
    ggtitle(main) +
    theme_bw() +
    theme(legend.position="bottom", legend.direction="horizontal")
  return(grid.arrange(g1, g2, ncol=2))
}
#' @title Normalize the model coefficients needed for the plot
#'
#' @description Normalize the model coefficients needed for the plot
......
......@@ -127,6 +127,10 @@ fit <- function(X,
# transforming to dataframe first will make sure to keep the dimnames such as for instance when it is sparse table or otu_table (phyloseq)
y.nona <- y[!ina]
X.nona <- X[,!ina]
}else
{
y.nona <- y
X.nona <- X
}
feature.cor <- filterfeaturesK(data = t(apply(X.nona, 1, rank)), # for speedup
......
......@@ -307,8 +307,12 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
k_sparse.name <- names(res_train.digest$best_models)[k]
mod <- res_train.digest$best_models[[k_sparse.name]]
mod.train <- evaluateModel(mod=mod, X = x_train, y = y_train, clf = clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='train')
mod.test <- list()#evaluateModel(mod=mod, X=x_test, y=y_test, clf=clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='test')
mod$score_ <- getModelScore(mod = mod, X = as.matrix(x_test), clf, force.re.evaluation = TRUE)
mod.test <- list() #evaluateModel(mod=mod, X=x_test, y=y_test, clf=clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='test')
scorelist <- getModelScore(mod = mod, X = as.matrix(x_test), clf, force.re.evaluation = TRUE)
mod$score_ <- scorelist$score_
mod$pos_score_ <- scorelist$pos_score_
mod$neg_score_ <- scorelist$neg_score_
mod.test$accuracy_ <- evaluateAccuracy(mod = mod, X = x_test, y = y_test, clf = clf)$accuracy_
# Empirical fitting score
......@@ -362,7 +366,11 @@ LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
mod <- res_train.digest_2$best_models[[k_sparse.name]]
mod.train <- evaluateModel(mod = mod, X = x_train, y = y_train, clf = clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='train')
mod.test <- list() #evaluateModel(mod=mod, X=x_test, y=y_test, clf=clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='test')
mod$score_ <- getModelScore(mod = mod, X = as.matrix(x_test), clf, force.re.evaluation=TRUE)
scorelist <- getModelScore(mod = mod, X = as.matrix(x_test), clf, force.re.evaluation = TRUE)
mod$score_ <- scorelist$score_
mod$pos_score_ <- scorelist$pos_score_
mod$neg_score_ <- scorelist$neg_score_
mod.test$accuracy_ <- evaluateAccuracy(mod = mod, X = x_test, y = y_test, clf = clf)$accuracy_
# Empirical fitting score
......
......@@ -76,28 +76,16 @@ glmnetRR <- function(clf, X, y)
# nblambdas = 400
# lambdas = exp(-c(0:nblambdas)/(28*sqrt(nblambdas/150)))
if(clf$params$verbose) print("... ... running glmnet")
# set up the family parameter
# https://www.r-bloggers.com/how-and-when-ridge-regression-with-glmnet/
if(clf$params$objective == "cor")
{
family <- "gaussian"
}else
{
family <- "binomial"
}
ina <- is.na(y)
system.time(glmmod <- glmnet(x = t(X[,!ina]),
y = y[!ina],
system.time(glmmod <- glmnet(x = t(X),
y = y,
alpha = p$alpha,
family = family,
family = "binomial",
lower.limits = lower.limits,
upper.limits = upper.limits,
intercept = intercept,
lambda = lambdas,
standardize = TRUE)
)
)
if(clf$params$verbose) print("... ... glmnet object is created")
if(clf$params$plot)
......@@ -166,59 +154,56 @@ glmnetRR <- function(clf, X, y)
# Edi added this to solve a bug when SparsityInRange is NA
#if(is.na(SparsityInRange)) SparsityInRange <- FALSE
if(myAssertNotNullNorNa(SparsityInRange))
if(SparsityInRange || p$nRR==0) # if in the sparsity range or no roundings
{
if(SparsityInRange || p$nRR==0) # if in the sparsity range or no roundings
if(clf$params$verbose & !clf$params$debug)
{
cat(paste(".")) # progress
}
if(clf$params$debug) # give more information when debugging
{
cat(paste("glmnetRR...\t","j:",j,"\tlambda:",signif(lambda,3),"\n"))
}
glmmod.coefs <- as.matrix(coef(glmmod, s = lambda))
glmmod.coefs <- glmmod.coefs[2:(nrow(glmmod.coefs)),]
if(!all(glmmod.coefs==0)) # if the model is not empty
{
if(clf$params$verbose & !clf$params$debug)
# Randomized Rounding
if(p$nRR >= 1)
{
cat(paste(".")) # progress
}
if(clf$params$debug) # give more information when debugging
set.seed(clf$params$seed) # Different "seed" give us different result. # want different results
res <- multipleRR(clf = clf, X = X, y = y, w = glmmod.coefs, n = p$nRR) # use the no parallel version due to more global paralleliszaition
}else
{
cat(paste("glmnetRR...\t","j:",j,"\tlambda:",signif(lambda,3),"\n"))
res <- list(glmmod.coefs)
}
glmmod.coefs <- as.matrix(coef(glmmod, s = lambda))
glmmod.coefs <- glmmod.coefs[2:(nrow(glmmod.coefs)),]
if(!all(glmmod.coefs==0)) # if the model is not empty
if(clf$params$language == "logreg")
{
# Randomized Rounding
if(p$nRR >= 1)
{
set.seed(clf$params$seed) # Different "seed" give us different result. # want different results
res <- multipleRR(clf = clf, X = X, y = y, w = glmmod.coefs, n = p$nRR) # use the no parallel version due to more global paralleliszaition
}else
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf, obj=glmmod)
if(!is.null(mod))
{
res <- list(glmmod.coefs)
}
if(clf$params$language == "logreg")
{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf, obj=glmmod)
if(!is.null(mod))
{
mod$lambda <- lambda
}
pop <- c(pop, list(mod)) # add it to the pop
}else{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf)
pop <- c(pop, list(mod)) # add it to the pop
mod$lambda <- lambda
}
lamdas.zone[j] <- TRUE # to plot the zone when it is zommed
pop <- c(pop, list(mod)) # add it to the pop
}else{
mod <- denseVecToModel(X = X, y = y, v = res[[1]], clf = clf)
pop <- c(pop, list(mod)) # add it to the pop
}
}else
lamdas.zone[j] <- TRUE # to plot the zone when it is zommed
}
}else
{
if(clf$params$debug)
{
if(clf$params$debug)
{
print(paste(j, "This model has no coeffs. lambda =",signif(lambda,4)))
# Exclude the fact when the model has no coefficients.
}
print(paste(j, "This model has no coeffs. lambda =",signif(lambda,4)))
# Exclude the fact when the model has no coefficients.
}
} # checking for sanity
}
} # end loop for all coeffs
# clean the population (NULL) models
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment