Module sklearn_ensemble_cv.cross_validation
Expand source code
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn_ensemble_cv.ensemble import Ensemble
from sklearn_ensemble_cv.utils import check_input, process_grid
from sklearn.tree import DecisionTreeRegressor
from joblib import Parallel, delayed
n_jobs = 16
import warnings
def fit_ensemble(regr=None, kwargs_regr={}, kwargs_ensemble={}):
    '''
    Build an `Ensemble` around a freshly constructed base estimator.

    Despite its name, this function does not call `fit`; callers do that
    on the returned object.

    Parameters
    ----------
    regr : callable, optional
        The base estimator class (or factory). `DecisionTreeRegressor` is
        used when it is None.
    kwargs_regr : dict, optional
        Keyword arguments used to construct the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments forwarded to `Ensemble`.

    Returns
    ----------
    ensemble : Ensemble
        The ensemble wrapping the newly constructed base estimator.
    '''
    estimator_factory = DecisionTreeRegressor if regr is None else regr
    base_estimator = estimator_factory(**kwargs_regr)
    return Ensemble(estimator=base_estimator, **kwargs_ensemble)
# Sample-split and K-fold cross-validation
def comp_empirical_val(
    X_train, Y_train, X_val, Y_val, regr, kwargs_regr={}, kwargs_ensemble={}, M=20,
    n_jobs=-1, X_test=None, Y_test=None, _check_input=True, **kwargs_est,
    ):
    '''
    Fit an ensemble on the training split and estimate its risk on a
    held-out validation split (and optionally on a test set).

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    X_val, Y_val : numpy.array
        The validation samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. The test risk is only computed when both are given.
    _check_input : bool, optional
        If True, check the input arguments with `check_input`.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : Ensemble
        The fitted ensemble.
    risk : numpy.array or tuple
        The validation risk `risk_val`, or the pair `(risk_val, risk_test)`
        when both `X_test` and `Y_test` are provided.
    '''
    if _check_input:
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(kwargs_regr, kwargs_ensemble, kwargs_est, M)

    regr = fit_ensemble(regr, kwargs_regr, kwargs_ensemble).fit(X_train, Y_train)
    risk_val = regr.compute_risk(X_val, Y_val, M_test=None, return_df=False, n_jobs=n_jobs, **kwargs_est)

    # The guard must look at X_test (it used to look at X_val): otherwise
    # passing Y_test without X_test evaluated the risk on X_test=None and
    # returned a tuple although the callers size their buffers for a
    # single risk vector.
    if X_test is not None and Y_test is not None:
        risk_test = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
        return regr, (risk_val, risk_test)
    return regr, risk_val
def splitCV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs
    ):
    '''
    Sample-split cross-validation for ensemble models: a single random
    train/validation split is drawn and every grid point is scored on it.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples. They are indexed with integer index arrays to
        form the train/validation split.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. It may be useful to be used for comparing the
        performance of different cross-validation methods.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Additional keyword arguments for `ShuffleSplit`; see the
        scikit-learn documentation of `ShuffleSplit` for more details.

    Returns
    ----------
    res_splitcv : pandas.DataFrame, numpy.array or tuple
        With `return_df=True`, a data frame holding the two grids followed by
        the columns `risk_val-1..M` (and `risk_test-1..M` with test data).
        Otherwise the validation risk array of shape (n_grid, M), or the
        pair `(validation risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score,
        plus the parameters of the split under `'split_params'`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    test = not (X_test is None or Y_test is None)
    n_grid = len(grid_regr)
    # Validation risks fill the first M columns, test risks (if any) the next M.
    res_risk = np.full((n_grid, 2*M if test else M), np.inf)

    rs = ShuffleSplit(1, **kwargs)
    id_train, id_val = next(rs.split(X_train, Y_train))
    X_fit, Y_fit = X_train[id_train], Y_train[id_train]
    X_hold, Y_hold = X_train[id_val], Y_train[id_val]

    for i in range(n_grid):
        regr_params = {**kwargs_regr, **grid_regr[i]}
        ensemble_params = {**kwargs_ensemble, **grid_ensemble[i]}
        _, res = comp_empirical_val(
            X_fit, Y_fit, X_hold, Y_hold, regr,
            regr_params, ensemble_params,
            M, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
            )
        res_risk[i, :] = np.r_[res]

    if return_df:
        cols = ['risk_val-%d' % k for k in range(1, M+1)]
        if test:
            cols = cols + ['risk_test-%d' % k for k in range(1, M+1)]
        frames = [
            pd.DataFrame(grid_regr),
            pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols),
        ]
        res_splitcv = pd.concat(frames, axis=1)
    elif test:
        res_splitcv = (res_risk[:,:M], res_risk[:,M:])
    else:
        res_splitcv = res_risk

    # Pick the grid point / ensemble size with the smallest validation risk.
    val_risk = res_risk[:,:M]
    j, M_best = np.unravel_index(np.nanargmin(val_risk), val_risk.shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
        'split_params':{
            'index_train':id_train,
            'index_val':id_val,
            'test_size':rs.test_size,
            'random_state':rs.random_state
        }
    }
    return res_splitcv, info
def KFoldCV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs
    ):
    '''
    K-fold cross-validation for ensemble models: every grid point is scored
    on each fold and the risks are averaged over the folds.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples. They are indexed with integer index arrays to
        form the folds.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. It may be useful to be used for comparing the
        performance of different cross-validation methods.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Additional keyword arguments for `KFold`; see the scikit-learn
        documentation of `KFold` for more details.

    Returns
    ----------
    res_splitcv : pandas.DataFrame, numpy.array or tuple
        Fold-averaged risks. With `return_df=True`, a data frame holding the
        two grids followed by the columns `risk_val-1..M` (and
        `risk_test-1..M` with test data). Otherwise the validation risk
        array of shape (n_grid, M), or the pair
        `(validation risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score,
        the per-fold risks under `'val_score'` / `'test_score'`, and the
        parameters of the splitter under `'split_params'`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    test = not (X_test is None or Y_test is None)
    n_grid = len(grid_regr)

    kf = KFold(**kwargs)
    n_splits = kf.get_n_splits(X_train)
    # Axis 0: grid point, axis 1: risk column (validation, then test), axis 2: fold.
    res_risk_all = np.full((n_grid, 2*M if test else M, n_splits), np.inf)

    for fold, (id_train, id_val) in enumerate(kf.split(X_train)):
        X_fit, Y_fit = X_train[id_train], Y_train[id_train]
        X_hold, Y_hold = X_train[id_val], Y_train[id_val]
        for i in range(n_grid):
            regr_params = {**kwargs_regr, **grid_regr[i]}
            ensemble_params = {**kwargs_ensemble, **grid_ensemble[i]}
            _, res = comp_empirical_val(
                X_fit, Y_fit, X_hold, Y_hold, regr,
                regr_params, ensemble_params,
                M, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
                )
            res_risk_all[i, :, fold] = np.r_[res]

    res_risk = np.mean(res_risk_all, axis=2)

    if return_df:
        cols = ['risk_val-%d' % k for k in range(1, M+1)]
        if test:
            cols = cols + ['risk_test-%d' % k for k in range(1, M+1)]
        frames = [
            pd.DataFrame(grid_regr),
            pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols),
        ]
        res_splitcv = pd.concat(frames, axis=1)
    elif test:
        res_splitcv = (res_risk[:,:M], res_risk[:,M:])
    else:
        res_splitcv = res_risk

    # Pick the grid point / ensemble size with the smallest averaged validation risk.
    val_risk = res_risk[:,:M]
    j, M_best = np.unravel_index(np.nanargmin(val_risk), val_risk.shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
        'val_score':res_risk_all[:,:M],
        'test_score':None if not test else res_risk_all[:,M:],
        'split_params':{
            'n_splits':n_splits,
            'random_state':kf.random_state,
            'shuffle':kf.shuffle,
        }
    }
    return res_splitcv, info
# Out-of-bag cross-validation
def comp_empirical_ecv(
    X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=np.inf,
    n_jobs=-1, X_test=None, Y_test=None, _check_input=True, **kwargs_est,
    ):
    '''
    Fit an ensemble on the training data and compute its empirical ECV
    risk estimate (and optionally its risk on a test set).

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    M0 : int, optional
        The number of estimators to use for the ECV estimate. Must not
        exceed `M`.
    M_max : int, optional
        The maximum ensemble size to consider for the tuned ensemble. When
        inputs are checked, infinity expands to the sizes 1..M followed by
        infinity, and a finite scalar to the sizes 1..M_max.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples.
    _check_input : bool, optional
        If True, check the input arguments.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : Ensemble
        The fitted ensemble.
    risk : numpy.array or tuple
        The empirical ECV estimate `risk_ecv`, or the pair
        `(risk_ecv, risk_val)` when both `X_test` and `Y_test` are given,
        where `risk_val` is the risk computed on the test samples.

    Raises
    ----------
    ValueError
        If inputs are checked and `M0` is larger than `M`.
    '''
    if _check_input:
        if M0>M:
            raise ValueError('M0 must be less than or equal to M.')
        if np.isinf(M_max):
            sizes = np.arange(M)+1
            M_max = np.append(sizes, np.inf)
        elif np.isscalar(M_max):
            M_max = np.arange(M_max)+1
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(kwargs_regr, kwargs_ensemble, kwargs_est, M)

    ensemble = fit_ensemble(regr, kwargs_regr, kwargs_ensemble)
    regr = ensemble.fit(X_train, Y_train)
    risk_ecv = regr.compute_ecv_estimate(X_train, Y_train, M_max, M0=M0, n_jobs=n_jobs, **kwargs_est)

    has_test = X_test is not None and Y_test is not None
    if not has_test:
        return regr, risk_ecv

    risk_val = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
    return regr, (risk_ecv, risk_val)
def ECV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, M0=20, M_max=np.inf, delta=0., return_df=False, n_jobs=-1, X_test=None, Y_test=None,
    kwargs_est={}, **kwargs
    ):
    '''
    Cross-validation for ensemble models using the empirical ECV estimate.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    M0 : int, optional
        The number of estimators to use for the ECV estimate. Must not
        exceed `M`.
    M_max : int, optional
        The maximum ensemble size to consider for the tuned ensemble.
        Infinity expands to the sizes 1..M followed by infinity; a finite
        scalar expands to the sizes 1..M_max.
    delta : float, optional
        The suboptimality parameter for the ensemble size tuning by ECV.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The validation samples. It may be useful to be used for comparing the
        performance of ECV with other cross-validation methods that requires sample-splitting.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Accepted for signature compatibility; not used by this function.

    Returns
    ----------
    res_ecv : pandas.DataFrame, numpy.array or tuple
        With `return_df=True`, a data frame holding the two grids followed by
        one `risk_val-*` column per entry of `M_max` (the last one is named
        `risk_val-inf` when `M_max` ends with infinity) and, with test data,
        the columns `risk_test-1..M`. Otherwise the ECV risk array, or the
        pair `(ECV risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score,
        together with `delta`, the largest considered size `'M_max'` and the
        delta-optimal size / score (`'best_n_estimators_extrapolate'`,
        `'best_score_extrapolate'`).

    Raises
    ----------
    ValueError
        If `M0` is larger than `M`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    if M0>M:
        raise ValueError('M0 must be less than or equal to M.')
    if np.isinf(M_max):
        M_max = np.append(np.arange(M)+1, np.inf)
    elif np.isscalar(M_max):
        M_max = np.arange(M_max)+1
    n_M_max = len(M_max)

    test = X_test is not None and Y_test is not None
    # ECV risks fill the first n_M_max columns, test risks (if any) the next M.
    n_res = n_M_max+M if test else n_M_max
    n_grid = len(grid_regr)
    res_risk = np.full((n_grid, n_res), np.inf)

    for i in range(n_grid):
        params_ensemble = grid_ensemble[i]
        params_regr = grid_regr[i]
        _, res = comp_empirical_ecv(
            X_train, Y_train, regr,
            {**kwargs_regr, **params_regr}, {**kwargs_ensemble, **params_ensemble},
            M, M0, M_max, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
            )
        res_risk[i, :] = np.r_[res]

    if return_df:
        # Plain Python strings are used on purpose: in a fixed-width NumPy
        # string array the assignment of 'risk_val-inf' is silently
        # truncated (to 'risk_val-in') whenever n_M_max < 100.
        cols = ['risk_val-%d' % k for k in range(1, n_M_max+1)]
        if np.isinf(M_max[-1]):
            cols[-1] = 'risk_val-inf'
        if test:
            cols += ['risk_test-%d' % k for k in range(1, M+1)]
        res_ecv = pd.concat([pd.DataFrame(grid_regr), pd.DataFrame(grid_ensemble),
                             pd.DataFrame(res_risk, columns=cols)
                            ] ,axis=1)
    elif test:
        res_ecv = (res_risk[:,:n_M_max], res_risk[:,n_M_max:])
    else:
        res_ecv = res_risk

    # Only ECV columns may take part in the selection. Slicing with M alone
    # would pull in test-risk columns when a scalar M_max < M is combined
    # with test data.
    n_sel = min(M, n_M_max)
    j, M_best = np.unravel_index(np.nanargmin(res_risk[:,:n_sel]), res_risk[:,:n_sel].shape)
    M_best += 1

    if delta==0.:
        M_hat = np.inf
    else:
        gap = res_risk[j,0] - res_risk[j,1]
        M_hat = int(np.ceil(2 / (delta + 2/M_max[-1]*gap) * gap))
        # A non-positive value (risk not decreasing from size 1 to size 2)
        # is not a valid ensemble size and would index from the end.
        M_hat = max(M_hat, 1)
    M_best_ext = np.minimum(M_hat, M_max[-1])
    if not np.isinf(M_best_ext):
        M_best_ext = int(M_best_ext)

    # Number of leading columns that correspond to the finite sizes 1, 2, ...
    n_finite = n_M_max-1 if np.isinf(M_max[-1]) else n_M_max
    if np.isinf(M_best_ext):
        best_score_ext = res_risk[j, n_M_max-1]
    elif M_best_ext <= n_finite:
        best_score_ext = res_risk[j, M_best_ext-1]
    else:
        # Only reachable when M_max ends with infinity and the delta-optimal
        # size lies beyond the tabulated sizes; indexing would then raise or
        # read a test-risk column. NOTE(review): the score is extrapolated
        # with the same "risk is affine in 1/M" relation that M_hat is
        # derived from above -- confirm against compute_ecv_estimate.
        best_score_ext = res_risk[j, n_M_max-1] + 2*(res_risk[j,0] - res_risk[j,1])/M_best_ext

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],

        'delta': delta,
        'M_max':M_max[-1],
        'best_n_estimators_extrapolate': M_best_ext,
        'best_score_extrapolate': best_score_ext,
    }
    return res_ecv, info
# Generalized cross-validation
def comp_empirical_gcv(
    X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=np.inf,
    corrected=True, type='full',
    n_jobs=-1, X_test=None, Y_test=None, _check_input=True, **kwargs_est,
    ):
    '''
    Fit an ensemble on the training data and compute its empirical GCV or
    CGCV risk estimate, extrapolated to the ensemble sizes in `M_max`
    (and optionally its risk on a test set).

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    M0 : int, optional
        The number of estimators passed to the GCV/CGCV estimate. Must not
        exceed `M`.
    M_max : int, optional
        The ensemble sizes to extrapolate to. When inputs are checked,
        infinity expands to the sizes 1..M followed by infinity, and a
        finite scalar to the sizes 1..M_max.
    corrected : bool, optional
        If True, compute the corrected GCV estimate.
    type : str, optional
        The type of GCV or CGCV estimate to compute. It can be either 'full' or 'union' for naive GCV,
        and 'full' or 'ovlp' for CGCV.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples.
    _check_input : bool, optional
        If True, check the input arguments.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : Ensemble
        The fitted ensemble.
    risk : numpy.array or tuple
        The extrapolated GCV/CGCV estimate `risk_gcv`, or the pair
        `(risk_gcv, risk_val)` when both `X_test` and `Y_test` are given,
        where `risk_val` is the risk computed on the test samples.

    Raises
    ----------
    ValueError
        If inputs are checked and `M0` is larger than `M`.
    '''
    if _check_input:
        if M0>M:
            raise ValueError('M0 must be less than or equal to M.')
        if np.isinf(M_max):
            sizes = np.arange(M)+1
            M_max = np.append(sizes, np.inf)
        elif np.isscalar(M_max):
            M_max = np.arange(M_max)+1
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(kwargs_regr, kwargs_ensemble, kwargs_est, M)

    ensemble = fit_ensemble(regr, kwargs_regr, kwargs_ensemble)
    regr = ensemble.fit(X_train, Y_train)

    # Corrected and naive GCV share the same call signature.
    estimate = regr.compute_cgcv_estimate if corrected else regr.compute_gcv_estimate
    risk_gcv = estimate(X_train, Y_train, M0, type, n_jobs=n_jobs, **kwargs_est)
    risk_gcv = regr.extrapolate(risk_gcv, M_max)

    has_test = X_test is not None and Y_test is not None
    if not has_test:
        return regr, risk_gcv

    risk_val = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
    return regr, (risk_gcv, risk_val)
def GCV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, M0=20, M_max=np.inf, corrected=True, type='full', return_df=False, n_jobs=-1, X_test=None, Y_test=None,
    kwargs_est={}, **kwargs
    ):
    '''
    Cross-validation for ensemble models using the empirical GCV (or
    corrected GCV) estimate.
    Currently, only the GCV estimates for the Ridge, Lasso, and ElasticNet are implemented.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    M0 : int, optional
        The number of estimators passed to the GCV/CGCV estimate. Must not
        exceed `M`.
    M_max : int, optional
        The ensemble sizes to extrapolate to. Infinity expands to the sizes
        1..M followed by infinity; a finite scalar expands to 1..M_max.
    corrected : bool, optional
        If True, compute the corrected GCV estimate.
    type : str, optional
        The type of GCV or CGCV estimate to compute. It can be either 'full' or 'union' for naive GCV,
        and 'full' or 'ovlp' for CGCV.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The validation samples. It may be useful to be used for comparing the
        performance of GCV with other cross-validation methods that requires sample-splitting.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Accepted for signature compatibility; not used by this function.

    Returns
    ----------
    res_gcv : pandas.DataFrame, numpy.array or tuple
        With `return_df=True`, a data frame holding the two grids followed by
        one `risk_val-*` column per entry of `M_max` (the last one is named
        `risk_val-inf` when `M_max` ends with infinity) and, with test data,
        the columns `risk_test-1..M`. Otherwise the GCV risk array, or the
        pair `(GCV risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score.

    Raises
    ----------
    ValueError
        If `M0` is larger than `M`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    if M0>M:
        raise ValueError('M0 must be less than or equal to M.')
    if np.isinf(M_max):
        M_max = np.append(np.arange(M)+1, np.inf)
    elif np.isscalar(M_max):
        M_max = np.arange(M_max)+1
    n_M_max = len(M_max)

    test = X_test is not None and Y_test is not None
    # GCV risks fill the first n_M_max columns, test risks (if any) the next M.
    n_res = n_M_max+M if test else n_M_max
    n_grid = len(grid_regr)
    res_risk = np.full((n_grid, n_res), np.inf)

    for i in range(n_grid):
        params_ensemble = grid_ensemble[i]
        params_regr = grid_regr[i]
        _, res = comp_empirical_gcv(
            X_train, Y_train, regr,
            {**kwargs_regr, **params_regr}, {**kwargs_ensemble, **params_ensemble},
            M, M0, M_max, corrected, type, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
            )
        res_risk[i, :] = np.r_[res]

    if return_df:
        # Plain Python strings are used on purpose: in a fixed-width NumPy
        # string array the assignment of 'risk_val-inf' is silently
        # truncated (to 'risk_val-in') whenever n_M_max < 100.
        cols = ['risk_val-%d' % k for k in range(1, n_M_max+1)]
        if np.isinf(M_max[-1]):
            cols[-1] = 'risk_val-inf'
        if test:
            cols += ['risk_test-%d' % k for k in range(1, M+1)]
        res_gcv = pd.concat([pd.DataFrame(grid_regr), pd.DataFrame(grid_ensemble),
                             pd.DataFrame(res_risk, columns=cols)
                            ] ,axis=1)
    elif test:
        # The GCV block is n_M_max columns wide (M+1 for the default
        # M_max=inf); splitting at M would leak the last GCV column into the
        # test-risk block.
        res_gcv = (res_risk[:,:n_M_max], res_risk[:,n_M_max:])
    else:
        res_gcv = res_risk

    # Only GCV columns may take part in the selection. Slicing with M alone
    # would pull in test-risk columns when a scalar M_max < M is combined
    # with test data.
    n_sel = min(M, n_M_max)
    j, M_best = np.unravel_index(np.nanargmin(res_risk[:,:n_sel]), res_risk[:,:n_sel].shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        # Reported for consistency with the other cross-validation helpers.
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
    }
    return res_gcv, info
def fit_ensemble(regr=None, kwargs_regr={}, kwargs_ensemble={})
Expand source code
def fit_ensemble(regr=None, kwargs_regr={}, kwargs_ensemble={}):
    '''
    Build an `Ensemble` around a freshly constructed base estimator.

    Despite its name, this function does not call `fit`; callers do that
    on the returned object.

    Parameters
    ----------
    regr : callable, optional
        The base estimator class (or factory). `DecisionTreeRegressor` is
        used when it is None.
    kwargs_regr : dict, optional
        Keyword arguments used to construct the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments forwarded to `Ensemble`.

    Returns
    ----------
    ensemble : Ensemble
        The ensemble wrapping the newly constructed base estimator.
    '''
    estimator_factory = DecisionTreeRegressor if regr is None else regr
    base_estimator = estimator_factory(**kwargs_regr)
    return Ensemble(estimator=base_estimator, **kwargs_ensemble)
def comp_empirical_val(X_train, Y_train, X_val, Y_val, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, n_jobs=-1, X_test=None, Y_test=None, **kwargs_est)
Compute the empirical validation risk for a given ensemble model.
X_train, Y_train : numpy.array - The training samples.
X_val, Y_val : numpy.array - The validation samples.
regr : object - The base estimator to use for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The maximum ensemble size to consider.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The test samples.
_check_input : bool, optional - If True, check the input arguments.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
Returns: risk_val : numpy.array - The empirical validation risk estimate.
Expand source code
def comp_empirical_val(
    X_train, Y_train, X_val, Y_val, regr, kwargs_regr={}, kwargs_ensemble={}, M=20,
    n_jobs=-1, X_test=None, Y_test=None, _check_input=True, **kwargs_est,
    ):
    '''
    Fit an ensemble on the training split and estimate its risk on a
    held-out validation split (and optionally on a test set).

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    X_val, Y_val : numpy.array
        The validation samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. The test risk is only computed when both are given.
    _check_input : bool, optional
        If True, check the input arguments with `check_input`.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : Ensemble
        The fitted ensemble.
    risk : numpy.array or tuple
        The validation risk `risk_val`, or the pair `(risk_val, risk_test)`
        when both `X_test` and `Y_test` are provided.
    '''
    if _check_input:
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(kwargs_regr, kwargs_ensemble, kwargs_est, M)

    regr = fit_ensemble(regr, kwargs_regr, kwargs_ensemble).fit(X_train, Y_train)
    risk_val = regr.compute_risk(X_val, Y_val, M_test=None, return_df=False, n_jobs=n_jobs, **kwargs_est)

    # The guard must look at X_test (it used to look at X_val): otherwise
    # passing Y_test without X_test evaluated the risk on X_test=None and
    # returned a tuple although the callers size their buffers for a
    # single risk vector.
    if X_test is not None and Y_test is not None:
        risk_test = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
        return regr, (risk_val, risk_test)
    return regr, risk_val
def splitCV(X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={}, M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs)
Sample-split cross-validation for ensemble models.
X_train, Y_train : numpy.array - The training samples.
regr : object - The base estimator to use for the ensemble model.
grid_regr : pandas.DataFrame - The grid of hyperparameters for the base estimator.
grid_ensemble : pandas.DataFrame - The grid of hyperparameters for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The ensemble size to build.
return_df : bool, optional - If True, returns the results as a pandas.DataFrame object.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The test samples. They may be useful for comparing the performance of different cross-validation methods.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
kwargs : dict, optional - Additional keyword arguments for `ShuffleSplit`;
see the scikit-learn documentation of `ShuffleSplit` for more details.
Expand source code
def splitCV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs
    ):
    '''
    Sample-split cross-validation for ensemble models: a single random
    train/validation split is drawn and every grid point is scored on it.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples. They are indexed with integer index arrays to
        form the train/validation split.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. It may be useful to be used for comparing the
        performance of different cross-validation methods.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Additional keyword arguments for `ShuffleSplit`; see the
        scikit-learn documentation of `ShuffleSplit` for more details.

    Returns
    ----------
    res_splitcv : pandas.DataFrame, numpy.array or tuple
        With `return_df=True`, a data frame holding the two grids followed by
        the columns `risk_val-1..M` (and `risk_test-1..M` with test data).
        Otherwise the validation risk array of shape (n_grid, M), or the
        pair `(validation risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score,
        plus the parameters of the split under `'split_params'`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    test = not (X_test is None or Y_test is None)
    n_grid = len(grid_regr)
    # Validation risks fill the first M columns, test risks (if any) the next M.
    res_risk = np.full((n_grid, 2*M if test else M), np.inf)

    rs = ShuffleSplit(1, **kwargs)
    id_train, id_val = next(rs.split(X_train, Y_train))
    X_fit, Y_fit = X_train[id_train], Y_train[id_train]
    X_hold, Y_hold = X_train[id_val], Y_train[id_val]

    for i in range(n_grid):
        regr_params = {**kwargs_regr, **grid_regr[i]}
        ensemble_params = {**kwargs_ensemble, **grid_ensemble[i]}
        _, res = comp_empirical_val(
            X_fit, Y_fit, X_hold, Y_hold, regr,
            regr_params, ensemble_params,
            M, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
            )
        res_risk[i, :] = np.r_[res]

    if return_df:
        cols = ['risk_val-%d' % k for k in range(1, M+1)]
        if test:
            cols = cols + ['risk_test-%d' % k for k in range(1, M+1)]
        frames = [
            pd.DataFrame(grid_regr),
            pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols),
        ]
        res_splitcv = pd.concat(frames, axis=1)
    elif test:
        res_splitcv = (res_risk[:,:M], res_risk[:,M:])
    else:
        res_splitcv = res_risk

    # Pick the grid point / ensemble size with the smallest validation risk.
    val_risk = res_risk[:,:M]
    j, M_best = np.unravel_index(np.nanargmin(val_risk), val_risk.shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
        'split_params':{
            'index_train':id_train,
            'index_val':id_val,
            'test_size':rs.test_size,
            'random_state':rs.random_state
        }
    }
    return res_splitcv, info
def KFoldCV(X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={}, M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs)
K-fold cross-validation for ensemble models.
X_train, Y_train : numpy.array - The training samples.
regr : object - The base estimator to use for the ensemble model.
grid_regr : pandas.DataFrame - The grid of hyperparameters for the base estimator.
grid_ensemble : pandas.DataFrame - The grid of hyperparameters for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The ensemble size to build.
return_df : bool, optional - If True, returns the results as a pandas.DataFrame object.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The test samples. They may be useful for comparing the performance of different cross-validation methods.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
kwargs : dict, optional - Additional keyword arguments for `KFold`;
see the scikit-learn documentation of `KFold` for more details.
Expand source code
def KFoldCV(
    X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
    M=20, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs
    ):
    '''
    K-fold cross-validation for ensemble models: every grid point is scored
    on each fold and the risks are averaged over the folds.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples. They are indexed with integer index arrays to
        form the folds.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr : pandas.DataFrame
        The grid of hyperparameters for the base estimator.
    grid_ensemble : pandas.DataFrame
        The grid of hyperparameters for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples. It may be useful to be used for comparing the
        performance of different cross-validation methods.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    kwargs : dict, optional
        Additional keyword arguments for `KFold`; see the scikit-learn
        documentation of `KFold` for more details.

    Returns
    ----------
    res_splitcv : pandas.DataFrame, numpy.array or tuple
        Fold-averaged risks. With `return_df=True`, a data frame holding the
        two grids followed by the columns `risk_val-1..M` (and
        `risk_test-1..M` with test data). Otherwise the validation risk
        array of shape (n_grid, M), or the pair
        `(validation risks, test risks)` when test data is given.
    info : dict
        The selected hyperparameters, ensemble size, grid index and score,
        the per-fold risks under `'val_score'` / `'test_score'`, and the
        parameters of the splitter under `'split_params'`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    test = not (X_test is None or Y_test is None)
    n_grid = len(grid_regr)

    kf = KFold(**kwargs)
    n_splits = kf.get_n_splits(X_train)
    # Axis 0: grid point, axis 1: risk column (validation, then test), axis 2: fold.
    res_risk_all = np.full((n_grid, 2*M if test else M, n_splits), np.inf)

    for fold, (id_train, id_val) in enumerate(kf.split(X_train)):
        X_fit, Y_fit = X_train[id_train], Y_train[id_train]
        X_hold, Y_hold = X_train[id_val], Y_train[id_val]
        for i in range(n_grid):
            regr_params = {**kwargs_regr, **grid_regr[i]}
            ensemble_params = {**kwargs_ensemble, **grid_ensemble[i]}
            _, res = comp_empirical_val(
                X_fit, Y_fit, X_hold, Y_hold, regr,
                regr_params, ensemble_params,
                M, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
                )
            res_risk_all[i, :, fold] = np.r_[res]

    res_risk = np.mean(res_risk_all, axis=2)

    if return_df:
        cols = ['risk_val-%d' % k for k in range(1, M+1)]
        if test:
            cols = cols + ['risk_test-%d' % k for k in range(1, M+1)]
        frames = [
            pd.DataFrame(grid_regr),
            pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols),
        ]
        res_splitcv = pd.concat(frames, axis=1)
    elif test:
        res_splitcv = (res_risk[:,:M], res_risk[:,M:])
    else:
        res_splitcv = res_risk

    # Pick the grid point / ensemble size with the smallest averaged validation risk.
    val_risk = res_risk[:,:M]
    j, M_best = np.unravel_index(np.nanargmin(val_risk), val_risk.shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
        'val_score':res_risk_all[:,:M],
        'test_score':None if not test else res_risk_all[:,M:],
        'split_params':{
            'n_splits':n_splits,
            'random_state':kf.random_state,
            'shuffle':kf.shuffle,
        }
    }
    return res_splitcv, info
def comp_empirical_ecv(X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=inf, n_jobs=-1, X_test=None, Y_test=None, **kwargs_est)
Compute the empirical ECV estimate for a given ensemble model.
X_train, Y_train : numpy.array - The training samples.
regr : object - The base estimator to use for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The maximum ensemble size to consider.
M0 : int, optional - The number of estimators to use for the ECV estimate.
M_max : int, optional - The maximum ensemble size to consider for the tuned ensemble.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The test samples.
_check_input : bool, optional - If True, check the input arguments.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
Returns: risk_ecv : numpy.array - The empirical ECV estimate.
Expand source code
def comp_empirical_ecv(
    X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=np.inf,
    n_jobs=-1, X_test=None, Y_test=None, _check_input=True, **kwargs_est,
    ):
    '''
    Fit an ensemble on the training data and compute its empirical ECV
    risk estimate (and optionally its risk on a test set).

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    M0 : int, optional
        The number of estimators to use for the ECV estimate. Must not
        exceed `M`.
    M_max : int, optional
        The maximum ensemble size to consider for the tuned ensemble. When
        inputs are checked, infinity expands to the sizes 1..M followed by
        infinity, and a finite scalar to the sizes 1..M_max.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples.
    _check_input : bool, optional
        If True, check the input arguments.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : Ensemble
        The fitted ensemble.
    risk : numpy.array or tuple
        The empirical ECV estimate `risk_ecv`, or the pair
        `(risk_ecv, risk_val)` when both `X_test` and `Y_test` are given,
        where `risk_val` is the risk computed on the test samples.

    Raises
    ----------
    ValueError
        If inputs are checked and `M0` is larger than `M`.
    '''
    if _check_input:
        if M0>M:
            raise ValueError('M0 must be less than or equal to M.')
        if np.isinf(M_max):
            sizes = np.arange(M)+1
            M_max = np.append(sizes, np.inf)
        elif np.isscalar(M_max):
            M_max = np.arange(M_max)+1
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(kwargs_regr, kwargs_ensemble, kwargs_est, M)

    ensemble = fit_ensemble(regr, kwargs_regr, kwargs_ensemble)
    regr = ensemble.fit(X_train, Y_train)
    risk_ecv = regr.compute_ecv_estimate(X_train, Y_train, M_max, M0=M0, n_jobs=n_jobs, **kwargs_est)

    has_test = X_test is not None and Y_test is not None
    if not has_test:
        return regr, risk_ecv

    risk_val = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
    return regr, (risk_ecv, risk_val)
def ECV(X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=inf, delta=0.0, return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs)
Cross-validation for ensemble models using the empirical ECV estimate.
X_train, Y_train : numpy.array - The training samples.
grid_regr, grid_ensemble : pandas.DataFrame - The grids of hyperparameters to search over.
regr : object - The base estimator to use for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The ensemble size to build.
M0 : int, optional - The number of estimators to use for the ECV estimate.
M_max : int, optional - The maximum ensemble size to consider for the tuned ensemble.
delta : float, optional - The suboptimality parameter for the ensemble size tuning by ECV.
return_df : bool, optional - If True, returns the results as a pandas.DataFrame object.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The validation samples. They may be useful for comparing the performance of ECV with other cross-validation methods that require sample-splitting.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
Expand source code
def ECV(
        X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
        M=20, M0=20, M_max=np.inf, delta=0., return_df=False, n_jobs=-1, X_test=None, Y_test=None,
        kwargs_est={}, **kwargs
        ):
    '''
    Cross-validation for ensemble models using the empirical ECV estimate.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr, grid_ensemble : optional
        The grids of hyperparameters to search over for the base estimator
        and the ensemble model, respectively. Both are normalized by
        `process_grid` before use.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    M0 : int, optional
        The number of estimators to use for the ECV estimate.
    M_max : int, optional
        The maximum ensemble size to consider for the tuned ensemble.
    delta : float, optional
        The suboptimality parameter for the ensemble size tuning by ECV.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The validation samples. They may be useful for comparing the
        performance of ECV with other cross-validation methods that
        require sample-splitting.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    **kwargs
        Accepted but not used by this function.

    Returns
    ----------
    res_ecv : pandas.DataFrame, numpy.array or tuple
        If `return_df` is True, a data frame with the grid parameters and
        one `risk_val-*` column per ensemble size (plus `risk_test-*`
        columns when test samples are given). Otherwise the array of ECV
        estimates, or the pair `(ecv_estimates, test_risks)` when test
        samples are given.
    info : dict
        The selected configuration, with keys `best_params_regr`,
        `best_params_ensemble`, `best_n_estimators`, `best_params_index`,
        `best_score`, `delta`, `M_max`, `best_n_estimators_extrapolate`
        and `best_score_extrapolate`.

    Raises
    ----------
    ValueError
        If `M0` exceeds `M`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    if M0>M:
        raise ValueError('M0 must be less than or equal to M.')
    # Turn a scalar M_max into the explicit list of ensemble sizes.
    if np.isinf(M_max):
        M_max = np.append(np.arange(M)+1, np.inf)
    elif np.isscalar(M_max):
        M_max = np.arange(M_max)+1
    n_M_max = len(M_max)

    test = X_test is not None and Y_test is not None
    # Each row holds n_M_max ECV estimates, followed by M test risks if any.
    n_res = n_M_max+M if test else n_M_max
    n_grid = len(grid_regr)
    res_risk = np.full((n_grid, n_res), np.inf)

    for i in range(n_grid):
        params_ensemble = grid_ensemble[i]
        params_regr = grid_regr[i]

        _, res = comp_empirical_ecv(
            X_train, Y_train, regr, {**kwargs_regr, **params_regr},
            {**kwargs_ensemble, **params_ensemble},
            M, M0, M_max, n_jobs, X_test, Y_test, _check_input=False, **kwargs_est
        )
        res_risk[i, :] = np.r_[res]

    if return_df:
        # Plain Python strings: a fixed-width NumPy string array would
        # silently truncate the longer 'risk_val-inf' label.
        cols = ['risk_val-%d' % m for m in range(1, n_M_max+1)]
        if np.isinf(M_max[-1]):
            cols[-1] = 'risk_val-inf'
        if test:
            cols += ['risk_test-%d' % m for m in range(1, M+1)]
        res_ecv = pd.concat([
            pd.DataFrame(grid_regr), pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols)
        ], axis=1)
    else:
        if test:
            res_ecv = (res_risk[:,:n_M_max], res_risk[:,n_M_max:])
        else:
            res_ecv = res_risk

    # Search only ECV columns for sizes that can actually be built (<= M);
    # capping at n_M_max keeps the test-risk columns out when M_max < M.
    n_search = min(M, n_M_max)
    risk_search = res_risk[:, :n_search]
    j, M_best = np.unravel_index(np.nanargmin(risk_search), risk_search.shape)
    M_best += 1

    if delta==0.:
        M_hat = np.inf
    else:
        # Difference between the estimates in the first two columns.
        gap = res_risk[j,0] - res_risk[j,1]
        M_hat = int(np.ceil(2 / (delta + 2/M_max[-1]*gap) * gap))
    M_best_ext = np.minimum(M_hat, M_max[-1])
    if not np.isinf(M_best_ext):
        M_best_ext = int(M_best_ext)

    # NOTE(review): when M_max[-1] is inf and delta > 0, M_best_ext is not
    # bounded by the tabulated sizes, so `res_risk[j, M_best_ext-1]` below
    # may fall outside the ECV columns -- confirm the intended lookup.
    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],

        'delta': delta,
        'M_max':M_max[-1],
        'best_n_estimators_extrapolate': M_best_ext,
        'best_score_extrapolate': res_risk[j,n_M_max-1] if np.isinf(M_best_ext)
            else res_risk[j, M_best_ext-1],
    }
    return res_ecv, info
def comp_empirical_gcv(X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=inf, corrected=True, type='full', n_jobs=-1, X_test=None, Y_test=None, **kwargs_est)
Compute the empirical GCV or CGCV estimate for a given ensemble model.
X_train, Y_train : numpy.array - The training samples.
regr : object - The base estimator to use for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The maximum ensemble size to consider.
corrected : bool, optional - If True, compute the corrected GCV estimate.
type : str, optional - The type of GCV or CGCV estimate to compute. It can be either 'full' or 'union' for naive GCV, and 'full' or 'ovlp' for CGCV.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The test samples.
_check_input : bool, optional - If True, check the input arguments.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
Returns: risk_gcv : numpy.array - The empirical GCV (or CGCV) estimate.
Expand source code
def comp_empirical_gcv(
        X_train, Y_train, regr, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=np.inf,
        corrected=True, type='full', n_jobs=-1, X_test=None, Y_test=None, _check_input=True,
        **kwargs_est,
        ):
    '''
    Fit an ensemble on the training samples and compute its empirical GCV or CGCV estimate.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The maximum ensemble size to consider.
    M0 : int, optional
        Passed on to the GCV/CGCV estimate; must not exceed `M`.
    M_max : int, optional
        The ensemble sizes the estimate is extrapolated to. When
        `_check_input` is True, an infinite value is expanded to the
        sizes 1, ..., M followed by inf, and any other scalar to
        1, ..., M_max.
    corrected : bool, optional
        If True, compute the corrected GCV estimate.
    type : str, optional
        The type of GCV or CGCV estimate to compute. It can be either
        'full' or 'union' for naive GCV, and 'full' or 'ovlp' for CGCV.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The test samples.
    _check_input : bool, optional
        If True, check the input arguments.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.

    Returns
    ----------
    regr : object
        The fitted ensemble.
    risk : numpy.array or tuple
        The extrapolated GCV/CGCV estimate `risk_gcv`; when both `X_test`
        and `Y_test` are given, the pair `(risk_gcv, risk_val)` where
        `risk_val` is the risk computed on the test samples.

    Raises
    ----------
    ValueError
        If `_check_input` is True and `M0` exceeds `M`.
    '''
    if _check_input:
        if M0 > M:
            raise ValueError('M0 must be less than or equal to M.')
        # Turn a scalar M_max into the explicit list of ensemble sizes.
        if np.isinf(M_max):
            M_max = np.append(np.arange(M)+1, np.inf)
        elif np.isscalar(M_max):
            M_max = np.arange(M_max)+1
        kwargs_regr, kwargs_ensemble, kwargs_est = check_input(
            kwargs_regr, kwargs_ensemble, kwargs_est, M)

    ensemble = fit_ensemble(regr, kwargs_regr, kwargs_ensemble)
    regr = ensemble.fit(X_train, Y_train)

    # Pick the corrected or the naive estimator; both share one call shape.
    estimate = regr.compute_cgcv_estimate if corrected else regr.compute_gcv_estimate
    raw_risk = estimate(X_train, Y_train, M0, type, n_jobs=n_jobs, **kwargs_est)
    risk_gcv = regr.extrapolate(raw_risk, M_max)

    # Without a complete test set only the GCV estimate is reported.
    if X_test is None or Y_test is None:
        return regr, risk_gcv

    risk_val = regr.compute_risk(X_test, Y_test, M, n_jobs=n_jobs, **kwargs_est)
    return regr, (risk_gcv, risk_val)
def GCV(X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={}, M=20, M0=20, M_max=inf, corrected=True, type='full', return_df=False, n_jobs=-1, X_test=None, Y_test=None, kwargs_est={}, **kwargs)
Cross-validation for ensemble models using the empirical GCV estimate. Currently, only the GCV estimates for the Ridge, Lasso, and ElasticNet are implemented.
X_train, Y_train : numpy.array - The training samples.
grid_regr, grid_ensemble - The grids of hyperparameters to search over for the base estimator and the ensemble model, respectively.
regr : object - The base estimator to use for the ensemble model.
kwargs_regr : dict, optional - Additional keyword arguments for the base estimator.
kwargs_ensemble : dict, optional - Additional keyword arguments for the ensemble model.
M : int, optional - The ensemble size to build.
corrected : bool, optional - If True, compute the corrected GCV estimate.
type : str, optional - The type of GCV or CGCV estimate to compute. It can be either 'full' or 'union' for naive GCV, and 'full' or 'ovlp' for CGCV.
return_df : bool, optional - If True, returns the results as a pandas.DataFrame object.
n_jobs : int, optional - The number of jobs to run in parallel. If -1, all CPUs are used.
X_test, Y_test : numpy.array, optional - The validation samples. They may be useful for comparing the performance of GCV with other cross-validation methods that require sample-splitting.
kwargs_est : dict, optional - Additional keyword arguments for the risk estimate.
Expand source code
def GCV(
        X_train, Y_train, regr, grid_regr={}, grid_ensemble={}, kwargs_regr={}, kwargs_ensemble={},
        M=20, M0=20, M_max=np.inf, corrected=True, type='full', return_df=False, n_jobs=-1,
        X_test=None, Y_test=None, kwargs_est={}, **kwargs
        ):
    '''
    Cross-validation for ensemble models using the empirical GCV estimate.
    Currently, only the GCV estimates for the Ridge, Lasso, and ElasticNet are implemented.

    Parameters
    ----------
    X_train, Y_train : numpy.array
        The training samples.
    regr : object
        The base estimator to use for the ensemble model.
    grid_regr, grid_ensemble : optional
        The grids of hyperparameters to search over for the base estimator
        and the ensemble model, respectively. Both are normalized by
        `process_grid` before use.
    kwargs_regr : dict, optional
        Additional keyword arguments for the base estimator.
    kwargs_ensemble : dict, optional
        Additional keyword arguments for the ensemble model.
    M : int, optional
        The ensemble size to build.
    M0 : int, optional
        Forwarded to `comp_empirical_gcv`; must not exceed `M`.
    M_max : int, optional
        The maximum ensemble size to consider. An infinite value is
        expanded to the sizes 1, ..., M followed by inf, and any other
        scalar to 1, ..., M_max.
    corrected : bool, optional
        If True, compute the corrected GCV estimate.
    type : str, optional
        The type of GCV or CGCV estimate to compute. It can be either
        'full' or 'union' for naive GCV, and 'full' or 'ovlp' for CGCV.
    return_df : bool, optional
        If True, returns the results as a pandas.DataFrame object.
    n_jobs : int, optional
        The number of jobs to run in parallel. If -1, all CPUs are used.
    X_test, Y_test : numpy.array, optional
        The validation samples. They may be useful for comparing the
        performance of GCV with other cross-validation methods that
        require sample-splitting.
    kwargs_est : dict, optional
        Additional keyword arguments for the risk estimate.
    **kwargs
        Accepted but not used by this function.

    Returns
    ----------
    res_gcv : pandas.DataFrame, numpy.array or tuple
        If `return_df` is True, a data frame with the grid parameters and
        one `risk_val-*` column per ensemble size (plus `risk_test-*`
        columns when test samples are given). Otherwise the array of GCV
        estimates, or the pair `(gcv_estimates, test_risks)` when test
        samples are given.
    info : dict
        The selected configuration, with keys `best_params_regr`,
        `best_params_ensemble`, `best_n_estimators`, `best_params_index`
        and `best_score`.

    Raises
    ----------
    ValueError
        If `M0` exceeds `M`.
    '''
    grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est = process_grid(
        grid_regr, kwargs_regr, grid_ensemble, kwargs_ensemble, kwargs_est, M)

    if M0>M:
        raise ValueError('M0 must be less than or equal to M.')
    # Turn a scalar M_max into the explicit list of ensemble sizes.
    if np.isinf(M_max):
        M_max = np.append(np.arange(M)+1, np.inf)
    elif np.isscalar(M_max):
        M_max = np.arange(M_max)+1
    n_M_max = len(M_max)

    test = X_test is not None and Y_test is not None
    # Each row holds n_M_max GCV estimates, followed by M test risks if any.
    n_res = n_M_max+M if test else n_M_max
    n_grid = len(grid_regr)
    res_risk = np.full((n_grid, n_res), np.inf)

    for i in range(n_grid):
        params_ensemble = grid_ensemble[i]
        params_regr = grid_regr[i]

        _, res = comp_empirical_gcv(
            X_train, Y_train, regr, {**kwargs_regr, **params_regr},
            {**kwargs_ensemble, **params_ensemble},
            M, M0, M_max, corrected, type, n_jobs, X_test, Y_test,
            _check_input=False, **kwargs_est
        )
        res_risk[i, :] = np.r_[res]

    if return_df:
        # Plain Python strings: a fixed-width NumPy string array would
        # silently truncate the longer 'risk_val-inf' label.
        cols = ['risk_val-%d' % m for m in range(1, n_M_max+1)]
        if np.isinf(M_max[-1]):
            cols[-1] = 'risk_val-inf'
        if test:
            cols += ['risk_test-%d' % m for m in range(1, M+1)]
        res_gcv = pd.concat([
            pd.DataFrame(grid_regr), pd.DataFrame(grid_ensemble),
            pd.DataFrame(res_risk, columns=cols)
        ], axis=1)
    else:
        if test:
            # Split at n_M_max (not M) so that every GCV column, including
            # the 'inf' one, stays in the first array -- as ECV does.
            res_gcv = (res_risk[:,:n_M_max], res_risk[:,n_M_max:])
        else:
            res_gcv = res_risk

    # Search only GCV columns for sizes that can actually be built (<= M);
    # capping at n_M_max keeps the test-risk columns out when M_max < M.
    n_search = min(M, n_M_max)
    risk_search = res_risk[:, :n_search]
    j, M_best = np.unravel_index(np.nanargmin(risk_search), risk_search.shape)
    M_best += 1

    info = {
        'best_params_regr': {**kwargs_regr, **grid_regr[j]},
        'best_params_ensemble': {**kwargs_ensemble, **grid_ensemble[j]},
        'best_n_estimators': M_best,
        'best_params_index':j,
        'best_score':res_risk[j, M_best-1],
    }
    return res_gcv, info