python - 自定义转换器和 GridSearch - 管道中的 ValueError

我正在尝试使用一些自定义转换器优化 scikit-learn 管道中的超参数,但我不断收到错误消息:

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class RollingMeanTransform(BaseEstimator, TransformerMixin):
    """Add a lagged rolling-mean column derived from one input column.

    NOTE(review): the constructor arguments are stored under names with a
    leading underscore (``_window``, ``_col``) that differ from the
    ``__init__`` parameter names. The explanation further down in this post
    identifies exactly this mismatch as the reason the estimator ends up
    with ``None`` values once GridSearchCV clones it.
    """

    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``<col>_rolling_mean`` column added.

        Presumably ``X`` is a pandas DataFrame (it is indexed by column label
        and the result of ``.shift`` / ``.rolling`` is used) -- confirm
        against the caller.
        """
        df = X.copy()
        # Shift by one row before taking the rolling mean so each row only
        # uses earlier rows; missing values in the result are replaced by 0.0.
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df

class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by its sine/cosine encoding.

    NOTE(review): the constructor arguments are stored as ``_col`` and
    ``_drop_original`` rather than under the ``__init__`` parameter names.
    The explanation further down in this post identifies this mismatch as
    the source of the ``ValueError`` raised in ``transform``.
    """

    def __init__(self, col, drop_original=True):
        self._col = col 
        self._drop_original = drop_original

    def fit(self, X, y=None):
        """Do nothing and return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``sin_<col>`` and ``cos_<col>`` added."""
        X = X.copy()
        # The number of distinct values in the column is used as the period
        # of the encoding; it is recomputed from whatever data is passed in.
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            # errors='ignore' keeps this from raising if the column is absent.
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X

huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

# Parameter grid for the search. The ``clf__`` prefix routes each entry to
# the pipeline step named 'clf' (the HuberRegressor).
huber_grid = {'clf__alpha': huber_alpha,
              'clf__epsilon': huber_epsilon,
              'clf__max_iter': huber_max_iter}

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')),
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)])

grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))
# This is the call that raises the ValueError shown in the traceback below.
grid.fit(X_train, y_train)


ValueError                                Traceback (most recent call last)
<ipython-input-14-3949096c802a> in <module>()
----> 1 grid.fit(X_train, y_train)

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/ in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    641         # if one choose to see train score, "out" will contain train score info

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    334     def get(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in __call__(self)
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in <listcomp>(.0)
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    133     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/ in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
    460     except Exception as e:

~/anaconda3/lib/python3.6/site-packages/sklearn/ in fit(self, X, y, **fit_params)
    246             This estimator
    247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
    250             self._final_estimator.fit(Xt, y, **fit_params)

~/anaconda3/lib/python3.6/site-packages/sklearn/ in _fit(self, X, y, **fit_params)
    211                 Xt, fitted_transformer = fit_transform_one_cached(
    212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
    214                 # Replace the transformer of the step with the fitted
    215                 # transformer. This is necessary when loading the transformer

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/ in __call__(self, *args, **kwargs)
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    364     def call_and_shelve(self, *args, **kwargs):

~/anaconda3/lib/python3.6/site-packages/sklearn/ in _fit_transform_one(transformer, weight, X, y, **fit_params)
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/lib/python3.6/site-packages/sklearn/ in fit_transform(self, X, y, **fit_params)
    518         else:
    519             # fit method of arity 2 (supervised transformation)
--> 520             return self.fit(X, y, **fit_params).transform(X)

~/my_project/ in transform(self, X)
    126     def transform(self, X):
    127         X = X.copy()
--> 128         unique_vals = float(len(X[self._col].unique()))
    129         X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
    130         X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)

~/anaconda3/lib/python3.6/site-packages/pandas/core/ in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2141     def _getitem_column(self, key):

~/anaconda3/lib/python3.6/site-packages/pandas/core/ in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2148         # duplicate columns & possible reduce dimensionality

~/anaconda3/lib/python3.6/site-packages/pandas/core/ in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

~/anaconda3/lib/python3.6/site-packages/pandas/core/ in get(self, item, fastpath)
   3850                         loc = indexer.item()
   3851                     else:
-> 3852                         raise ValueError("cannot label index with a null key")
   3854             return self.iget(loc, fastpath=fastpath)

ValueError: cannot label index with a null key



 regression_pipeline = Pipeline([('mean', RollingMeanTransform('my_other_col')),
                                 ('encoding', TimeEncoding('my_col')),
                                 ('select', Treshold()),
                                 ('scale', Scale()),
                                 ('clf', huber)])

我得到了同样的错误,但这次调用了 mean 转换器。


from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

class RollingMeanTransform(BaseEstimator, TransformerMixin):
    """Add a lagged rolling-mean column derived from one input column.

    NOTE(review): the constructor arguments are stored under names with a
    leading underscore (``_window``, ``_col``) that differ from the
    ``__init__`` parameter names. The explanation further down in this post
    identifies exactly this mismatch as the reason the estimator ends up
    with ``None`` values once GridSearchCV clones it.
    """

    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``<col>_rolling_mean`` column added.

        Presumably ``X`` is a pandas DataFrame (it is indexed by column label
        and the result of ``.shift`` / ``.rolling`` is used) -- confirm
        against the caller.
        """
        df = X.copy()
        # Shift by one row before taking the rolling mean so each row only
        # uses earlier rows; missing values in the result are replaced by 0.0.
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df

class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by its sine/cosine encoding.

    NOTE(review): the constructor arguments are stored as ``_col`` and
    ``_drop_original`` rather than under the ``__init__`` parameter names.
    The explanation further down in this post identifies this mismatch as
    the source of the ``ValueError`` raised in ``transform``.
    """

    def __init__(self, col, drop_original=True):
        self._col = col 
        self._drop_original = drop_original

    def fit(self, X, y=None):
        """Do nothing and return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``sin_<col>`` and ``cos_<col>`` added."""
        X = X.copy()
        # The number of distinct values in the column is used as the period
        # of the encoding; it is recomputed from whatever data is passed in.
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            # errors='ignore' keeps this from raising if the column is absent.
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X

class Treshold(BaseEstimator, TransformerMixin):
    """Remove constant-valued features while keeping the data as a DataFrame.

    ``fit`` records the labels of every column that holds at least two
    distinct values; ``transform`` selects exactly those columns.
    """

    def __init__(self):
        self.to_keep = list()

    def fit(self, X, y=None):
        """Record which columns of ``X`` are not constant.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; both ``X.columns`` and ``X.values`` are read.
        y : ignored

        Returns
        -------
        self
        """
        self.colname_original = X.columns

        # Iterating a DataFrame yields its column labels, so ``col`` is the
        # label and ``i`` the matching position in ``X.values``.
        self.to_keep = [
            col for i, col in enumerate(X)
            if len(np.unique(X.values[:, i])) >= 2
        ]

        return self

    def transform(self, X, copy=None):
        """Return the columns of ``X`` selected during ``fit``.

        ``copy`` is accepted for signature compatibility and is not used.
        """
        return X[self.to_keep]

class Scale(BaseEstimator, TransformerMixin):
    """Standardize non-binary features while keeping the data as a DataFrame.

    Columns with at most two distinct values (as observed in ``fit``) are
    treated as binary and passed through unchanged; all other columns are
    scaled with a ``StandardScaler``.

    Parameters
    ----------
    copy, with_mean, with_std : bool, default=True
        Forwarded to ``StandardScaler``.
    """

    def __init__(self, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # get_params() / clone() can read it back when the pipeline is
        # cloned during model selection.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)

        self.bin_vars_index = list()
        self.cont_vars_index = list()

        self.colnames_original = list()

    def fit(self, X, y=None):
        """Split columns into binary / continuous and fit the scaler.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; ``X.columns``, ``X.shape`` and ``X.values`` are read.
        y : ignored

        Returns
        -------
        self
        """
        self.bin_vars_index = list()
        self.cont_vars_index = list()

        self.colnames_original = X.columns

        for i in range(X.shape[1]):
            if len(np.unique(X.values[:, i])) <= 2:
                self.bin_vars_index.append(i)
            else:
                self.cont_vars_index.append(i)

        # Rebuild the scaler from the current parameters so that values
        # changed through set_params() after construction take effect.
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean,
                                     with_std=self.with_std)
        self.scaler.fit(X.values[:, self.cont_vars_index])
        return self

    def transform(self, X, copy=None):
        """Scale the continuous columns and return a DataFrame.

        The result has the column order recorded in ``fit`` and the index
        of ``X``. ``copy`` is forwarded to ``StandardScaler.transform``.
        """
        # Pass ``copy`` by keyword: older StandardScaler.transform signatures
        # have another positional parameter before it.
        X_tail = self.scaler.transform(X.values[:, self.cont_vars_index], copy=copy)
        res = np.concatenate((X.values[:, self.bin_vars_index], X_tail), axis=1)

        # ``res`` holds the binary columns first and the scaled ones after;
        # label them in that order, then restore the original column order.
        colnames_res = np.array(
            list(self.colnames_original[self.bin_vars_index]) + list(self.colnames_original[self.cont_vars_index]))
        assert len(colnames_res) == len(self.colnames_original)
        res = pd.DataFrame(data=res, columns=colnames_res)
        return res[[str(el) for el in self.colnames_original]].set_index(X.index)

huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

# Parameter grid for the search. The ``clf__`` prefix routes each entry to
# the pipeline step named 'clf' (the HuberRegressor).
huber_grid = {'clf__alpha': huber_alpha,
              'clf__epsilon': huber_epsilon,
              'clf__max_iter': huber_max_iter}

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')),
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)])

grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))

# Small random data set, just large enough to reproduce the error.
X = pd.DataFrame(np.random.randint(low=0, high=10, size=(20, 2)), columns=['my_col', 'my_other_col'])

y = pd.Series(np.random.randint(low=0, high=10, size=(20,)))

grid.fit(X, y)


您会看到 GridSearchCV(以及 scikit-learn 中的大多数交叉验证工具)会克隆所提供的估计器(estimator)来执行网格搜索。

在克隆时,它们会使用您从 BaseEstimator 类继承的 get_params() 和 set_params() 方法。而 get_params() 是根据您声明的 __init__() 方法的签名来获取参数名的。

这是 get_params() 获取参数名的相关源码:

    init_signature = signature(init)
    # Consider the constructor parameters excluding 'self'
    parameters = [p for p in init_signature.parameters.values()
                  if p.name != 'self' and p.kind != p.VAR_KEYWORD]


现在要获取这些参数的值,使用的是下面这段代码:

    for key in self._get_param_names():
        value = getattr(self, key, None)


col = None
drop_original = None

也就是说,get_params() 查找的是名为 col 和 drop_original 的属性,而不是您使用的带有前导下划线的 _col 和 _drop_original。这两个值都是 None,因为您的对象没有任何具有这些名称的属性。

接着,clone() 会用这些值为 None 的参数来实例化克隆对象:

new_object = klass(**new_object_params)

然后这些 None 值会被赋给您的 _col 和 _drop_original。这就是错误的根源。

这一点已记录在 scikit-learn 的开发者指南(developer guidelines)中:

The arguments accepted by `__init__` should all be keyword arguments with a default value. In other words, a user should be able to instantiate an estimator without passing any arguments to it. The arguments should all correspond to hyperparameters describing the model or the optimisation problem the estimator tries to solve.

In addition, every keyword argument accepted by `__init__` should correspond to an attribute on the instance. Scikit-learn relies on this to find the relevant attributes to set on an estimator when doing model selection.

因此,建议的解决方法是去掉属性名称中的前导下划线(使 __init__ 的参数名与 self 上的属性名保持一致):

class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by its sine/cosine encoding.

    Parameters
    ----------
    col : label
        Column of the input DataFrame to encode.
    drop_original : bool, default=True
        Whether to remove ``col`` from the output after encoding it.
    """

    # Changed the names from _col to col
    def __init__(self, col, drop_original=True):
        # Attribute names match the __init__ parameter names so that
        # get_params() / clone() can read the values back.
        self.col = col
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Defined because ``TransformerMixin.fit_transform`` and ``Pipeline``
        call ``fit`` before ``transform``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``sin_<col>`` and ``cos_<col>`` added."""
        X = X.copy()

        # Updated the names to be used
        # The number of distinct values in the column is used as the period
        # of the encoding; it is recomputed from whatever data is passed in.
        unique_vals = float(len(X[self.col].unique()))
        X['sin_{}'.format(self.col)] = np.sin(2 * np.pi * X[self.col] / unique_vals)
        X['cos_{}'.format(self.col)] = np.cos(2 * np.pi * X[self.col] / unique_vals)
        if self.drop_original:
            # errors='ignore' keeps this from raising if the column is absent.
            X.drop([self.col], axis=1, inplace=True, errors='ignore')
        return X


现在,如果您在使用属性的前导下划线方面有一些限制(可能尝试将它们设为私有(private)或类似的东西),您的第二个选择是覆盖 set_params() 方法以显式设置参数。


