From a9d896c271f40a83be906fbdb94f11650956a1f7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 13 Sep 2021 17:13:04 -0700 Subject: [PATCH 1/3] allow modelmatrices with invalid categorical variables when necessary --- pygam/pygam.py | 15 +++++++++++---- pygam/utils.py | 40 ++++++++++++++++++++++------------------ 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index ae8c8d64..a286be7e 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -433,7 +433,8 @@ def predict(self, X): """ return self.predict_mu(X) - def _modelmat(self, X, term=-1): + def _modelmat(self, X, term=-1, + check_categorical=True): """ Builds a model matrix, B, out of the spline basis for each feature @@ -454,7 +455,8 @@ def _modelmat(self, X, term=-1): """ X = check_X(X, n_feats=self.statistics_['m_features'], edge_knots=self.edge_knots_, dtypes=self.dtype, - features=self.feature, verbose=self.verbose) + features=self.feature, verbose=self.verbose, + check_categorical=check_categorical) return self.terms.build_columns(X, term=term) @@ -1553,6 +1555,10 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, if X is None: X = self.generate_X_grid(term=term, meshgrid=meshgrid) + # check categorical features if the variable + # is categorical + check_categorical = self.dtype[term] == 'categorical' + if meshgrid: if not isinstance(X, tuple): raise ValueError('X must be a tuple of grids if `meshgrid=True`, '\ @@ -1562,9 +1568,10 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, X = self._flatten_mesh(X, term=term) X = check_X(X, n_feats=self.statistics_['m_features'], edge_knots=self.edge_knots_, dtypes=self.dtype, - features=self.feature, verbose=self.verbose) + features=self.feature, verbose=self.verbose, + check_categorical=check_categorical) - modelmat = self._modelmat(X, term=term) + modelmat = self._modelmat(X, term=term, check_categorical=check_categorical) pdep = self._linear_predictor(modelmat=modelmat, term=term) out = [pdep] diff --git a/pygam/utils.py b/pygam/utils.py index ff5ea004..cb34505a 100644 --- a/pygam/utils.py +++ b/pygam/utils.py @@ -230,7 +230,8 @@ def check_y(y, link, dist, min_samples=1, verbose=True): return y def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, - features=None, verbose=True): + features=None, verbose=True, + check_categorical=True): """ tool to ensure that X: - is 2 dimensional @@ -253,6 +254,8 @@ def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, which features are considered by the model verbose : bool, default: True whether to print warnings + check_categorical : bool, default: True + whether to check categorical features Returns ------- @@ -284,23 +287,24 @@ def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, n = len(edge_knots) // 2 edge_knots = [(edge_knots.pop(), edge_knots.pop()) for _ in range(n)] - # check each categorical term - for i, ek in enumerate(edge_knots): - dt = dtypes[i] - feature = features[i] - x = X[:, feature] - - if dt == 'categorical': - min_ = ek[0] - max_ = ek[-1] - if (np.unique(x) < min_).any() or \ - (np.unique(x) > max_).any(): - min_ += .5 - max_ -= 0.5 - raise ValueError('X data is out of domain for categorical '\ - 'feature {}. Expected data on [{}, {}], '\ - 'but found data on [{}, {}]'\ - .format(i, min_, max_, x.min(), x.max())) + if check_categorical: + # check each categorical term + for i, ek in enumerate(edge_knots): + dt = dtypes[i] + feature = features[i] + x = X[:, feature] + + if dt == 'categorical': + min_ = ek[0] + max_ = ek[-1] + if (np.unique(x) < min_).any() or \ + (np.unique(x) > max_).any(): + min_ += .5 + max_ -= 0.5 + raise ValueError('X data is out of domain for categorical '\ + 'feature {}. Expected data on [{}, {}], '\ + 'but found data on [{}, {}]'\ + .format(i, min_, max_, x.min(), x.max())) return X From 8bf17ad5b3af39864a15540d422fa1fbabd4886b Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 13 Sep 2021 22:55:03 -0700 Subject: [PATCH 2/3] instead of checking categorical, made a private method to create an X with constant rows instead of 0s --- pygam/pygam.py | 38 +++++++++++++++++++++++--------------- pygam/utils.py | 40 ++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index a286be7e..bcb43f0a 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -433,8 +433,7 @@ def predict(self, X): """ return self.predict_mu(X) - def _modelmat(self, X, term=-1, - check_categorical=True): + def _modelmat(self, X, term=-1): """ Builds a model matrix, B, out of the spline basis for each feature @@ -455,8 +454,7 @@ def _modelmat(self, X, term=-1, """ X = check_X(X, n_feats=self.statistics_['m_features'], edge_knots=self.edge_knots_, dtypes=self.dtype, - features=self.feature, verbose=self.verbose, - check_categorical=check_categorical) + features=self.feature, verbose=self.verbose) return self.terms.build_columns(X, term=term) @@ -1392,7 +1390,7 @@ def _flatten_mesh(self, Xs, term): else: terms = [self.terms[term]] - X = np.zeros((n, self.statistics_['m_features'])) + X = self._validX(n) for term_, x in zip(terms, Xs): X[:, term_.feature] = x.ravel() return X @@ -1469,7 +1467,7 @@ def generate_X_grid(self, term, n=100, meshgrid=False): return (x,) # fill in feature matrix with only relevant features for this term - X = np.zeros((n, self.statistics_['m_features'])) + X = self._validX(n) X[:, self.terms[term].feature] = x if getattr(self.terms[term], 'by', None) is not None: X[:, self.terms[term].by] = 1. @@ -1555,10 +1553,6 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, if X is None: X = self.generate_X_grid(term=term, meshgrid=meshgrid) - # check categorical features if the variable - # is categorical - check_categorical = self.dtype[term] == 'categorical' - if meshgrid: if not isinstance(X, tuple): raise ValueError('X must be a tuple of grids if `meshgrid=True`, '\ @@ -1568,12 +1562,12 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, X = self._flatten_mesh(X, term=term) X = check_X(X, n_feats=self.statistics_['m_features'], edge_knots=self.edge_knots_, dtypes=self.dtype, - features=self.feature, verbose=self.verbose, - check_categorical=check_categorical) + features=self.feature, verbose=self.verbose) - modelmat = self._modelmat(X, term=term, check_categorical=check_categorical) + modelmat = self._modelmat(X, term=term) pdep = self._linear_predictor(modelmat=modelmat, term=term) - out = [pdep] + pdep_mean = np.mean(pdep) + out = [pdep - pdep_mean] compute_quantiles = (width is not None) or (quantiles is not None) if compute_quantiles: @@ -1584,7 +1578,7 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, term=term, xform=False) - out += [conf_intervals] + out += [conf_intervals - pdep_mean] if meshgrid: for i, array in enumerate(out): @@ -2220,6 +2214,20 @@ def _simulate_coef_from_bootstraps( return coef_draws + def _validX(self, n_sample): + """ + Make an X matrix with constant rows with values given + by the center of each term's edge knots. + """ + validX = np.ones((n_sample, self.statistics_['m_features']), + float) + for col, term in enumerate(self.terms): + if term.isintercept: + continue + else: + validX[:,col] = np.mean(term.edge_knots_) + + return validX class LinearGAM(GAM): """Linear GAM diff --git a/pygam/utils.py b/pygam/utils.py index cb34505a..ff5ea004 100644 --- a/pygam/utils.py +++ b/pygam/utils.py @@ -230,8 +230,7 @@ def check_y(y, link, dist, min_samples=1, verbose=True): return y def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, - features=None, verbose=True, - check_categorical=True): + features=None, verbose=True): """ tool to ensure that X: - is 2 dimensional @@ -254,8 +253,6 @@ def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, which features are considered by the model verbose : bool, default: True whether to print warnings - check_categorical : bool, default: True - whether to check categorical features Returns ------- @@ -287,24 +284,23 @@ def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None, n = len(edge_knots) // 2 edge_knots = [(edge_knots.pop(), edge_knots.pop()) for _ in range(n)] - if check_categorical: - # check each categorical term - for i, ek in enumerate(edge_knots): - dt = dtypes[i] - feature = features[i] - x = X[:, feature] - - if dt == 'categorical': - min_ = ek[0] - max_ = ek[-1] - if (np.unique(x) < min_).any() or \ - (np.unique(x) > max_).any(): - min_ += .5 - max_ -= 0.5 - raise ValueError('X data is out of domain for categorical '\ - 'feature {}. Expected data on [{}, {}], '\ - 'but found data on [{}, {}]'\ - .format(i, min_, max_, x.min(), x.max())) + # check each categorical term + for i, ek in enumerate(edge_knots): + dt = dtypes[i] + feature = features[i] + x = X[:, feature] + + if dt == 'categorical': + min_ = ek[0] + max_ = ek[-1] + if (np.unique(x) < min_).any() or \ + (np.unique(x) > max_).any(): + min_ += .5 + max_ -= 0.5 + raise ValueError('X data is out of domain for categorical '\ + 'feature {}. Expected data on [{}, {}], '\ + 'but found data on [{}, {}]'\ + .format(i, min_, max_, x.min(), x.max())) return X From 0ed182f0f61a5cb29935fb69fafa2d033c3aea88 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 15 Sep 2021 13:19:43 -0700 Subject: [PATCH 3/3] undoing centering --- pygam/pygam.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pygam/pygam.py b/pygam/pygam.py index bcb43f0a..c0a4e96e 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -1566,9 +1566,8 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, modelmat = self._modelmat(X, term=term) pdep = self._linear_predictor(modelmat=modelmat, term=term) - pdep_mean = np.mean(pdep) - out = [pdep - pdep_mean] - + out = [pdep] + compute_quantiles = (width is not None) or (quantiles is not None) if compute_quantiles: conf_intervals = self._get_quantiles(X, width=width, @@ -1578,7 +1577,7 @@ def partial_dependence(self, term, X=None, width=None, quantiles=None, term=term, xform=False) - out += [conf_intervals - pdep_mean] + out += [conf_intervals] if meshgrid: for i, array in enumerate(out):