Learning selection with inference in the full modelΒΆ
This is the same example as considered in Liu et al. though we do not consider the special analysis in that paper. We let the computer guide us in correcting for selection.
[1]:
import functools
import numpy as np
from scipy.stats import norm as ndist
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import regreg.api as rr
from selectinf.tests.instance import gaussian_instance
from selectinf.learning.utils import full_model_inference, pivot_plot
from selectinf.learning.core import normal_sampler
from selectinf.learning.Rfitters import logit_fit
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):
# description of statistical problem
X, y, truth = gaussian_instance(n=n,
p=p,
s=s,
equicorrelated=False,
rho=0.5,
sigma=sigma,
signal=signal,
random_signs=True,
scale=False)[:3]
dispersion = sigma**2
S = X.T.dot(y)
covS = dispersion * X.T.dot(X)
sampler = normal_sampler(S, covS)
def meta_algorithm(XTX, XTXi, lam, sampler):
p = XTX.shape[0]
success = np.zeros(p)
loss = rr.quadratic_loss((p,), Q=XTX)
pen = rr.l1norm(p, lagrange=lam)
scale = 0.
noisy_S = sampler(scale=scale)
loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
problem = rr.simple_problem(loss, pen)
soln = problem.solve(max_its=100, tol=1.e-10)
success += soln != 0
return set(np.nonzero(success)[0])
XTX = X.T.dot(X)
XTXi = np.linalg.inv(XTX)
resid = y - X.dot(XTXi.dot(X.T.dot(y)))
dispersion = np.linalg.norm(resid)**2 / (n-p)
lam = 4. * np.sqrt(n)
selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)
# run selection algorithm
return full_model_inference(X,
y,
truth,
selection_algorithm,
sampler,
success_params=(1, 1),
B=B,
fit_probability=logit_fit,
fit_args={'df':20},
how_many=1)
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
from numpy.core.umath_tests import inner1d
Using TensorFlow backend.
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:455: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint8 = np.dtype([("qint8", np.int8, 1)])
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:456: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:457: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint16 = np.dtype([("qint16", np.int16, 1)])
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:458: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:459: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint32 = np.dtype([("qint32", np.int32, 1)])
/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:462: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
np_resource = np.dtype([("resource", np.ubyte, 1)])
R[write to console]: Loaded gbm 2.1.5
R[write to console]: randomForest 4.6-14
R[write to console]: Type rfNews() to see new features/changes/bug fixes.
[ ]:
for i in range(50):
df = simulate()
csvfile = 'lasso_exact.csv'
outbase = csvfile[:-4]
if df is not None and i > 0:
try: # concatenate to disk
df = pd.concat([df, pd.read_csv(csvfile)])
except FileNotFoundError:
pass
df.to_csv(csvfile, index=False)
if len(df['pivot']) > 0:
pivot_ax = pivot_plot(df, outbase)
prob(select): [0.762]
/Users/jonathantaylor/git-repos/selectinf/selectinf/distributions/discrete_family.py:86: RuntimeWarning: divide by zero encountered in log
self._lw = np.array([np.log(v) for v in xw[:,1]])
[ ]: