In [2]:
import pandas as pd
import matplotlib.pyplot as plt
In [3]:
dataset = '../housing/housing.csv'
In [4]:
housing = pd.read_csv(dataset)

A Glance at the Dataset

  • What you can conclude is that:
    • The median income has been scaled and capped
    • The housing median age and the median house value are also capped; possible solutions:
      • collect proper labels for the capped districts
      • remove these districts from the training set (see the sketch after this list)
    • The attributes have very different scales
    • Many histograms are tail-heavy
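One way to act on the removal option, as a minimal sketch (the cap value and the names cap and housing_uncapped are our assumptions, taking the cap to be the dataset's maximum of 500001.0):

# Hypothetical sketch: drop districts whose median_house_value hit the cap
cap = housing['median_house_value'].max()               # 500001.0 in this dataset
housing_uncapped = housing[housing['median_house_value'] < cap]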
In [5]:
housing.head()
Out[5]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [6]:
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [7]:
housing.ocean_proximity.value_counts()
Out[7]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [8]:
housing.describe()
Out[8]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
In [9]:
housing.hist(bins=50,figsize=(20,15))
Out[9]:
[3×3 grid of AxesSubplot objects: one 50-bin histogram per numeric attribute]

Create a Test Set

In [10]:
import numpy as np
In [11]:
def split_train_test(data,test_ratio):
    # note: a fresh permutation is drawn on every call, so repeated runs
    # produce different splits; seed the RNG (or use train_test_split with
    # random_state, as below) for reproducibility
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
In [12]:
train_set,test_set = split_train_test(housing,0.2)
print(len(train_set),'train +',len(test_set),'test')
16512 train + 4128 test
In [13]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2,random_state=42)

To make the test set representative of the population

  • Randomly selecting test instances may introduce sampling bias
  • You can consult domain experts to make sure the test set is representative of the population
  • The example below uses median income as an important attribute whose distribution should be preserved in the test set
In [14]:
housing.median_income.hist()
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b62f0198>
In [15]:
housing['income_cat'] = np.ceil(housing.median_income/1.5)          # bucket incomes into discrete categories
housing['income_cat'].where(housing.income_cat<5,5,inplace=True)    # merge all categories above 5 into category 5
In [16]:
housing['income_cat'].value_counts()/len(housing) # proportion of each income category
Out[16]:
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64
In [17]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(housing,housing.income_cat):
    strat_train_set = housing.loc[train_index]
    strat_test_set  = housing.loc[test_index]
In [18]:
strat_test_set.income_cat.value_counts()/len(strat_test_set) # very close to the full dataset; see the comparison sketch below
Out[18]:
3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64
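To see how much the stratified split improves on a purely random one, a hedged comparison sketch (the compare frame and its column names are illustrative, not from the original notebook):

# Hypothetical comparison of income_cat proportions: overall vs. stratified vs. random
_, random_test_set = train_test_split(housing, test_size=0.2, random_state=42)
compare = pd.DataFrame({
    'overall': housing['income_cat'].value_counts(normalize=True),
    'stratified': strat_test_set['income_cat'].value_counts(normalize=True),
    'random': random_test_set['income_cat'].value_counts(normalize=True),
}).sort_index()
compare['strat_error_%'] = 100 * compare['stratified'] / compare['overall'] - 100
compare['random_error_%'] = 100 * compare['random'] / compare['overall'] - 100
compare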
In [19]:
for set_ in (strat_train_set,strat_test_set):   # set_ avoids shadowing the built-in set
    set_.drop(['income_cat'],axis=1,inplace=True)

Discover and Visualize the Data to Gain Insights

Visualization

In [20]:
housing = strat_train_set.copy()
In [21]:
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b65a3748>
In [22]:
housing.plot(kind="scatter",x='longitude',y='latitude',alpha=0.4,
            s=housing['population']/100,label='population',
            c='median_house_value', cmap=plt.get_cmap("jet"),colorbar=True,)
plt.legend()
Out[22]:
<matplotlib.legend.Legend at 0x156b673b080>

Looking for Correlations

In [23]:
corr_matrix = housing.corr()
In [24]:
corr_matrix.median_house_value
Out[24]:
longitude            -0.047432
latitude             -0.142724
housing_median_age    0.114110
total_rooms           0.135097
total_bedrooms        0.047689
population           -0.026920
households            0.064506
median_income         0.687160
median_house_value    1.000000
Name: median_house_value, dtype: float64
In [25]:
from pandas.plotting import scatter_matrix
attributes = ['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))
Out[25]:
[4×4 grid of AxesSubplot objects: scatter matrix of the four selected attributes]
In [26]:
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b672f748>
  • You can clearly see horizontal lines in the plot above, an artifact of the capped values (most visibly at $500,000)
  • The correlation between median income and house price is very apparent

Experimenting with Attribute Combinations

In [27]:
housing['rooms_per_household'] = housing.total_rooms/housing.households
housing['bedrooms_per_room'] = housing.total_bedrooms/housing.total_rooms
housing['population_per_household'] = housing.population/housing.households
In [28]:
corr_matrix = housing.corr()
corr_matrix.median_house_value.sort_values(ascending=False)
Out[28]:
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

Prepare the Data for Machine Learning

In [29]:
housing = strat_train_set.drop('median_house_value',axis=1)
housing_labels = strat_train_set.median_house_value.copy()

Data Cleaning

  • Get rid of the corresponding districts
  • Get rid of the whole attribute
  • Set the missing values to some value (zero, the mean, the median, etc.)
In [30]:
print(len(housing))
print(housing.total_bedrooms.value_counts().sum())   # counts non-null entries only
print('This indicates that there are NA values in the dataset')
#housing.dropna(subset=['total_bedrooms'])
16512
16354
This indicates that there are NA values in the dataset
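An equivalent but more direct check, as a small sketch of our own:

# Hypothetical direct check for missing values
housing['total_bedrooms'].isnull().sum()   # 158 missing rows (16512 - 16354)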
In [31]:
print('Three ways to process missing data:')
housing.dropna(subset=['total_bedrooms'])    # option 1: drop the districts with missing values
housing.drop('total_bedrooms',axis=1)        # option 2: drop the whole attribute
median = housing['total_bedrooms'].median()  # option 3: fill with the median
housing.total_bedrooms.fillna(median);
Three ways to process missing data:
In [32]:
print('NA-value imputation with scikit-learn')
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('ocean_proximity',axis=1)   # the median only makes sense for numeric attributes
NA-value imputation with scikit-learn
In [33]:
imputer.fit(housing_num)

# the following two statements are equivalent
housing_num.median().values
imputer.statistics_
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X,columns=housing_num.columns)

Handling text and categorical attributes

In [34]:
from sklearn.preprocessing import LabelEncoder
In [35]:
encoder = LabelEncoder()
housing_cat = housing.ocean_proximity
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
encoder.classes_
Out[35]:
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)
In [36]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
If you want the future behaviour and silence this warning, you can specify "categories='auto'".
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  warnings.warn(msg, FutureWarning)
Out[36]:
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>
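As the warning itself suggests, newer scikit-learn versions (0.20+) can one-hot-encode the string column directly, without the LabelEncoder detour; a minimal sketch (cat_encoder is our name):

# Hypothetical: fit OneHotEncoder straight on the text attribute
cat_encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = cat_encoder.fit_transform(housing[['ocean_proximity']])
cat_encoder.categories_   # the five ocean_proximity categories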
In [37]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output=False)
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
Out[37]:
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

Custom Transformers

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6   # column indices of total_rooms, total_bedrooms, population, households
In [39]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                        bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
In [40]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
In [41]:
housing_extra_attribs = attr_adder.transform(housing.values)

Feature Scaling

There are two common ways to get all attributes onto the same scale (a short sketch of both follows)

MinMaxScaler

  • produces values in the range 0 to 1:
  • $value' = \frac{value - \min(attribute)}{\max(attribute) - \min(attribute)}$

Standardization

  • $value' = \frac{value - \mathrm{mean}(attribute)}{\mathrm{std}(attribute)}$
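A minimal sketch of both scalers applied to the imputed numeric frame housing_tr from above (the scaler variable names are ours):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# MinMaxScaler: rescales each attribute to the range [0, 1]
minmax_scaler = MinMaxScaler()
housing_num_minmax = minmax_scaler.fit_transform(housing_tr)

# StandardScaler: zero mean, unit variance; less sensitive to outliers than min-max scaling
std_scaler = StandardScaler()
housing_num_std = std_scaler.fit_transform(housing_tr)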

Transformation Pipelines

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])
In [43]:
housing_num_tr = num_pipeline.fit_transform(housing_num)
In [60]:
from sklearn.base import BaseEstimator,TransformerMixin
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.attribute_names].values
In [73]:
from sklearn.pipeline import FeatureUnion
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
num_pipeline = Pipeline(
    [
        ('selector',DataFrameSelector(num_attribs)),
        ('imputer',SimpleImputer(strategy='median')),
        ('attribs_adder',CombinedAttributesAdder()),
        ('std_scaler',StandardScaler()),
    ]
)
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('one_hot_encoder',OneHotEncoder(sparse=False)),   # OneHotEncoder works on the string column directly
])
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[73]:
(16512, 16)
In [74]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[74]:
(16512, 16)

Select and Train a Model

In [76]:
housing.head()
Out[76]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 <1H OCEAN
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 <1H OCEAN
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 NEAR OCEAN
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 INLAND
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 <1H OCEAN

Linear Regression

In [81]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)
Out[81]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [82]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
In [83]:
print(some_labels)
17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
Name: median_house_value, dtype: float64
In [88]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
Out[88]:
68628.19819848923
  • A typical prediction error of about $68,628 is poor given that most median house values lie between $120,000 and $265,000: the model underfits

Decision Tree Regression

In [95]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
Out[95]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [98]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
Out[98]:
0.0
  • An RMSE of 0.0 on the training set means the tree has badly overfit the data; cross-validation below gives a more honest estimate

Cross Validation

In [104]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[68930.44475821 66558.10126398 71222.37470024 68891.55140182
 70653.472854   75531.04198202 72407.55126445 71764.95822723
 77052.17560379 69702.50850308] 71271.41805588322 2988.326447170996
In [108]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-lin_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067] 69052.46136345083 2731.674001798349

Random Forest Regressor

In [114]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels,housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[114]:
22267.366749826804
In [115]:
fore_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-fore_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[49946.50101164 50484.91805037 53208.39683726 53820.04129717
 51374.33312308 55476.13389189 51095.1559377  50385.37296508
 55664.65592003 53147.26475593] 52460.27737901548 1995.3245485192294

Saving the Model

In [116]:
from sklearn.externals import joblib
joblib.dump(forest_reg,'my_model.pkl')   # save the trained random forest to disk

my_model_loaded = joblib.load('my_model.pkl')

Fine-tuning the model!

In [ ]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
In [ ]:
grid_search.best_params_
In [ ]:
grid_search.best_estimator_
In [ ]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
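A small follow-up sketch of our own, assuming the grid search above has finished, to express the best cross-validated score as an RMSE:

# Hypothetical: RMSE of the best hyperparameter combination
np.sqrt(-grid_search.best_score_)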