In [2]:
import pandas as pd
import matplotlib.pyplot as plt
In [3]:
dataset = '../housing/housing.csv'
In [4]:
housing = pd.read_csv(dataset)

A Glance at the Dataset

  • What you can conclude is that:
    • The median income has been scaled and capped
    • The housing median age and the median house value are also capped; possible solutions:
      • collect proper labels for the capped districts
      • remove these districts from the training set (see the sketch after this list)
    • The attributes have very different scales
    • Many histograms are tail-heavy
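One way to act on the removal option, as a minimal sketch (the cap value and the names cap and housing_uncapped are our assumptions, taking the cap to be the dataset's maximum of 500001.0):

# Hypothetical sketch: drop districts whose median_house_value hit the cap
cap = housing['median_house_value'].max()               # 500001.0 in this dataset
housing_uncapped = housing[housing['median_house_value'] < cap]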
In [5]:
housing.head()
Out[5]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [6]:
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [7]:
housing.ocean_proximity.value_counts()
Out[7]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [8]:
housing.describe()
Out[8]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
In [9]:
housing.hist(bins=50,figsize=(20,15))
Out[9]:
[3×3 grid of AxesSubplot objects: one 50-bin histogram per numeric attribute]

Create a Test Set

In [10]:
import numpy as np
In [11]:
def split_train_test(data,test_ratio):
    # note: a fresh permutation is drawn on every call, so repeated runs
    # produce different splits; seed the RNG (or use train_test_split with
    # random_state, as below) for reproducibility
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
In [12]:
train_set,test_set = split_train_test(housing,0.2)
print(len(train_set),'train +',len(test_set),'test')
16512 train + 4128 test
In [13]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2,random_state=42)

To make the test set representative of the population

  • Randomly selecting test instances may introduce sampling bias
  • You can consult domain experts to make sure the test set is representative of the population
  • The example below uses median income as an important attribute whose distribution should be preserved in the test set
In [14]:
housing.median_income.hist()
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b62f0198>
In [15]:
housing['income_cat'] = np.ceil(housing.median_income/1.5)          # bucket incomes into discrete categories
housing['income_cat'].where(housing.income_cat<5,5,inplace=True)    # merge all categories above 5 into category 5
In [16]:
housing['income_cat'].value_counts()/len(housing) # proportion of each income category
Out[16]:
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64
In [17]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(housing,housing.income_cat):
    strat_train_set = housing.loc[train_index]
    strat_test_set  = housing.loc[test_index]
In [18]:
strat_test_set.income_cat.value_counts()/len(strat_test_set) # very close to the full dataset; see the comparison sketch below
Out[18]:
3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64
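To see how much the stratified split improves on a purely random one, a hedged comparison sketch (the compare frame and its column names are illustrative, not from the original notebook):

# Hypothetical comparison of income_cat proportions: overall vs. stratified vs. random
_, random_test_set = train_test_split(housing, test_size=0.2, random_state=42)
compare = pd.DataFrame({
    'overall': housing['income_cat'].value_counts(normalize=True),
    'stratified': strat_test_set['income_cat'].value_counts(normalize=True),
    'random': random_test_set['income_cat'].value_counts(normalize=True),
}).sort_index()
compare['strat_error_%'] = 100 * compare['stratified'] / compare['overall'] - 100
compare['random_error_%'] = 100 * compare['random'] / compare['overall'] - 100
compare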
In [19]:
for set_ in (strat_train_set,strat_test_set):   # set_ avoids shadowing the built-in set
    set_.drop(['income_cat'],axis=1,inplace=True)

Discover and Visualize the Data to Gain Insights

Visualization

In [20]:
housing = strat_train_set.copy()
In [21]:
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b65a3748>
In [22]:
housing.plot(kind="scatter",x='longitude',y='latitude',alpha=0.4,
            s=housing['population']/100,label='population',
            c='median_house_value', cmap=plt.get_cmap("jet"),colorbar=True,)
plt.legend()
Out[22]:
<matplotlib.legend.Legend at 0x156b673b080>

Looking for Correlations

In [23]:
corr_matrix = housing.corr()
In [24]:
corr_matrix.median_house_value
Out[24]:
longitude            -0.047432
latitude             -0.142724
housing_median_age    0.114110
total_rooms           0.135097
total_bedrooms        0.047689
population           -0.026920
households            0.064506
median_income         0.687160
median_house_value    1.000000
Name: median_house_value, dtype: float64
In [25]:
from pandas.plotting import scatter_matrix
attributes = ['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))
Out[25]:
[4×4 grid of AxesSubplot objects: scatter matrix of the four selected attributes]
In [26]:
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x156b672f748>
  • You can clearly see horizontal lines in the plot above, an artifact of the capped values (most visibly at $500,000)
  • The correlation between median income and house price is very apparent

Experimenting with Attribute Combinations

In [27]:
housing['rooms_per_household'] = housing.total_rooms/housing.households
housing['bedrooms_per_room'] = housing.total_bedrooms/housing.total_rooms
housing['population_per_household'] = housing.population/housing.households
In [28]:
corr_matrix = housing.corr()
corr_matrix.median_house_value.sort_values(ascending=False)
Out[28]:
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

Prepare the Data for Machine Learning

In [29]:
housing = strat_train_set.drop('median_house_value',axis=1)
housing_labels = strat_train_set.median_house_value.copy()

Data Cleaning

  • Get rid of the corresponding districts
  • Get rid of the whole attribute
  • Set the missing values to some value (zero, the mean, the median, etc.)
In [30]:
print(len(housing))
print(housing.total_bedrooms.value_counts().sum())   # counts non-null entries only
print('This indicates that there are NA values in the dataset')
#housing.dropna(subset=['total_bedrooms'])
16512
16354
This indicates that there are NA values in the dataset
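An equivalent but more direct check, as a small sketch of our own:

# Hypothetical direct check for missing values
housing['total_bedrooms'].isnull().sum()   # 158 missing rows (16512 - 16354)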
In [31]:
print('Three ways to process missing data:')
housing.dropna(subset=['total_bedrooms'])    # option 1: drop the districts with missing values
housing.drop('total_bedrooms',axis=1)        # option 2: drop the whole attribute
median = housing['total_bedrooms'].median()  # option 3: fill with the median
housing.total_bedrooms.fillna(median);
Three ways to process missing data:
In [32]:
print('NA-value imputation with scikit-learn')
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('ocean_proximity',axis=1)   # the median only makes sense for numeric attributes
NA-value imputation with scikit-learn
In [33]:
imputer.fit(housing_num)

# the following two statements are equivalent
housing_num.median().values
imputer.statistics_
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X,columns=housing_num.columns)

Handling text and categorical attributes

In [34]:
from sklearn.preprocessing import LabelEncoder
In [35]:
encoder = LabelEncoder()
housing_cat = housing.ocean_proximity
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
encoder.classes_
Out[35]:
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)
In [36]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
If you want the future behaviour and silence this warning, you can specify "categories='auto'".
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  warnings.warn(msg, FutureWarning)
Out[36]:
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>
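As the warning itself suggests, newer scikit-learn versions (0.20+) can one-hot-encode the string column directly, without the LabelEncoder detour; a minimal sketch (cat_encoder is our name):

# Hypothetical: fit OneHotEncoder straight on the text attribute
cat_encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = cat_encoder.fit_transform(housing[['ocean_proximity']])
cat_encoder.categories_   # the five ocean_proximity categories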
In [37]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output=False)
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
Out[37]:
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

Custom Transformers

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6   # column indices of total_rooms, total_bedrooms, population, households
In [39]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                        bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
In [40]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
In [41]:
housing_extra_attribs = attr_adder.transform(housing.values)

Feature Scaling

There are two common ways to get all attributes onto the same scale (a short sketch of both follows)

MinMaxScaler

  • produces values in the range 0 to 1:
  • $value' = \frac{value - \min(attribute)}{\max(attribute) - \min(attribute)}$

Standardization

  • $value' = \frac{value - \mathrm{mean}(attribute)}{\mathrm{std}(attribute)}$
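A minimal sketch of both scalers applied to the imputed numeric frame housing_tr from above (the scaler variable names are ours):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# MinMaxScaler: rescales each attribute to the range [0, 1]
minmax_scaler = MinMaxScaler()
housing_num_minmax = minmax_scaler.fit_transform(housing_tr)

# StandardScaler: zero mean, unit variance; less sensitive to outliers than min-max scaling
std_scaler = StandardScaler()
housing_num_std = std_scaler.fit_transform(housing_tr)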

Transformation Pipelines

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])
In [43]:
housing_num_tr = num_pipeline.fit_transform(housing_num)
In [60]:
from sklearn.base import BaseEstimator,TransformerMixin
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.attribute_names].values
In [73]:
from sklearn.pipeline import FeatureUnion
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
num_pipeline = Pipeline(
    [
        ('selector',DataFrameSelector(num_attribs)),
        ('imputer',SimpleImputer(strategy='median')),
        ('attribs_adder',CombinedAttributesAdder()),
        ('std_scaler',StandardScaler()),
    ]
)
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('one_hot_encoder',OneHotEncoder(sparse=False)),   # OneHotEncoder works on the string column directly
])
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[73]:
(16512, 16)
In [74]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[74]:
(16512, 16)

Select and Train a Model

In [76]:
housing.head()
Out[76]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 <1H OCEAN
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 <1H OCEAN
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 NEAR OCEAN
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 INLAND
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 <1H OCEAN

Linear Regression

In [81]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)
Out[81]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [82]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
In [83]:
print(some_labels)
17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
Name: median_house_value, dtype: float64
In [88]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
Out[88]:
68628.19819848923
  • A typical prediction error of about $68,628 is poor given that most median house values lie between $120,000 and $265,000: the model underfits

Decision Tree Regression

In [95]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
Out[95]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [98]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
Out[98]:
0.0
  • An RMSE of 0.0 on the training set means the tree has badly overfit the data; cross-validation below gives a more honest estimate

Cross Validation

In [104]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[68930.44475821 66558.10126398 71222.37470024 68891.55140182
 70653.472854   75531.04198202 72407.55126445 71764.95822723
 77052.17560379 69702.50850308] 71271.41805588322 2988.326447170996
In [108]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-lin_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067] 69052.46136345083 2731.674001798349

Random Forest Regressor

In [114]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels,housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[114]:
22267.366749826804
In [115]:
fore_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-fore_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
[49946.50101164 50484.91805037 53208.39683726 53820.04129717
 51374.33312308 55476.13389189 51095.1559377  50385.37296508
 55664.65592003 53147.26475593] 52460.27737901548 1995.3245485192294

Saving the Model

In [116]:
from sklearn.externals import joblib
joblib.dump(forest_reg,'my_model.pkl')   # save the trained random forest to disk

my_model_loaded = joblib.load('my_model.pkl')

Fine-tuning the model!

In [ ]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
In [ ]:
grid_search.best_params_
In [ ]:
grid_search.best_estimator_
In [ ]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
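A small follow-up sketch of our own, assuming the grid search above has finished, to express the best cross-validated score as an RMSE:

# Hypothetical: RMSE of the best hyperparameter combination
np.sqrt(-grid_search.best_score_)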