import pandas as pd
import matplotlib.pyplot as plt
dataset = '../housing/housing.csv'
housing = pd.read_csv(dataset)
housing.head()
housing.info()
housing.ocean_proximity.value_counts()
housing.describe()
housing.hist(bins=50,figsize=(20,15))
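# hist() draws one histogram per numeric attribute; add plt.show() when running outside a notebook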
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set,test_set = split_train_test(housing,0.2)
print(len(train_set),'train +',len(test_set),'test')
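# drawback: a purely random split samples a different test set on every run.
# a minimal sketch of a stable, id-hash based split (assumes the row index works as an identifier):
from zlib import crc32

def test_set_check(identifier, test_ratio):
    # put a row in the test set iff the hash of its id falls below test_ratio of the hash range
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

# usage: housing_with_id = housing.reset_index()  # adds an 'index' column
#        train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
# in practice, sklearn's train_test_split with a fixed random_state is the simpler option: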
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2,random_state=42)
housing.median_income.hist()
housing['income_cat'] = np.ceil(housing.median_income/1.5)
housing['income_cat'] = housing['income_cat'].where(housing.income_cat < 5, 5.0)  # cap at category 5; plain assignment avoids inplace=True, which misbehaves under pandas copy-on-write
housing['income_cat'].value_counts()/len(housing)  # proportion of each income category
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
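# stratified sampling: keep the income-category proportions the same in the train and test sets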
for train_index, test_index in split.split(housing, housing.income_cat):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set.income_cat.value_counts()/len(strat_test_set)  # very close to the proportions in the full dataset
for set_ in (strat_train_set, strat_test_set):  # set_ avoids shadowing the built-in set
    set_.drop(['income_cat'], axis=1, inplace=True)
housing = strat_train_set.copy()
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)
housing.plot(kind="scatter",x='longitude',y='latitude',alpha=0.4,
s=housing['population']/100,label='population',
c='median_house_value', cmap=plt.get_cmap("jet"),colorbar=True,)
plt.legend()
corr_matrix = housing.corr(numeric_only=True)  # numeric_only is required in pandas >= 2.0 (ocean_proximity is text)
corr_matrix.median_house_value
from pandas.plotting import scatter_matrix
attributes = ['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)
housing['rooms_per_household'] = housing.total_rooms/housing.households
housing['bedrooms_per_room'] = housing.total_bedrooms/housing.total_rooms
housing['population_per_household'] = housing.population/housing.households
corr_matrix = housing.corr(numeric_only=True)
corr_matrix.median_house_value.sort_values(ascending=False)
housing = strat_train_set.drop('median_house_value',axis=1)
housing_labels = strat_train_set.median_house_value.copy()
print(len(housing))
print(housing.total_bedrooms.value_counts().sum())  # value_counts() drops NaN, so this is the non-null count
print('The count above is smaller than len(housing), so total_bedrooms has missing values')
print('Three ways to handle the missing data:')
housing.dropna(subset=['total_bedrooms'])    # option 1: drop the rows with missing values
housing.drop('total_bedrooms', axis=1)       # option 2: drop the whole attribute
median = housing['total_bedrooms'].median()  # option 3: fill the holes with the median
housing.total_bedrooms.fillna(median)
# note: all three return modified copies; assign the result (or pass inplace=True) to actually keep it
print('Handling missing values with scikit-learn:')
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)
# the next two lines are equivalent
housing_num.median().values
imputer.statistics_
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)  # transform() returns a plain NumPy array
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing.ocean_proximity
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
encoder.classes_
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output=False)
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
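# note: scikit-learn >= 0.20 lets OneHotEncoder consume string categories directly,
# so the LabelEncoder detour above is unnecessary (a minimal sketch):
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing[['ocean_proximity']])  # expects 2-D input, hence the double brackets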
from sklearn.base import BaseEstimator, TransformerMixin
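# column indices of total_rooms, total_bedrooms, population, households in housing.values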
rooms_ix, bedrooms_ix,population_ix,household_ix = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
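# transform() returns a bare NumPy array; a sketch to restore column names (add_bedrooms_per_room
# is False above, so only the two engineered columns are appended):
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ['rooms_per_household', 'population_per_household'],
    index=housing.index)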
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
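# note: ColumnTransformer (used further below) makes this hand-rolled selector unnecessary in modern scikit-learn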
from sklearn.pipeline import FeatureUnion
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),  # use sparse_output=False in scikit-learn >= 1.2
])
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
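# expect 16 columns: 8 numeric + 3 engineered + 5 one-hot categories of ocean_proximity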
# same pipeline as before but without the selector: ColumnTransformer handles column selection itself
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
housing.head()
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print('Labels:', list(some_labels))
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
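# a near-zero RMSE here signals overfitting (the tree memorized the training set), hence the cross-validation below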
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-lin_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels,housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
fore_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error',cv=10)
rmse_scores = np.sqrt(-fore_scores)
print(rmse_scores,rmse_scores.mean(),rmse_scores.std())
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package
joblib.dump(forest_reg, 'my_model.pkl')  # persist a trained model, e.g. the random forest above
my_model_loaded = joblib.load('my_model.pkl')
from sklearn.model_selection import GridSearchCV
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set to False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)
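# RMSE of the best parameter combination found by the search
np.sqrt(-grid_search.best_score_)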