%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
plt.style.use('ggplot')
df = pd.read_csv('./data/data.csv')
# let's quickly check the meta-data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): longitude 20640 non-null float64 latitude 20640 non-null float64 housing_median_age 20640 non-null float64 total_rooms 20640 non-null float64 total_bedrooms 20433 non-null float64 population 20640 non-null float64 households 20640 non-null float64 median_income 20640 non-null float64 median_house_value 20640 non-null float64 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
df.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
As we can see from the above, the dataset contains 9 numerical variables and one categorical variable. Each row in the dataset represent a district in CA.
For numerical variables, Tukey's five-number summary statistics (minimum, first quartile, median, third quartile, maximum) can provide quick information about the dataset.
df.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
For categorical variables, we can simply check the count of each category.
df['ocean_proximity'].value_counts()
<1H OCEAN 9136 INLAND 6551 NEAR OCEAN 2658 NEAR BAY 2290 ISLAND 5 Name: ocean_proximity, dtype: int64
df.hist(bins=50, figsize=(20, 15))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA2A13EB8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA74A1DD8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA74D42B0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA74FA7F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA7521D68>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA7552320>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA7577898>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA759FE48>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001DEA759FE80>]], dtype=object)
df.plot(kind="scatter",
x="longitude",
y="latitude",
alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0xfe59668>
df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=df["population"]/100,
label="population", figsize=(10,7),
c="median_house_value",
cmap=plt.get_cmap("jet"),
colorbar=True,
sharex=False)
plt.legend()
plt.tight_layout(pad=0.1)
plt.savefig('00_scatter_prices.png')
from matplotlib.image import imread
ca_img = imread('http://www.cs.uga.edu/~jwlee/images/00_california.png')
df.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
s=df['population']/100, label="Population",
c="median_house_value", cmap=plt.get_cmap("jet"),
colorbar=True, alpha=0.4)
plt.imshow(ca_img,
extent=[-124.55, -113.80, 32.45, 42.05],
alpha=0.5)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
prices = df["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
#cbar = plt.colorbar()
cbar_ax = plt.gcf().get_axes()[1]
cbar_ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values],
fontsize=14)
#cbar_ax.set_label('Median House Value', fontsize=16)
plt.legend(fontsize=16)
plt.tight_layout(pad=0.1)
plt.savefig("00_ca_housing_prices.png")
plt.show()
from pandas.plotting import scatter_matrix
attrs = ['median_house_value', 'median_income', 'housing_median_age',
'total_rooms']
scatter_matrix(df[attrs],
alpha=0.1, figsize=(10, 10), diagonal='kde')
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001F25979BC50>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25AB05CC0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A000320>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A02C8D0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A05BE80>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A093470>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A0C2A20>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A0F4FD0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A103080>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25A164B70>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25AB73160>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25ABA2710>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000001F25ABD6CC0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25B2B12B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25B2E3860>, <matplotlib.axes._subplots.AxesSubplot object at 0x000001F25B316E10>]], dtype=object)
corr_mat = df[attrs].corr()
corr_mat
median_house_value | median_income | housing_median_age | total_rooms | |
---|---|---|---|---|
median_house_value | 1.000000 | 0.688075 | 0.105623 | 0.134153 |
median_income | 0.688075 | 1.000000 | -0.119034 | 0.198050 |
housing_median_age | 0.105623 | -0.119034 | 1.000000 | -0.361262 |
total_rooms | 0.134153 | 0.198050 | -0.361262 | 1.000000 |
corr_mat['median_house_value'].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.688075 total_rooms 0.134153 housing_median_age 0.105623 Name: median_house_value, dtype: float64
corr_mat = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr_mat, cmap=cmap, linewidth=.5, square=True, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x14de5f60>
rows_with_missing_values = df[df.isnull().any(axis=1)].head()
rows_with_missing_values
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
290 | -122.16 | 37.77 | 47.0 | 1256.0 | NaN | 570.0 | 218.0 | 4.3750 | 161900.0 | NEAR BAY |
341 | -122.17 | 37.75 | 38.0 | 992.0 | NaN | 732.0 | 259.0 | 1.6196 | 85100.0 | NEAR BAY |
538 | -122.28 | 37.78 | 29.0 | 5154.0 | NaN | 3741.0 | 1273.0 | 2.5762 | 173400.0 | NEAR BAY |
563 | -122.24 | 37.75 | 45.0 | 891.0 | NaN | 384.0 | 146.0 | 4.9489 | 247100.0 | NEAR BAY |
696 | -122.10 | 37.69 | 41.0 | 746.0 | NaN | 387.0 | 161.0 | 3.9063 | 178400.0 | NEAR BAY |
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
housing_num = df.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)
X= imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
index=list(df.index.values))
housing_tr.loc[rows_with_missing_values.index.values]
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
290 | -122.16 | 37.77 | 47.0 | 1256.0 | 537.870553 | 570.0 | 218.0 | 4.3750 | 161900.0 |
341 | -122.17 | 37.75 | 38.0 | 992.0 | 537.870553 | 732.0 | 259.0 | 1.6196 | 85100.0 |
538 | -122.28 | 37.78 | 29.0 | 5154.0 | 537.870553 | 3741.0 | 1273.0 | 2.5762 | 173400.0 |
563 | -122.24 | 37.75 | 45.0 | 891.0 | 537.870553 | 384.0 | 146.0 | 4.9489 | 247100.0 |
696 | -122.10 | 37.69 | 41.0 | 746.0 | 537.870553 | 387.0 | 161.0 | 3.9063 | 178400.0 |
from sklearn.preprocessing import OneHotEncoder
housing_cat = df[['ocean_proximity']]
cat_encoder = OneHotEncoder(sparse=False) # Instantiating the OneHot Encoder class.
housing_cat_onehot = cat_encoder.fit_transform(housing_cat) # Encoding the Text.
print(housing_cat_onehot[:10]) # Converting into numpy arrays.
print("\n")
print(cat_encoder.categories_)
[[0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.] [0. 0. 0. 1. 0.]] [array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#@ Pipeline for Numerical attributes:
num_pipeline = Pipeline([
("imputer", SimpleImputer(strategy="mean")),
("std_scaler", StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr.shape
(20640, 9)
from sklearn.compose import ColumnTransformer
num_attrs = list(housing_num) # List of numerical attributes.
cat_attrs = ["ocean_proximity"] # List of categorical attributes.
pipeline = ColumnTransformer([
("num", num_pipeline, num_attrs),
("cat", OneHotEncoder(), cat_attrs)
])
housing_data = pipeline.fit_transform(df)
housing_label = df["median_house_value"].copy()
housing_data[:5]
array([[-1.32783522, 1.05254828, 0.98214266, -0.8048191 , -0.97522785, -0.9744286 , -0.97703285, 2.34476576, 2.12963148, 0. , 0. , 0. , 1. , 0. ], [-1.32284391, 1.04318455, -0.60701891, 2.0458901 , 1.3550882 , 0.86143887, 1.66996103, 2.33223796, 1.31415614, 0. , 0. , 0. , 1. , 0. ], [-1.33282653, 1.03850269, 1.85618152, -0.53574589, -0.82973217, -0.82077735, -0.84363692, 1.7826994 , 1.25869341, 0. , 0. , 0. , 1. , 0. ], [-1.33781784, 1.03850269, 1.85618152, -0.62421459, -0.72239929, -0.76602806, -0.73378144, 0.93296751, 1.16510007, 0. , 0. , 0. , 1. , 0. ], [-1.33781784, 1.03850269, 1.85618152, -0.46240395, -0.61506641, -0.75984669, -0.62915718, -0.012881 , 1.17289952, 0. , 0. , 0. , 1. , 0. ]])
#df_tr = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)
#df_tr.head()
X_train, X_test, Y_train, Y_test = train_test_split(housing_data, housing_label, test_size=0.3, random_state=10)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
(14448, 14) (6192, 14) (14448,) (6192,)
# Create an object lm for the LinearRegression model
lm = LinearRegression()
# Fit the model on the training data
lm.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# regression coefficients
lm.coef_
array([ 8.12986216e-11, 1.68671119e-10, 2.15467161e-11, 3.88705838e-11, -8.99805034e-12, -2.16932722e-11, 3.18490569e-11, -3.68252413e-11, 1.15392820e+05, -3.19482215e-11, 4.19977728e-11, 5.19945705e-14, 8.58398092e-12, -2.92190990e-11])
# predict
Y_pred = lm.predict(X_test)
mse = (Y_pred - Y_test).mean()
print(f"MSE={mse}")
MSE=415.0572620847688
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
tree_reg = DecisionTreeRegressor() # Instantiating the Model.
tree_reg.fit(X_train, Y_train)
housing_predictions = tree_reg.predict(X_test)
tree_mse = mean_squared_error(Y_test, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)
95.35395776254865
data = {'alg': ['LinReg', 'DecTree', 'RF'],
'RMSE': [68628.198198489219, 71436.9531067, 52789.3237327]}
df
alg | RMSE | |
---|---|---|
0 | LinReg | 68628.198198 |
1 | DecTree | 71436.953107 |
2 | RF | 52789.323733 |
df.plot(kind='bar', x="alg", y="RMSE")
<matplotlib.axes._subplots.AxesSubplot at 0x1f2597e8550>