Data Science Portfolio

DRILL: fixing assumptions

import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress a harmless SciPy warning.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

Assumption one: linear relationship

# Acquire, load, and preview the data.
data = pd.read_csv('https://tf-curricula-prod.s3.amazonaws.com/data-science/Advertising.csv')
display(data.head())

# Instantiate the model and set up the target and the raw features.
regr = linear_model.LinearRegression()
Y = data['Sales'].values.reshape(-1, 1)
X1 = data[['TV', 'Radio', 'Newspaper']]

# Create new squared features
data['TV^2'] = data['TV']**2
data['Radio^2'] = data['Radio']**2
data['Newspaper^2'] = data['Newspaper']**2
data['Sales^2'] = data['Sales']**2

X2 = data[['TV^2', 'Radio^2', 'Newspaper^2']]

regr.fit(X2, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X2, Y))
   Unnamed: 0      TV  Radio  Newspaper  Sales
0           1 230.100 37.800     69.200 22.100
1           2  44.500 39.300     45.100 10.400
2           3  17.200 45.900     69.300  9.300
3           4 151.500 41.300     58.500 18.500
4           5 180.800 10.800     58.400 12.900
Coefficients: 
 [[  1.42707531e-04   3.68654872e-03  -8.28260101e-05]]

Intercept: 
 [ 7.2029644]

R-squared:
0.799973684425
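
For reference, here is a minimal sketch that fits the same linear model on the raw features already stored in X1 above, so the R-squared of the squared-feature model can be compared against a plain linear baseline.

# Baseline: fit the same model on the raw, untransformed features (X1).
regr_raw = linear_model.LinearRegression()
regr_raw.fit(X1, Y)
print('R-squared (raw features):', regr_raw.score(X1, Y))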

Assumption two: multivariate normality

Y = data['Sales'].values.reshape(-1, 1)
X2 = data[['TV^2', 'Radio^2', 'Newspaper^2']]

# Extract predicted values from the model, which was fit on the squared features.
predicted = regr.predict(X2).ravel()
actual = data['Sales']

# Calculate the error, also called the residual.
residual = actual - predicted

# This looks a bit concerning.
plt.hist(residual)
plt.title('Residual counts')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.show()

[Figure: 'Residual counts' histogram of the residuals]
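
A histogram alone can be hard to judge. As a follow-up sketch (assuming SciPy is available, which the warning filter at the top already implies), a normal Q-Q plot and D'Agostino's normality test give a more direct read on the residuals.

from scipy import stats

# Q-Q plot: points close to the diagonal suggest roughly normal residuals.
stats.probplot(residual, plot=plt)
plt.show()

# D'Agostino and Pearson's normality test; a small p-value indicates
# the residuals deviate from normality.
stat, p = stats.normaltest(residual)
print('Normality test statistic: {:.3f}, p-value: {:.3f}'.format(stat, p))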

Assumption three: homoscedasticity

plt.scatter(predicted, residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.axhline(y=0)
plt.title('Residual vs. Predicted')
plt.show()

[Figure: 'Residual vs. Predicted' scatter plot]

Based on the residual-vs-predicted scatter plot, squaring the features appears to have improved the homoscedasticity, i.e. the spread of the residuals does not vary much with the predicted value.
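
Reading a residual plot is subjective, so a more formal check can help. The sketch below assumes statsmodels' Breusch-Pagan test (het_breuschpagan) applied to the residuals and the squared features the model was fit on; a small p-value would indicate heteroscedasticity.

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

# Breusch-Pagan: regress the squared residuals on the explanatory variables
# (plus a constant); a small p-value signals heteroscedasticity.
bp_exog = sm.add_constant(X2)
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(residual, bp_exog)
print('Breusch-Pagan LM p-value: {:.3f}'.format(lm_pvalue))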

Assumption four: low multicollinearity

corrmat = data.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
display(data.corr())

[Figure: correlation heatmap of all features]

             Unnamed: 0     TV  Radio  Newspaper  Sales   TV^2  Radio^2  Newspaper^2  Sales^2
Unnamed: 0        1.000  0.018 -0.111     -0.155 -0.052  0.018   -0.091       -0.137   -0.043
TV                0.018  1.000  0.055      0.057  0.782  0.968    0.051        0.056    0.729
Radio            -0.111  0.055  1.000      0.354  0.576  0.079    0.967        0.352    0.610
Newspaper        -0.155  0.057  0.354      1.000  0.228  0.076    0.361        0.940    0.238
Sales            -0.052  0.782  0.576      0.228  1.000  0.736    0.562        0.216    0.980
TV^2              0.018  0.968  0.079      0.076  0.736  1.000    0.075        0.085    0.715
Radio^2          -0.091  0.051  0.967      0.361  0.562  0.075    1.000        0.357    0.611
Newspaper^2      -0.137  0.056  0.352      0.940  0.216  0.085    0.357        1.000    0.228
Sales^2          -0.043  0.729  0.610      0.238  0.980  0.715    0.611        0.228    1.000

Several features are highly correlated, in particular each squared feature with its original (TV/TV^2 at 0.968, Radio/Radio^2 at 0.967, and Newspaper/Newspaper^2 at 0.940), so keeping both the raw and squared versions violates the low-multicollinearity assumption.
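
Pairwise correlations only capture two-way relationships. A common follow-up, sketched here on the assumption that statsmodels' variance_inflation_factor is the tool of choice, is to compute variance inflation factors for the predictors; values well above 5-10 are usually read as a multicollinearity problem.

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each predictor (a constant is added so the VIFs are not distorted).
vif_exog = sm.add_constant(
    data[['TV', 'Radio', 'Newspaper', 'TV^2', 'Radio^2', 'Newspaper^2']])
vif = pd.Series(
    [variance_inflation_factor(vif_exog.values, i)
     for i in range(vif_exog.shape[1])],
    index=vif_exog.columns)
print(vif)

Given this redundancy between the raw and squared columns, the next step runs PCA by hand on the standardized features to see how many underlying dimensions are really present.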

features = ['TV', 'Radio', 'Newspaper', 'TV^2', 'Radio^2', 'Newspaper^2', 'Sales^2']

df_pca = data.loc[:, features].values
y = data.loc[:, ['Sales']].values

X = StandardScaler().fit_transform(df_pca)

Xt = X.T
Cx = np.cov(Xt)
print('Covariance Matrix:\n', Cx)
Covariance Matrix:
 [[ 1.00502513  0.05508408  0.05693254  0.97252509  0.05109954  0.05590531
   0.73256521]
 [ 0.05508408  1.00502513  0.35588317  0.07903933  0.97160989  0.3541069
   0.6134916 ]
 [ 0.05693254  0.35588317  1.00502513  0.07682624  0.36322152  0.94439868
   0.23939276]
 [ 0.97252509  0.07903933  0.07682624  1.00502513  0.07509284  0.08590238
   0.7185557 ]
 [ 0.05109954  0.97160989  0.36322152  0.07509284  1.00502513  0.3583038
   0.61367703]
 [ 0.05590531  0.3541069   0.94439868  0.08590238  0.3583038   1.00502513
   0.22896125]
 [ 0.73256521  0.6134916   0.23939276  0.7185557   0.61367703  0.22896125
   1.00502513]]
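
The diagonal entries are about 1.005 rather than exactly 1, which can look like an error; it is only a normalization difference, since StandardScaler divides by n while np.cov divides by n - 1. A quick check using the arrays defined above:

# Covariance of the standardized data equals the correlation matrix
# scaled by n / (n - 1), because StandardScaler uses n in the denominator
# and np.cov uses n - 1.
n = X.shape[0]
print(np.allclose(Cx, np.corrcoef(Xt) * n / (n - 1)))
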
# Calculating eigenvalues and eigenvectors.
eig_val_cov, eig_vec_cov = np.linalg.eig(Cx)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_val_cov)):
    eigvec_cov = eig_vec_cov[:, i].reshape(1, 7).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvec_cov))
    print('Eigenvalue {}: {}'.format(i + 1, eig_val_cov[i]))
    print(40 * '-')

print(
    'The percentage of total variance in the dataset explained by each',
    'component calculated by hand.\n',
    eig_val_cov / sum(eig_val_cov)
)
Eigenvector 1: 
[[-0.32192638]
 [-0.41441704]
 [-0.32480276]
 [-0.33074094]
 [-0.41479104]
 [-0.32324108]
 [-0.48290695]]
Eigenvalue 1: 3.3533907440501256
----------------------------------------
Eigenvector 2: 
[[-0.53183042]
 [ 0.26803955]
 [ 0.35554971]
 [-0.51574108]
 [ 0.27193793]
 [ 0.35385085]
 [-0.23183203]]
Eigenvalue 2: 2.138315141603013
----------------------------------------
Eigenvector 3: 
[[ 0.16157412]
 [-0.4372079 ]
 [ 0.51608197]
 [ 0.16734606]
 [-0.43358999]
 [ 0.52089634]
 [-0.17048307]]
Eigenvalue 3: 1.3152329266320908
----------------------------------------
Eigenvector 4: 
[[ 0.13700407]
 [ 0.2664073 ]
 [-0.13288173]
 [ 0.44772813]
 [ 0.22523894]
 [ 0.09618972]
 [-0.7950811 ]]
Eigenvalue 4: 0.10705708130779526
----------------------------------------
Eigenvector 5: 
[[ 0.10729353]
 [ 0.01308928]
 [ 0.69087275]
 [ 0.00494538]
 [ 0.07675588]
 [-0.69424467]
 [-0.15205271]]
Eigenvalue 5: 0.05981778381043827
----------------------------------------
Eigenvector 6: 
[[-0.74294797]
 [ 0.01158729]
 [ 0.07664522]
 [ 0.62498504]
 [-0.12455246]
 [-0.08476171]
 [ 0.16945618]]
Eigenvalue 6: 0.02803879045367865
----------------------------------------
Eigenvector 7: 
[[-0.07183251]
 [-0.70284004]
 [-0.03016256]
 [ 0.07433458]
 [ 0.7027521 ]
 [ 0.02364543]
 [ 0.00096626]]
Eigenvalue 7: 0.033323411539842196
----------------------------------------
The percentage of total variance in the dataset explained by each component calculated by hand.
 [ 0.47666054  0.30394622  0.18695097  0.0152174   0.00850267  0.00398551
  0.00473668]
# Scree plot of the eigenvalues for each component.
plt.plot(eig_val_cov)
plt.show()

[Figure: scree plot of the eigenvalues]
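
As a sanity check on the hand calculation, scikit-learn's PCA (imported at the top but not used so far) can be fit on the same standardized array; its explained_variance_ratio_ should reproduce the same proportions, sorted in descending order. A minimal sketch:

# Cross-check with scikit-learn on the standardized array X from above.
sklearn_pca = PCA(n_components=7)
sklearn_pca.fit(X)
print('Explained variance ratios:', sklearn_pca.explained_variance_ratio_)
print('Cumulative:', np.cumsum(sklearn_pca.explained_variance_ratio_))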

It turns out that PCA suggests dropping the extra squared features: only the first three principal components have eigenvalues greater than 1, and together they capture roughly 97% of the total variance. That is consistent with the original three dimensions (TV, Radio, Newspaper) carrying essentially all of the information, with the squared copies adding little beyond redundancy.