Data Science Portfolio

DRILL: Prepare the Data

For this drill, we will use a 2013 dataset of crimes in New York State, broken down by city.

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline
# Load the 2013 New York by-city offenses table into a DataFrame and preview the first rows
csv_path = 'table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv'
ny_crime = pd.read_csv(csv_path)
ny_crime.head()
City Population Violent crime Murder and nonnegligent manslaughter Rape (revised definition)1 Rape (legacy definition)2 Robbery Aggravated assault Property crime Burglary Larceny- theft Motor vehicle theft Arson3
0 Adams Village 1,861 0 0 NaN 0 0 0 12 2 10 0 0.0
1 Addison Town and Village 2,577 3 0 NaN 0 0 3 24 3 20 1 0.0
2 Akron Village 2,846 3 0 NaN 0 0 3 16 1 15 0 0.0
3 Albany 97,956 791 8 NaN 30 227 526 4,090 705 3,243 142 NaN
4 Albion Village 6,388 23 0 NaN 3 4 16 223 53 165 5 NaN
# Select dataframe columns (the raw CSV headers contain embedded carriage returns)
raw_columns = ['City', 'Population', 'Murder and\rnonnegligent\rmanslaughter', 'Robbery', 'Property\rcrime']

# Drop rows with null values *before* cleaning: the string clean-up and int
# conversion below cannot handle NaN. Copy so that the column assignments
# below modify an independent frame rather than a slice of the raw data
# (avoids SettingWithCopyWarning).
ny_crime = ny_crime[raw_columns].dropna().copy()

# Rename columns
ny_crime.columns = ['City', 'Population', 'Murder', 'Robbery', 'Property Crime']

# These counts are strings with thousands separators (e.g. '97,956'):
# strip the commas, then change type to int
for col in ['Population', 'Robbery', 'Property Crime']:
    ny_crime[col] = ny_crime[col].str.replace(',', '', regex=False).astype(int)

ny_crime.head()
City Population Murder Robbery Property Crime
0 Adams Village 1861 0 0 12
1 Addison Town and Village 2577 0 0 24
2 Akron Village 2846 0 0 16
3 Albany 97956 8 227 4090
4 Albion Village 6388 0 4 223
# Histogram of the Population column (100 bins)
pop_ax = ny_crime['Population'].hist(bins=100)
pop_ax.set_title('Population')
plt.show()

png

# Histogram of the Murder column (50 bins)
murder_ax = ny_crime['Murder'].hist(bins=50)
murder_ax.set_title('Murder')
plt.show()

png

# Histogram of the Robbery column (50 bins)
robbery_ax = ny_crime['Robbery'].hist(bins=50)
robbery_ax.set_title('Robbery')
plt.show()

png

# Plot property crime distribution (50 bins)
ny_crime['Property Crime'].hist(bins=50)
# Title must name the column actually plotted (was mislabelled 'Murder')
plt.title('Property Crime')
plt.show()

png

# Mask outliers: any value at or beyond two standard deviations above its
# column mean is replaced with NaN. Rows are kept, so each column can end up
# with a different non-null count.
# All cutoffs are computed up front, from the unmasked data.
pop_cutoff = ny_crime['Population'].mean() + 2*ny_crime['Population'].std()
mur_cutoff = ny_crime['Murder'].mean() + 2*ny_crime['Murder'].std()
rob_cutoff = ny_crime['Robbery'].mean() + 2*ny_crime['Robbery'].std()
prop_cutoff = ny_crime['Property Crime'].mean() + 2*ny_crime['Property Crime'].std()

outlier_cutoffs = {
    'Population': pop_cutoff,
    'Murder': mur_cutoff,
    'Robbery': rob_cutoff,
    'Property Crime': prop_cutoff,
}

# Series.where keeps values where the condition holds and inserts NaN
# elsewhere; vectorised equivalent of mapping `x if x < cutoff else None`.
for col, cutoff in outlier_cutoffs.items():
    ny_crime[col] = ny_crime[col].where(ny_crime[col] < cutoff)

ny_crime.describe()
Population Murder Robbery Property Crime
count 347.000000 345.000000 347.000000 347.000000
mean 15956.685879 0.350725 17.867435 385.752161
std 27080.218837 1.587160 94.972492 1034.369072
min 526.000000 0.000000 0.000000 0.000000
25% 2997.000000 0.000000 0.000000 40.000000
50% 7187.000000 0.000000 1.000000 112.000000
75% 18160.500000 0.000000 5.000000 340.500000
max 258789.000000 21.000000 1322.000000 12491.000000
# Create new feature: squared population (NaN wherever Population is NaN)
ny_crime['Population^2'] = ny_crime['Population']**2

# Convert specified columns to "at least one occurrence" indicators.
# A missing count (NaN) compares as False against 0, which would silently
# label it as "no occurrences"; keep missing values missing instead.
# Note: a column that contains NaN ends up with object dtype (True/False/NaN).
for col in ['Murder', 'Robbery']:
    ny_crime[col] = (ny_crime[col] > 0).where(ny_crime[col].notna())

ny_crime.head()
City Population Murder Robbery Property Crime Population^2
0 Adams Village 1861.0 False False 12.0 3.463321e+06
1 Addison Town and Village 2577.0 False False 24.0 6.640929e+06
2 Akron Village 2846.0 False False 16.0 8.099716e+06
3 Albany 97956.0 True True 4090.0 9.595378e+09
4 Albion Village 6388.0 False True 223.0 4.080654e+07