Data Science Portfolio

DRILL: Prepare the Data

For this drill we will use a 2013 dataset of crimes in New York state, by city.

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline

ny_crime = pd.read_csv('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv')
ny_crime.head()

	City	Population	Violent crime	Murder and nonnegligent manslaughter	Rape (revised definition)1	Rape (legacy definition)2	Robbery	Aggravated assault	Property crime	Burglary	Larceny- theft	Motor vehicle theft	Arson3
0	Adams Village	1,861	0	0	NaN	0	0	0	12	2	10	0	0.0
1	Addison Town and Village	2,577	3	0	NaN	0	0	3	24	3	20	1	0.0
2	Akron Village	2,846	3	0	NaN	0	0	3	16	1	15	0	0.0
3	Albany	97,956	791	8	NaN	30	227	526	4,090	705	3,243	142	NaN
4	Albion Village	6,388	23	0	NaN	3	4	16	223	53	165	5	NaN

# Select datafame columns
ny_crime = ny_crime[['City', 'Population', 'Murder and\rnonnegligent\rmanslaughter', 'Robbery', 'Property\rcrime']]

# Rename columns
ny_crime.columns = ['City', 'Population', 'Murder', 'Robbery', 'Property Crime']

# Remove commas from numeric strings
ny_crime['Population'] = ny_crime.Population.apply(lambda x: x.replace(',', ''))

# Change type to int
ny_crime['Population'] = ny_crime.Population.astype(int)

# Remove commas from numeric strings
ny_crime['Robbery'] = ny_crime.Robbery.apply(lambda x: x.replace(',', ''))

# Change type to int
ny_crime['Robbery'] = ny_crime.Robbery.astype(int)

# Remove commas from numeric strings
ny_crime['Property Crime'] = ny_crime['Property Crime'].apply(lambda x: x.replace(',', ''))

# Change type to int
ny_crime['Property Crime'] = ny_crime['Property Crime'].astype(int)

# Drop null values
ny_crime = ny_crime.dropna()

ny_crime.head()

	City	Population	Murder	Robbery	Property Crime
0	Adams Village	1861	0	0	12
1	Addison Town and Village	2577	0	0	24
2	Akron Village	2846	0	0	16
3	Albany	97956	8	227	4090
4	Albion Village	6388	0	4	223

# Plot population distribution
ny_crime['Population'].hist(bins=100)
plt.title('Population')
plt.show()

png

# Plot murder distribution
ny_crime['Murder'].hist(bins=50)
plt.title('Murder')
plt.show()

png

# Plot robbery distribution
ny_crime['Robbery'].hist(bins=50)
plt.title('Robbery')
plt.show()

png

# Plot property crime distribution
ny_crime['Property Crime'].hist(bins=50)
plt.title('Murder')
plt.show()

png

# Filter out any outliers over two standard deviations above the mean
pop_cutoff = ny_crime['Population'].mean() + 2*ny_crime['Population'].std()
mur_cutoff = ny_crime['Murder'].mean() + 2*ny_crime['Murder'].std()
rob_cutoff = ny_crime['Robbery'].mean() + 2*ny_crime['Robbery'].std()
prop_cutoff = ny_crime['Property Crime'].mean() + 2*ny_crime['Property Crime'].std()

ny_crime['Population'] = ny_crime.Population.map(lambda x: x if x < pop_cutoff else None)
ny_crime['Murder'] = ny_crime.Murder.map(lambda x: x if x < mur_cutoff else None)
ny_crime['Robbery'] = ny_crime.Robbery.map(lambda x: x if x < rob_cutoff else None)
ny_crime['Property Crime'] = ny_crime['Property Crime'].map(lambda x: x if x < prop_cutoff else None)

ny_crime.describe()

	Population	Murder	Robbery	Property Crime
count	347.000000	345.000000	347.000000	347.000000
mean	15956.685879	0.350725	17.867435	385.752161
std	27080.218837	1.587160	94.972492	1034.369072
min	526.000000	0.000000	0.000000	0.000000
25%	2997.000000	0.000000	0.000000	40.000000
50%	7187.000000	0.000000	1.000000	112.000000
75%	18160.500000	0.000000	5.000000	340.500000
max	258789.000000	21.000000	1322.000000	12491.000000

# Create new feature
ny_crime['Population^2'] = ny_crime['Population']**2

# Convert specified columns to boolean
for col in ['Murder', 'Robbery']:
    ny_crime[col] = ny_crime[col] > 0

ny_crime.head()

	City	Population	Murder	Robbery	Property Crime	Population^2
0	Adams Village	1861.0	False	False	12.0	3.463321e+06
1	Addison Town and Village	2577.0	False	False	24.0	6.640929e+06
2	Akron Village	2846.0	False	False	16.0	8.099716e+06
3	Albany	97956.0	True	True	4090.0	9.595378e+09
4	Albion Village	6388.0	False	True	223.0	4.080654e+07