In [1]:
# For data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
from __future__ import division
import datetime
Importing Data¶
In [2]:
df = pd.read_csv("RI_Pres_Election_2016.csv", parse_dates=['contb_receipt_dt'])
In [3]:
df.head()
Out[3]:
In [4]:
df.info()
25,888 Donations from Rhode Island¶
In [5]:
sum_df = df.groupby(['contb_receipt_dt'],as_index=False).sum()
In [6]:
sum_df.head()
Out[6]:
Line graph of how much money was donated per day¶
In [7]:
sum_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'))
Out[7]:
Line graph of how many people donated per day¶
In [8]:
count_df = df.groupby(['contb_receipt_dt'],as_index=False).count()
In [9]:
count_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'))
Out[9]:
Line graph of average donated per day¶
In [10]:
mean_df = df.groupby(['contb_receipt_dt'],as_index=False).mean()
In [11]:
mean_df.plot('contb_receipt_dt','contb_receipt_amt',figsize=(12,6),marker='',linestyle='-',color='purple',
xlim=('2016-01-01','2016-11-08'),ylim=(0,1000))
Out[11]:
In [12]:
df.groupby('Party')['contb_receipt_amt'].sum().sort_values().plot(kind='bar')
Out[12]:
In [13]:
df['contb_receipt_amt'].describe()
Out[13]:
In [14]:
df.groupby('cand_nm')['contb_receipt_amt'].count().sort_values(ascending=False)
Out[14]:
In [15]:
df['contb_receipt_amt'].value_counts()
Out[15]:
As with the full dataset $25 is the most frequent donation
Lets look at the canidates
In [16]:
cand_df = df.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt",ascending=False)
cand_df = cand_df.drop(['contbr_zip','file_num'], axis=1)
In [17]:
cand_df
Out[17]:
Obviously Hillary raised the most
However what really surprises me is how much Jill Stein raised, but maybe it was just a few people donating a lot
Lets find out!
In [18]:
count_df = df.groupby("cand_nm").count().sort_values(by="contb_receipt_amt",ascending=False)
count_df = count_df.drop(['contbr_zip','file_num'], axis=1)
count_df['cmte_id']
Out[18]:
Jill Stein got more donations then Rand Paul and Gary Johnson combined which I found surprising¶
Total Raised by Canidate¶
In [19]:
total_donations = df.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt",ascending=True)
In [20]:
total_donations["contb_receipt_amt"].plot(kind="bar",figsize=(12,5))
Out[20]:
Average Ammount Donated to Canidate¶
In [21]:
avg_donations = df.groupby("cand_nm").mean().sort_values(by="contb_receipt_amt")
avg_donations["contb_receipt_amt"].plot(kind="bar",figsize=(12,5))
Out[21]:
I expected the average donation to Sanders to be low, but not dead last, lets look at lower donations between him and Hillary¶
In [22]:
# Getting a dataframe of just bernie and Hillary
dem_df = df[df['cand_nm'].isin(['Sanders, Bernard', 'Clinton, Hillary Rodham'])]
In [23]:
# only getting values under 100
don_df_15 = dem_df[dem_df.contb_receipt_amt < 15]
don_df_25 = dem_df[dem_df.contb_receipt_amt < 25]
don_df_35 = dem_df[dem_df.contb_receipt_amt < 35]
don_df_50 = dem_df[dem_df.contb_receipt_amt < 50]
don_df_75 = dem_df[dem_df.contb_receipt_amt < 75]
don_df_100 = dem_df[dem_df.contb_receipt_amt < 100]
In [24]:
# Grouping by canidate and totaling the ammount
don_df_15 = don_df_15.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_25 = don_df_25.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_35 = don_df_35.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_50 = don_df_50.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_75 = don_df_75.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
don_df_100 = don_df_100.groupby("cand_nm").sum().sort_values(by="contb_receipt_amt")
In [25]:
# Concatinating the datasets together
frames = [don_df_15, don_df_25, don_df_35, don_df_50, don_df_75, don_df_100]
dem_concat = pd.concat(frames, keys=['15', '25', '35', '50', '75', '100'])
#resetting the index and dropping the columns we don't need
dem_concat = dem_concat.reset_index()
dem_concat = dem_concat.drop('contbr_zip', axis=1)
dem_concat = dem_concat.drop('file_num', axis=1)
In [26]:
dem_concat
Out[26]:
In [27]:
# Pivoting by the amt ranges
dem_concat = dem_concat.pivot_table('contb_receipt_amt',index='level_0',columns = 'cand_nm',aggfunc='sum')
In [28]:
new_index= ['15', '25', '35', '50', '75', '100']
dem_concat = dem_concat.reindex(new_index)
dem_concat.head()
Out[28]:
Bernie vs. Clinton Small Donations¶
In [29]:
dem_concat[['Clinton, Hillary Rodham','Sanders, Bernard']].plot(kind='bar',figsize=(12,4))
plt.xlabel('Ammount')
locs, labels = plt.xticks()
plt.setp(labels, rotation=360)
plt.title('Bernie vs. Clinton')
Out[29]:
Bernie out earns Hillary till the $50 Mark and larger values where she pushes ahead, though this data continued
on after Bernie dropped out. For a more accurate picture we would only want donations before Bernie dropped out
RI - Cities¶
In [30]:
df['contbr_city'].value_counts()
Out[30]:
In [31]:
city_df = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='sum')
city_df["Total"] = city_df.sum(axis=1)
city_df = city_df.sort_values(by="Total",ascending=True)
city_df.head()
Out[31]:
Total Ammount Donated by City¶
In [32]:
city_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[32]:
Average Donation by City¶
In [33]:
city_mean = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='mean')
city_mean["Total"] = city_mean.sum(axis=1)
city_mean = city_mean.sort_values(by="Total",ascending=True)
city_mean[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[33]:
Count of Donations by City¶
In [34]:
city_count = df.pivot_table('contb_receipt_amt',index='contbr_city',columns = 'Party',aggfunc='count')
city_count["Total"] = city_count.sum(axis=1)
city_count = city_count.sort_values(by="Total",ascending=True)
city_count[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[34]:
Jobs¶
In [35]:
occupation_df = df.pivot_table('contb_receipt_amt',index='contbr_occupation',columns = 'Party',aggfunc='sum')
occupation_df["Total"] = occupation_df.sum(axis=1)
occupation_df = occupation_df.sort_values(by="Total",ascending=True)
occupation_df.shape
Out[35]:
Well we can't graph 1,000+ Jobs so lets cut it down¶
In [36]:
occupation_df = occupation_df[occupation_df['Total'] > 10000]
# remove these rows
occupation_df.drop(['INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
occupation_df.shape
Out[36]:
Total Ammount Donated by Career¶
In [37]:
occupation_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[37]:
In [38]:
occupation_count = df.pivot_table('contb_receipt_amt',index='contbr_occupation',columns = 'Party',aggfunc='count')
occupation_count["Total"] = occupation_count.sum(axis=1)
occupation_count = occupation_count.sort_values(by="Total",ascending=True)
occupation_count.shape
Out[38]:
In [39]:
occupation_count = occupation_count[occupation_count['Total'] > 100]
occupation_count.drop(['INFORMATION REQUESTED', 'RETIRED','DISABLED'],axis=0,inplace=True)
occupation_count.loc['REGISTERED NURSE'] = occupation_count.loc['REGISTERED NURSE'] + occupation_count.loc['RN']
occupation_count.drop('RN', inplace=True)
occupation_count.shape
Out[39]:
How Many Donated From Each Occupation¶
In [40]:
occupation_count[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[40]:
Looking at Donations by Company¶
In [41]:
employer_df = df.pivot_table('contb_receipt_amt',index='contbr_employer',columns = 'Party',aggfunc='sum')
employer_df["Total"] = employer_df.sum(axis=1)
employer_df = employer_df[employer_df['Total'] > 5000]
employer_df.shape
Out[41]:
In [42]:
employer_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
# combine like rows
employer_df.loc['SELF-EMPLOYED'] = employer_df.loc['SELF-EMPLOYED'] + employer_df.loc['SELF EMPLOYED'] + employer_df.loc['SELF']
employer_df.loc['NOT EMPLOYED'] = employer_df.loc['NOT EMPLOYED'] + employer_df.loc['NONE']
# remove self employed without dash
employer_df.drop('SELF EMPLOYED',inplace=True)
employer_df.drop('SELF',inplace=True)
employer_df.drop('NONE',inplace=True)
# These were far to large to allow us to see anything else
employer_df.drop('HOMEMAKER',inplace=True)
employer_df.drop('NOT EMPLOYED',inplace=True)
employer_df.drop('SELF-EMPLOYED',inplace=True)
employer_df = employer_df.sort_values(by="Total",ascending=True)
Total donations by Company¶
In [43]:
employer_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[43]:
In [44]:
employer_cnt = df.pivot_table('contb_receipt_amt',index='contbr_employer',columns = 'Party',aggfunc='count')
employer_cnt["Total"] = employer_cnt.sum(axis=1)
employer_cnt = employer_cnt[employer_cnt['Total'] > 50]
employer_cnt.shape
Out[44]:
In [45]:
employer_cnt.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED'],axis=0,inplace=True)
# combine like rows
employer_cnt.loc['SELF-EMPLOYED'] = employer_cnt.loc['SELF-EMPLOYED'] + employer_cnt.loc['SELF EMPLOYED'] + employer_cnt.loc['SELF'] + employer_cnt.loc['SELF EMPLOYED-EMPLOYED']
employer_cnt.loc['NOT EMPLOYED'] = employer_cnt.loc['NOT EMPLOYED'] + employer_cnt.loc['NONE'] + employer_cnt.loc['UNEMPLOYED']
employer_cnt.loc['UNIVERSITY OF RHODE ISLAND'] = employer_cnt.loc['UNIVERSITY OF RHODE ISLAND'] + employer_cnt.loc['UNIVERSITY OF RHODE ISLAND']
# remove self employed without dash
employer_cnt.drop('SELF EMPLOYED',inplace=True)
employer_cnt.drop('SELF EMPLOYED-EMPLOYED',inplace=True)
employer_cnt.drop('SELF',inplace=True)
employer_cnt.drop('NONE',inplace=True)
employer_cnt.drop('UNEMPLOYED',inplace=True)
employer_cnt.drop('UNIVERSITY OF RI',inplace=True)
# These were far to large to allow us to see anything else
employer_cnt.drop('NOT EMPLOYED',inplace=True)
employer_cnt.drop('SELF-EMPLOYED',inplace=True)
employer_cnt = employer_cnt.sort_values(by="Total",ascending=True)
Count of how many donations per Company¶
In [46]:
employer_cnt[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
Out[46]:
Counties¶
In [47]:
df['County'].value_counts()
Out[47]:
Total Ammount Donated by County¶
In [48]:
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='sum')
county_df["Total"] = county_df.sum(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')
Out[48]:
Average Donation by County¶
In [49]:
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='mean')
county_df["Total"] = county_df.mean(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')
Out[49]:
Count of Donations by County¶
In [50]:
county_df = df.pivot_table('contb_receipt_amt',index='County',columns = 'Party',aggfunc='count')
county_df["Total"] = county_df.sum(axis=1)
county_df = county_df.sort_values(by="Total",ascending=True)
county_df[['Democrat','Republican']].plot(kind='barh',figsize=(12,6),cmap='bwr')
Out[50]: