import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn import datasets, linear_model
from scipy import stats as st
import statsmodels.api as sm

# silence pandas' chained-assignment warning for the whole notebook; later
# cells add a 'party' column to filtered frames on purpose
pd.options.mode.chained_assignment = None


# read in data and store as a dataframe
df = pd.read_csv('https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/covid_tracking/latest/covid_tracking.csv')

# remove deprecated columns based on data documentation
df = df.drop(df.columns[df.columns.str.contains('increase')], axis=1)
df = df.drop(labels=['date_checked','hospitalized','pos_neg','total'], axis=1)

# since we're looking at political leanings, we won't consider territories or DC
non_state_ids = ['AS','GU', 'MP', 'PR', 'VI','DC']
# filter with the list itself so the codes are spelled out in exactly one place
df = df[~df['state'].isin(non_state_ids)]

df


# census population estimates, one row per geographic area
populations = pd.read_excel('NST-EST2021-POP.xlsx')

# full state name -> two-letter postal abbreviation
abbreviations = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
}

# rows of the sheet that aren't actual data rows
non_data_rows = [0, 1, 2, 59, 61, 62, 63, 64, 65]
# rows that describe non-state geographic areas
non_state_rows = [3, 4, 5, 6, 7, 16, 60]
populations = populations.drop(index=non_data_rows + non_state_rows)

# rename the columns for greater clarity
populations.columns = ['State', '4/1/20 Population', '7/1/20 Population', '7/1/21 Population']

# the covid dataset uses abbreviations, so add a matching column here;
# the first character of each 'State' value is skipped before the lookup
populations['ID'] = [abbreviations[name[1:]] for name in populations['State']]

populations


# keep only the state id and the cumulative counts used by the linear regression
# models; naming the columns avoids dropping by substring patterns such as
# 'in' or 'on', which match any column that merely contains those letters
lr_columns = ['state', 'positive', 'hospitalized_cumulative', 'recovered', 'death']
lrdata = df[lr_columns]
lrdata.head()


# group the dataframe by state, and only take the max values in each column for each state
lrdata = lrdata.groupby('state').max()
lrdata.head()


# standardize columns by the population of each state so we can compare them fairly
# (build the abbreviation -> population lookup once instead of filtering
# `populations` again for every state)
population_by_id = dict(zip(populations['ID'], populations['4/1/20 Population']))
lrdata['population'] = [population_by_id[state_id] for state_id in lrdata.index]

for count_column in ['positive', 'hospitalized_cumulative', 'recovered', 'death']:
    lrdata[count_column] = lrdata[count_column] / lrdata['population']
lrdata = lrdata.reset_index()
lrdata.head()


# lrdata already holds exactly one row per state, so indexing by state is
# all that is needed here (no second aggregation)
covid_data = lrdata.set_index("state")

print(covid_data.count())
covid_data

positive                   50
hospitalized_cumulative    39
recovered                  34
death                      50
population                 50
dtype: int64


# read table
elections = pd.read_csv("president_county_candidate.csv")

# group table by state and democratic party, remove DC
# (numeric_only=True keeps non-numeric columns out of the sums)
filtered_dem = elections[elections["party"] == "DEM"]
grouped_dem = filtered_dem.groupby(["state", "party"]).sum(numeric_only=True)
grouped_dem = grouped_dem.drop(index=["District of Columbia"])

# group table by state and republican party, remove DC
filtered_rep = elections[elections["party"] == "REP"]
grouped_rep = filtered_rep.groupby(["state", "party"]).sum(numeric_only=True)
grouped_rep = grouped_rep.drop(index=["District of Columbia"])

# get totals
total = elections.groupby("state").sum(numeric_only=True)
total = total.drop(index=["District of Columbia"])

# democratic and republican share of each state's votes; dropping the
# 'party' level leaves both sides indexed by state name so the division aligns
total["dem_pct"] = grouped_dem["total_votes"].droplevel("party") / total["total_votes"]
total["rep_pct"] = grouped_rep["total_votes"].droplevel("party") / total["total_votes"]

# change to state abbreviations instead of full names, set as index
# (looked up by name, so a missing or reordered state raises a KeyError
# instead of silently mislabelling rows)
total["stateID"] = [abbreviations[state_name] for state_name in total.index]
total = total.set_index("stateID")

# total percentage of votes (so we can account for third party votes)
sum_column = total["dem_pct"] + total["rep_pct"]
total["total_pct"] = sum_column
total.head()


# put the per-capita covid columns and the vote shares side by side,
# matched on the state abbreviation index
total_pol = pd.concat(
    [covid_data, total["dem_pct"], total["rep_pct"], total["total_pct"]],
    axis=1,
)
total_pol.head()


# label each state "blue" when the democratic share is strictly larger,
# "red" otherwise
leans_blue = total_pol["dem_pct"] > total_pol["rep_pct"]
total_pol["party"] = np.where(leans_blue, "blue", "red")
total_pol.head()


# two-sample t-test (equal variances) on positive rates: red vs. blue states
is_red = total_pol["party"] == "red"
is_blue = total_pol["party"] == "blue"
a = total_pol.loc[is_red, "positive"].to_numpy()
b = total_pol.loc[is_blue, "positive"].to_numpy()
st.ttest_ind(a=a, b=b, equal_var=True)

Ttest_indResult(statistic=3.794502335154351, pvalue=0.0004147543210582081)


# set x and y
x = pd.to_numeric(total_pol['rep_pct'])
y = pd.to_numeric(total_pol['positive'])

# plot percent of republican votes and positive rates
fig, ax = plt.subplots()
ax.scatter(x,y)

# annotate each point with its state abbreviation; x and y are indexed by
# state label, so walk the values in order instead of indexing with integers
for txt, x_val, y_val in zip(total_pol.index, x, y):
    ax.annotate(txt, (x_val, y_val), fontsize = 12)

# labelling the graph
ax.set_title("Positive Cases vs. Percentage of Republican Votes", fontsize = 12)
ax.set_ylabel("Positive Cases", fontsize = 12)
ax.set_xlabel("Percentage of Republican Votes", fontsize = 12)
fig.set_figwidth(18)
fig.set_figheight(12)
ax

<AxesSubplot:title={'center':'Positive Cases vs. Percentage of Republican Votes'}, xlabel='Percentage of Republican Votes', ylabel='Positive Cases'>


# set vars for x and y as column vectors (n_samples, 1)
rep_votes = total_pol['rep_pct'].to_numpy().reshape(-1, 1)
pos_rate = total_pol["positive"].to_numpy().reshape(-1, 1)

# create regression
regr = LinearRegression()
regr.fit(rep_votes, pos_rate)
inter = regr.intercept_
coeff = regr.coef_
pred = regr.predict(rep_votes)

# plotting the points and the fitted line
fig, ax = plt.subplots()
ax.plot(total_pol['rep_pct'], total_pol["positive"], marker='o', linestyle = 'none')
ax.plot(rep_votes, pred, color="red", linewidth=3)
ax.set_title("Positive cases vs. Percentage of Republican Votes in the 2020 Election")
ax.set_ylabel('Percent of positive cases (adjusted by population)')
ax.set_xlabel('Percentage of Republican Votes in 2020 Election')
fig.set_figwidth(18)
fig.set_figheight(12)

# annotate straight from total_pol so this cell does not depend on x / y
# left behind by another cell
for txt, x_val, y_val in zip(total_pol.index, total_pol['rep_pct'], total_pol['positive']):
    ax.annotate(txt, (x_val, y_val), fontsize = 12)

print("Equation of regression line: y = " + str(coeff[0][0]) + "x + " + str(inter[0]))

Equation of regression line: y = 0.1328563831541722x + 0.01975976152636394


# ordinary least squares of positive rate on republican vote share,
# with an intercept term added to the design matrix
X = total_pol['rep_pct'].to_numpy()
y = total_pol["positive"].to_numpy()

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()

summary_table = est2.summary()
print(summary_table)

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.329
Model:                            OLS   Adj. R-squared:                  0.315
Method:                 Least Squares   F-statistic:                     23.57
Date:                Tue, 10 May 2022   Prob (F-statistic):           1.32e-05
Time:                        23:19:43   Log-Likelihood:                 126.13
No. Observations:                  50   AIC:                            -248.3
Df Residuals:                      48   BIC:                            -244.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0198      0.014      1.415      0.163      -0.008       0.048
x1             0.1329      0.027      4.854      0.000       0.078       0.188
==============================================================================
Omnibus:                        2.230   Durbin-Watson:                   1.740
Prob(Omnibus):                  0.328   Jarque-Bera (JB):                1.626
Skew:                          -0.438   Prob(JB):                        0.443
Kurtosis:                       3.111   Cond. No.                         12.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# positive cases vs. cumulative hospitalizations (both per capita)
# keep only the states that report both values
lr_hospital = lrdata.dropna(subset=['positive', 'hospitalized_cumulative']).reset_index(drop=True)

x = pd.to_numeric(lr_hospital['positive'])
y = pd.to_numeric(lr_hospital['hospitalized_cumulative'])

# scatter plot of the two rates
fig, ax = plt.subplots()
ax.scatter(x, y)

# put the state abbreviation next to each point
for state_id, x_val, y_val in zip(lr_hospital.state, x, y):
    ax.annotate(state_id, (x_val, y_val), fontsize=12)

# least-squares line (degree-1 polynomial fit)
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color="red")

# titles, axis labels and a larger figure
ax.set_title("Positive Cases vs. Cumulative Hospitalized", fontsize=12)
ax.set_xlabel("Positive Cases", fontsize=12)
ax.set_ylabel("Cumulative Hospitalized", fontsize=12)
fig.set_size_inches(15, 10)
ax

<AxesSubplot:title={'center':'Positive Cases vs. Cumulative Hospitalized'}, xlabel='Positive Cases', ylabel='Cumulative Hospitalized'>


# positive cases vs. cumulative recoveries (both per capita)
# keep only the states that report both values
lr_recovered = lrdata.dropna(subset=['positive', 'recovered']).reset_index(drop=True)

x = pd.to_numeric(lr_recovered['positive'])
y = pd.to_numeric(lr_recovered['recovered'])

# scatter plot of the two rates
fig, ax = plt.subplots()
ax.scatter(x, y)

# put the state abbreviation next to each point
for state_id, x_val, y_val in zip(lr_recovered.state, x, y):
    ax.annotate(state_id, (x_val, y_val), fontsize=12)

# least-squares line (degree-1 polynomial fit)
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color="red")

# titles, axis labels and a larger figure
ax.set_title("Positive Cases vs. Cumulative Recovered", fontsize=12)
ax.set_xlabel("Positive Cases", fontsize=12)
ax.set_ylabel("Cumulative Recovered", fontsize=12)
fig.set_size_inches(15, 10)
ax

<AxesSubplot:title={'center':'Positive Cases vs. Cumulative Recovered'}, xlabel='Positive Cases', ylabel='Cumulative Recovered'>


# lin reg for positive to death relationship
# remove NaN values and renumber the rows, same as the hospitalized and
# recovered plots do
lr_death = lrdata.dropna(subset=['positive', 'death']).reset_index(drop=True)

x = pd.to_numeric(lr_death['positive'])
y = pd.to_numeric(lr_death['death'])
# create plot for positive to death relationship
fig, ax = plt.subplots()
ax.scatter(x,y)
# annotate the points with state names; pairing the values directly keeps
# the labels correct even when dropna removes rows
for txt, x_val, y_val in zip(lr_death.state, x, y):
    ax.annotate(txt, (x_val, y_val), fontsize = 12)
# create and plot lin reg line
m, b = np.polyfit(x, y, 1)
ax.plot(x, m*x + b, color="red")
# label and enlarge the graph
ax.set_title("Positive Cases vs. Cumulative Death", fontsize = 12)
ax.set_xlabel("Positive Cases", fontsize = 12)
ax.set_ylabel("Cumulative Death", fontsize = 12)
fig.set_figwidth(15)
fig.set_figheight(10)
ax

<AxesSubplot:title={'center':'Positive Cases vs. Cumulative Death'}, xlabel='Positive Cases', ylabel='Cumulative Death'>


# start the clustering table from a copy so df itself stays untouched
kmdata = df.copy()

# non numerical columns
non_numeric_columns = [
    'date', 'data_quality_grade', 'last_update_et', 'hash', 'load_time',
    'iso_subdivision', 'iso_country', 'fips', 'fips_code',
]
# non cumulative columns
non_cumulative_columns = ['hospitalized_currently', 'in_icu_currently', 'on_ventilator_currently']
kmdata = kmdata.drop(columns=non_numeric_columns + non_cumulative_columns)

# one row per state: the max of each cumulative column
kmdata = kmdata.groupby('state').max()
kmdata.head()


# keep only the columns that have a value for every state
kmdata = kmdata.dropna(axis=1)
kmdata.head()


# attach each state's 4/1/20 population to make the per capita maths easier
kmdata['population'] = [
    populations.loc[populations['ID'] == state_id, '4/1/20 Population'].values[0]
    for state_id in kmdata.index
]

# divide the counts by population, then discard the population column
percapita = kmdata.copy()
for count_column in ['positive', 'death', 'total_test_results']:
    percapita[count_column] = kmdata[count_column] / kmdata['population']
percapita = percapita.drop(columns=['population'])
percapita.head()


# standardize every per capita column with StandardScaler
kmdata_scaled = percapita.copy()
kmdata_scaled[percapita.columns] = StandardScaler().fit_transform(percapita)
kmdata_scaled.head()


# 3-D scatter of the standardized per capita statistics, before clustering
ax = plt.axes(projection='3d')
ax.scatter3D(kmdata_scaled['positive'], kmdata_scaled['death'], kmdata_scaled['total_test_results'], color='teal')
ax.set_title('States by per capita COVID statistics (standardized)')
ax.set_xlabel('Positive cases', rotation=150)
ax.set_ylabel('Deaths')
ax.set_zlabel('Total tests', rotation=60)
plt.show()


X = kmdata_scaled.values

# random_state makes the cluster numbering (and the counts printed for each
# cluster further down) repeatable between runs; n_init is set explicitly so
# the fit does not depend on the installed scikit-learn's default
km = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
y = km.fit_predict(X)
kmdata_scaled['group'] = y


group0 = kmdata_scaled[kmdata_scaled['group'] == 0]
group1 = kmdata_scaled[kmdata_scaled['group'] == 1]

# same 3-D scatter, colored by cluster
ax = plt.axes(projection='3d')
ax.scatter3D(group0['positive'], group0['death'], group0['total_test_results'], color='forestgreen', label='Cluster 1')
ax.scatter3D(group1['positive'], group1['death'], group1['total_test_results'], color='deeppink',  label='Cluster 2')
ax.set_title('States by per capita COVID statistics (standardized), Grouped')
ax.set_xlabel('Positive cases', rotation=150)
ax.set_ylabel('Deaths')
ax.set_zlabel('Total tests', rotation=60)
plt.legend(bbox_to_anchor=(0, 0.5))

plt.show()


# tag each state in cluster 0 with its party from total_pol; the lookup is
# by state label, so no positional indexing into a label-indexed Series
group0['party'] = total_pol.loc[group0.index, 'party']

print('Red:', len(group0[group0['party'] == 'red']), '/', len(group0))
print('Blue:', len(group0[group0['party'] == 'blue']), '/', len(group0))

group0

Red: 24 / 43
Blue: 19 / 43


# tag each state in cluster 1 with its party from total_pol; the lookup is
# by state label, so no positional indexing into a label-indexed Series
group1['party'] = total_pol.loc[group1.index, 'party']

print('Red:', len(group1[group1['party'] == 'red']), '/', len(group1))
print('Blue:', len(group1[group1['party'] == 'blue']), '/', len(group1))

group1

Red: 1 / 7
Blue: 6 / 7


# cluster again on positive cases and deaths only: drop the testing column
# and the cluster labels from the three-dimensional run
kmdata_scaled = kmdata_scaled.drop(labels=['total_test_results', 'group'], axis=1)
kmdata_scaled.head()


fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(kmdata_scaled['positive'], kmdata_scaled['death'], color='teal')

ax.set_title('States by Per Capita COVID Statistics (Standardized)')
ax.set_xlabel('Positive Cases')
ax.set_ylabel('Deaths')

# label each point on the graph, nudged slightly so the text clears the marker
for state_id, pos_val, death_val in zip(kmdata_scaled.index, kmdata_scaled['positive'], kmdata_scaled['death']):
    ax.annotate(state_id, (pos_val + 0.07, death_val - 0.07))

plt.show()


X = kmdata_scaled.values

# random_state makes the cluster numbering (and the counts printed for each
# cluster further down) repeatable between runs; n_init is set explicitly so
# the fit does not depend on the installed scikit-learn's default
km = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
y = km.fit_predict(X)
kmdata_scaled['group'] = y


group0 = kmdata_scaled[kmdata_scaled['group'] == 0]
group1 = kmdata_scaled[kmdata_scaled['group'] == 1]

fig = plt.figure()
ax = fig.add_subplot(111)

ax.scatter(group0['positive'], group0['death'], color='forestgreen', label='Cluster 1')
ax.scatter(group1['positive'], group1['death'], color='deeppink', label='Cluster 2')

ax.set_title('States by Per Capita COVID Statistics (Standardized), Grouped')
ax.set_xlabel('Positive Cases')
ax.set_ylabel('Deaths')

# label each point on the graph, nudged slightly so the text clears the marker
for state_id, pos_val, death_val in zip(kmdata_scaled.index, kmdata_scaled['positive'], kmdata_scaled['death']):
    ax.annotate(state_id, (pos_val + 0.07, death_val - 0.07))

plt.legend(bbox_to_anchor=(1.3, 0.6))
plt.show()


# tag each state in cluster 0 with its party from total_pol; the lookup is
# by state label, so no positional indexing into a label-indexed Series
group0['party'] = total_pol.loc[group0.index, 'party']

print('Red:', len(group0[group0['party'] == 'red']), '/', len(group0))
print('Blue:', len(group0[group0['party'] == 'blue']), '/', len(group0))

group0

Red: 24 / 40
Blue: 16 / 40


# tag each state in cluster 1 with its party from total_pol; the lookup is
# by state label, so no positional indexing into a label-indexed Series
group1['party'] = total_pol.loc[group1.index, 'party']

print('Red:', len(group1[group1['party'] == 'red']), '/', len(group1))
print('Blue:', len(group1[group1['party'] == 'blue']), '/', len(group1))

group1

Red: 1 / 10
Blue: 9 / 10


# split both clusters by party so each combination gets its own color
group0r = group0[group0['party'] == 'red']
group0b = group0[group0['party'] == 'blue']
group1r = group1[group1['party'] == 'red']
group1b = group1[group1['party'] == 'blue']

fig = plt.figure()
ax = fig.add_subplot(111)

# one scatter series per (cluster, party) pair, drawn in this order
cluster_party_series = [
    (group0r, 'forestgreen', 'Cluster 1, Red'),
    (group0b, 'lime', 'Cluster 1, Blue'),
    (group1r, 'deeppink', 'Cluster 2, Red'),
    (group1b, 'fuchsia', 'Cluster 2, Blue'),
]
for subset, point_color, legend_label in cluster_party_series:
    ax.scatter(subset['positive'], subset['death'], color=point_color, label=legend_label)

ax.set_title('States by Per Capita COVID Statistics (Standardized), Grouped by Cluster and Party')
ax.set_xlabel('Positive Cases')
ax.set_ylabel('Deaths')

# legend anchored to the right of the axes
plt.legend(bbox_to_anchor=(1.4, 0.6))
plt.show()

	date	state	positive	negative	pending	hospitalized_currently	hospitalized_cumulative	in_icu_currently	in_icu_cumulative	on_ventilator_currently	...	data_quality_grade	last_update_et	hash	death	total_test_results	fips	fips_code	iso_subdivision	load_time	iso_country
0	2021-03-07	AK	56886.0	NaN	NaN	33.0	1293.0	NaN	NaN	2.0	...	NaN	2021-03-05 03:59:00	dc4bccd4bb885349d7e94d6fed058e285d4be164	305.0	1731628.0	2	2	US-AK	2022-01-11 00:04:58	US
1	2021-03-07	AL	499819.0	1931711.0	NaN	494.0	45976.0	NaN	2676.0	NaN	...	NaN	2021-03-07 11:00:00	997207b430824ea40b8eb8506c19a93e07bc972e	10148.0	2323788.0	1	1	US-AL	2022-01-11 00:04:58	US
2	2021-03-07	AR	324818.0	2480716.0	NaN	335.0	14926.0	141.0	NaN	65.0	...	NaN	2021-03-07 00:00:00	50921aeefba3e30d31623aa495b47fb2ecc72fae	5319.0	2736442.0	5	5	US-AR	2022-01-11 00:04:58	US
4	2021-03-07	AZ	826454.0	3073010.0	NaN	963.0	57907.0	273.0	NaN	143.0	...	NaN	2021-03-07 00:00:00	0437a7a96f4471666f775e63e86923eb5cbd8cdf	16328.0	7908105.0	4	4	US-AZ	2022-01-11 00:04:58	US
5	2021-03-07	CA	3501394.0	NaN	NaN	4291.0	NaN	1159.0	NaN	NaN	...	NaN	2021-03-07 02:59:00	63c5c0fd2daef2fb65150e9db486de98ed3f7b72	NaN	49646014.0	6	6	US-CA	2022-01-11 00:04:58	US
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
22256	2020-01-17	WA	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	7cefac6b3681020741ca30f45399a7b22f2e45b4	NaN	NaN	53	53	US-WA	2022-01-11 00:04:58	US
22257	2020-01-16	WA	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	650501e005a5ee86d93c5f32dda56735ea2af967	NaN	NaN	53	53	US-WA	2022-01-11 00:04:58	US
22258	2020-01-15	WA	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	4987e61aad88182abfe641033b597304c2153d4f	NaN	NaN	53	53	US-WA	2022-01-11 00:04:58	US
22259	2020-01-14	WA	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1881c8a2f0d337b22066b4f05df06eb2259e8d57	NaN	NaN	53	53	US-WA	2022-01-11 00:04:58	US
22260	2020-01-13	WA	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	12b994ad07c276a5278a2465e081751688739765	NaN	NaN	53	53	US-WA	2022-01-11 00:04:58	US

	State	4/1/20 Population	7/1/20 Population	7/1/21 Population	ID
8	.Alabama	5024279	5024803	5039877.0	AL
9	.Alaska	733391	732441	732673.0	AK
10	.Arizona	7151502	7177986	7276316.0	AZ
11	.Arkansas	3011524	3012232	3025891.0	AR
12	.California	39538223	39499738	39237836.0	CA
13	.Colorado	5773714	5784308	5812069.0	CO
14	.Connecticut	3605944	3600260	3605597.0	CT
15	.Delaware	989948	991886	1003384.0	DE
17	.Florida	21538187	21569932	21781128.0	FL
18	.Georgia	10711908	10725800	10799566.0	GA
19	.Hawaii	1455271	1451911	1441553.0	HI
20	.Idaho	1839106	1847772	1900923.0	ID
21	.Illinois	12812508	12785245	12671469.0	IL
22	.Indiana	6785528	6785644	6805985.0	IN
23	.Iowa	3190369	3188669	3193079.0	IA
24	.Kansas	2937880	2935880	2934582.0	KS
25	.Kentucky	4505836	4503958	4509394.0	KY
26	.Louisiana	4657757	4651203	4624047.0	LA
27	.Maine	1362359	1362280	1372247.0	ME
28	.Maryland	6177224	6172679	6165129.0	MD
29	.Massachusetts	7029917	7022220	6984723.0	MA
30	.Michigan	10077331	10067664	10050811.0	MI
31	.Minnesota	5706494	5707165	5707390.0	MN
32	.Mississippi	2961279	2956870	2949965.0	MS
33	.Missouri	6154913	6154481	6168187.0	MO
34	.Montana	1084225	1086193	1104271.0	MT
35	.Nebraska	1961504	1961455	1963692.0	NE
36	.Nevada	3104614	3114071	3143991.0	NV
37	.New Hampshire	1377529	1377848	1388992.0	NH
38	.New Jersey	9288994	9279743	9267130.0	NJ
39	.New Mexico	2117522	2117566	2115877.0	NM
40	.New York	20201249	20154933	19835913.0	NY
41	.North Carolina	10439388	10457177	10551162.0	NC
42	.North Dakota	779094	778962	774948.0	ND
43	.Ohio	11799448	11790587	11780017.0	OH
44	.Oklahoma	3959353	3962031	3986639.0	OK
45	.Oregon	4237256	4241544	4246155.0	OR
46	.Pennsylvania	13002700	12989625	12964056.0	PA
47	.Rhode Island	1097379	1096229	1095610.0	RI
48	.South Carolina	5118425	5130729	5190705.0	SC
49	.South Dakota	886667	887099	895376.0	SD
50	.Tennessee	6910840	6920119	6975218.0	TN
51	.Texas	29145505	29217653	29527941.0	TX
52	.Utah	3271616	3281684	3337975.0	UT
53	.Vermont	643077	642495	645570.0	VT
54	.Virginia	8631393	8632044	8642274.0	VA
55	.Washington	7705281	7718785	7738692.0	WA
56	.West Virginia	1793716	1789798	1782959.0	WV
57	.Wisconsin	5893718	5892323	5895908.0	WI
58	.Wyoming	576851	577267	578803.0	WY

	positive	hospitalized_cumulative	recovered	death
state
AK	56886.0	1293.0	7165.0	305.0
AL	499819.0	45976.0	295690.0	10149.0
AR	324818.0	14926.0	315517.0	5417.0
AZ	826454.0	57907.0	NaN	16328.0
CA	3501394.0	NaN	NaN	32291.0

	state	positive	hospitalized_cumulative	recovered	death	population
0	AK	0.077566	0.001763	0.009770	0.000416	733391
1	AL	0.099481	0.009151	0.058852	0.002020	5024279
2	AR	0.107858	0.004956	0.104770	0.001799	3011524
3	AZ	0.115564	0.008097	NaN	0.002283	7151502
4	CA	0.088557	NaN	NaN	0.000817	39538223

	positive	hospitalized_cumulative	recovered	death	population
state
AK	0.077566	0.001763	0.009770	0.000416	733391
AL	0.099481	0.009151	0.058852	0.002020	5024279
AR	0.107858	0.004956	0.104770	0.001799	3011524
AZ	0.115564	0.008097	NaN	0.002283	7151502
CA	0.088557	NaN	NaN	0.000817	39538223
CO	0.075619	0.004140	NaN	0.001037	5773714
CT	0.079128	0.003825	0.002718	0.002136	3605944
DE	0.089251	NaN	0.019042	0.001488	989948
FL	0.088643	0.003818	NaN	0.001498	21538187
GA	0.095547	0.005302	NaN	0.001672	10711908
HI	0.019721	0.001530	0.008217	0.000306	1455271
IA	0.088511	0.000060	0.100319	0.001742	3190369
ID	0.094030	0.003906	0.052209	0.001022	1839106
IL	0.093529	NaN	NaN	0.001796	12812508
IN	0.098336	0.006369	NaN	0.001877	6785528
KS	0.100706	0.003195	NaN	0.001639	2937880
KY	0.091150	0.004318	0.010685	0.001070	4505836
LA	0.093132	NaN	0.089303	0.002093	4657757
MA	0.084120	0.002804	0.072369	0.002335	7029917
MD	0.062701	0.005771	0.001571	0.001288	6177224
ME	0.033614	0.001152	0.009425	0.000518	1362359
MI	0.065104	NaN	0.054566	0.001653	10077331
MN	0.085869	0.004552	0.083423	0.001148	5706494
MO	0.078091	NaN	NaN	0.001326	6154913
MS	0.100491	0.003094	0.093933	0.002299	2961279
MT	0.093075	0.004270	0.090335	0.001274	1084225
NC	0.083547	NaN	NaN	0.001102	10439388
ND	0.128856	0.004980	0.126206	0.001897	779094
NE	0.103505	0.003180	0.080114	0.001077	1961504
NH	0.055796	0.000821	0.053440	0.000860	1377529
NJ	0.087481	0.007025	NaN	0.002538	9288994
NM	0.088274	0.006258	0.073933	0.001798	2117522
NV	0.095403	NaN	NaN	0.001622	3104614
NY	0.083221	NaN	NaN	0.001620	20201249
OH	0.082925	0.004312	0.078449	0.001496	11799448
OK	0.108350	0.006145	0.104102	0.001145	3959353
OR	0.037071	0.002057	0.001385	0.000542	4237256
PA	0.072957	0.000088	0.066275	0.001873	13002700
RI	0.117353	0.008220	NaN	0.002321	1097379
SC	0.102740	0.004049	0.045091	0.001710	5118425
SD	0.128108	0.007562	0.123531	0.002143	886667
TN	0.113185	0.002730	0.109508	0.001670	6910840
TX	0.092186	NaN	0.085866	0.001111	29145505
UT	0.114576	0.004552	0.109719	0.000604	3271616
VA	0.067857	0.002857	NaN	0.001112	8631393
VT	0.025009	0.000078	0.020812	0.000323	643077
WA	0.044714	0.002544	NaN	0.000654	7705281
WI	0.105477	0.004489	0.093863	0.001206	5893718
WV	0.074396	NaN	0.069901	0.001296	1793716
WY	0.094936	0.002411	0.092832	0.001182	576851

CMSC320 Final Project: How Politics Impacts COVID¶

Joy Wang, Stephanie Wang, Lucinda Zhou¶

Data Curation and Parsing¶

Positive Infections and Party Affiliations¶

Grouping by state¶

Cleaning the Dataframe¶

2020 Election Dataset¶

Merging the tables and categorizing¶

Comparing red and blue states using t-test¶

Linear regression¶

Linear Regression Modeling for COVID Statistics¶

Graphing the Data and Lin Reg Models¶

Positive Cases vs. Cumulative Hospitalized¶

Positive Cases vs. Cumulative Recovered¶

Positive Cases vs. Cumulative Death¶

General Observations of Lin Reg Models¶

Using K-Means Clustering to Group States¶

Data Cleaning¶

Clustering with Three Dimensions¶

Clustering with Two Dimensions¶

Conclusion¶

	total_votes	won	dem_pct	rep_pct	total_pct
stateID
AL	2323304	67	0.365707	0.620310	0.986016
AK	391346	40	0.391993	0.485228	0.877221
AZ	3387326	15	0.493647	0.490560	0.984207
AR	1219069	75	0.347751	0.623957	0.971708
CA	17495906	58	0.634992	0.343278	0.978270

	positive	negative	pending	hospitalized_cumulative	in_icu_cumulative	on_ventilator_cumulative	recovered	death	total_test_results
state
AK	56886.0	NaN	14.0	1293.0	NaN	NaN	7165.0	305.0	1731628.0
AL	499819.0	1931711.0	46.0	45976.0	2676.0	1515.0	295690.0	10149.0	2323788.0
AR	324818.0	2480716.0	203.0	14926.0	43.0	1533.0	315517.0	5417.0	2736442.0
AZ	826454.0	3073010.0	130.0	57907.0	NaN	NaN	NaN	16328.0	7908105.0
CA	3501394.0	266839.0	15000.0	NaN	NaN	NaN	NaN	32291.0	49646014.0

	positive	death	total_test_results
state
AK	-0.361915	-1.793176	2.459706
AL	0.562421	1.045757	-1.333330
AR	0.915773	0.654221	-0.442024
AZ	1.240770	1.511503	-0.048180
CA	0.101685	-1.083800	0.251188

	positive	death	total_test_results	group	party
state
AL	0.562421	1.045757	-1.333330	0	red
AR	0.915773	0.654221	-0.442024	0	red
AZ	1.240770	1.511503	-0.048180	0	blue
CA	0.101685	-1.083800	0.251188	0	blue
CO	-0.444027	-0.693415	-0.037602	0	blue
CT	-0.296034	1.251904	1.355131	0	blue
DE	0.130955	0.104173	0.632439	0	blue
FL	0.105304	0.122094	-0.185243	0	red
GA	0.396488	0.429177	-0.884851	0	blue
IA	0.099754	0.553983	-1.426496	0	red
ID	0.332515	-0.721015	-1.555768	0	red
IL	0.311367	0.649719	0.649143	0	blue
IN	0.514140	0.792841	0.169382	0	red
KS	0.614083	0.371977	-1.393346	0	red
KY	0.211064	-0.636402	-0.494603	0	red
LA	0.294631	1.174706	0.185551	0	red
MA	-0.085471	1.603794	2.524227	0	blue
MD	-0.988875	-0.250070	0.361531	0	blue
MI	-0.887538	0.396293	-0.151568	0	blue
MN	-0.011697	-0.497804	0.232315	0	blue
MO	-0.339761	-0.182575	-0.775321	0	red
MS	0.605019	1.539554	-1.072023	0	red
MT	0.292229	-0.274979	-0.222203	0	red
NC	-0.109650	-0.579261	-0.403173	0	red
ND	1.801419	0.828223	1.397100	0	red
NE	0.732168	-0.622716	0.209154	0	red
NJ	0.056287	1.962240	0.108351	0	blue
NM	0.089738	0.653462	0.368822	0	blue
NV	0.390436	0.342150	-0.474737	0	blue
NY	-0.123384	0.337770	1.668295	0	blue
OH	-0.135864	0.119011	-0.520669	0	red
OK	0.936522	-0.502545	-0.453325	0	red
PA	-0.556286	0.784926	-0.616146	0	blue
RI	1.316251	1.578450	3.428763	0	blue
SC	0.699874	0.497657	-0.234318	0	red
SD	1.769860	1.263202	-0.675793	0	red
TN	1.140457	0.426835	-0.270552	0	red
TX	0.254757	-0.562149	-0.892771	0	red
UT	1.199128	-1.460269	-0.480284	0	red

	positive	death	total_test_results	group	party
state
AK	-0.361915	-1.793176	2.459706	1	red
HI	-2.801711	-1.988014	-0.683013	1	blue
ME	-2.215728	-1.612053	0.177191	1	blue
NH	-1.280110	-1.008042	-0.086006	1	blue
OR	-2.069911	-1.570213	-0.441665	1	blue
VT	-2.578642	-1.956760	1.235171	1	blue
WA	-1.747551	-1.371348	-0.858862	1	blue

	positive	death	group	party
state
AL	0.562421	1.045757	0	red
AR	0.915773	0.654221	0	red
AZ	1.240770	1.511503	0	blue
CA	0.101685	-1.083800	0	blue
CT	-0.296034	1.251904	0	blue
DE	0.130955	0.104173	0	blue
FL	0.105304	0.122094	0	red
GA	0.396488	0.429177	0	blue
IA	0.099754	0.553983	0	red
ID	0.332515	-0.721015	0	red
IL	0.311367	0.649719	0	blue
IN	0.514140	0.792841	0	red
KS	0.614083	0.371977	0	red
KY	0.211064	-0.636402	0	red
LA	0.294631	1.174706	0	red
MA	-0.085471	1.603794	0	blue
MI	-0.887538	0.396293	0	blue
MN	-0.011697	-0.497804	0	blue
MO	-0.339761	-0.182575	0	red
MS	0.605019	1.539554	0	red
MT	0.292229	-0.274979	0	red
NC	-0.109650	-0.579261	0	red
ND	1.801419	0.828223	0	red
NE	0.732168	-0.622716	0	red
NJ	0.056287	1.962240	0	blue
NM	0.089738	0.653462	0	blue
NV	0.390436	0.342150	0	blue
NY	-0.123384	0.337770	0	blue
OH	-0.135864	0.119011	0	red
OK	0.936522	-0.502545	0	red
PA	-0.556286	0.784926	0	blue
RI	1.316251	1.578450	0	blue
SC	0.699874	0.497657	0	red
SD	1.769860	1.263202	0	red
TN	1.140457	0.426835	0	red
TX	0.254757	-0.562149	0	red
UT	1.199128	-1.460269	0	red
WI	0.815348	-0.395377	0	blue
WV	-0.495615	-0.235210	0	red

	positive	death	group	party
state
AK	-0.361915	-1.793176	1	red
CO	-0.444027	-0.693415	1	blue
HI	-2.801711	-1.988014	1	blue
MD	-0.988875	-0.250070	1	blue
ME	-2.215728	-1.612053	1	blue
NH	-1.280110	-1.008042	1	blue
OR	-2.069911	-1.570213	1	blue
VA	-0.771413	-0.561622	1	blue
VT	-2.578642	-1.956760	1	blue
WA	-1.747551	-1.371348	1	blue