Predicting PredictIt: An Analysis of a Political Betting Forum and its Accuracy

Noah Fine

Introduction

It is tough to predict an election. Both camps taunt the other and claim their chances of victory are high, but in actuality, giving a precise estimate of either the final margin of an election or its win probability is very difficult.

In both the 2016 and 2020 U.S. Presidential elections, polling was at least somewhat inaccurate. In most cases, and certainly at the national level, the results were well within the margin of error. But this was not always the case, whether in the Rust Belt in 2016 or Florida in 2020, where the results fell well outside the polls' margin of error (even if, in some cases, the polling miss did not affect who actually won the state).

Models which aggregated polls, meanwhile, have had mixed results. The Huffington Post's model famously gave Hillary Clinton a 98% chance of winning the Presidential election in 2016, while the stats blog FiveThirtyEight gave her a less confident 71% chance of victory.

Prediction markets, though, are a very interesting case: in theory, their implied predictions should correlate with concrete factors which give data about the election, yet they seem to have outperformed the polls and even many models in both 2016 and 2020. But how does this work? As one article puts it: "The idea is straightforward: trade contingent claims in a market where the claims pay off as a function of something one is interested in forecasting. If structured correctly, the prices should reflect the expected payoffs to the claims, and therefore the expected outcome of the event of interest." In other words, if a betting market is structured well, its pricing will reflect the probability of some event occurring. Similarly to how the wisdom of the crowd can effectively determine the number of jelly beans in a jar, the thinking goes, perhaps the wisdom of the crowd can also determine the true probability of some event in politics.

In this tutorial, we will examine the accuracy and pricing of one specific political betting website, PredictIt, in their markets for the victors in the thirty-five 2020 U.S. Senate races. In these markets, we will examine PredictIt's overall accuracy, explore how PredictIt's pricing and accuracy relate to other factors (such as time and trade volume), and compare PredictIt's accuracy to a statistical model's accuracy (FiveThirtyEight). If you'd like to delve into this topic with prediction markets more generally, though, I highly recommend reading this article.

But first: a dive into how PredictIt works

In many sports-betting markets, prices are set by "Vegas" - centralized sportsbooks who offer odds at a certain price based on, ultimately, what they believe will enable them to make the highest profit.

PredictIt, though, works much more like the stock market than like conventional sports-betting markets - users make buy/sell offers at the price at which they want to buy/sell a contract, and are then matched either by other users who want to perform the opposite action (sell/buy) at the same price (we will call this (1)); or, in the case of a buy, by users who want to purchase the complementary contract at a complementary price (we will call this (2)). We will explain (1) and (2) in more detail.

Contracts are for a given event E, are priced between 1 cent and 99 cents (inclusive), and are binary. If you purchase a "Yes" contract for event E, and E occurs, you receive one dollar (though this isn't entirely true, as I will explain shortly), but if E does not occur, your contract resolves to 0 cents, and you lose the money you used to purchase the contract. The inverse is true for the "No" contract (which is complementary to the "Yes" contract) - if you purchase the "No" contract for event E, and E does not occur, you receive one dollar, but if E occurs, your contract resolves to 0 cents, and you lose the money you used to purchase the contract. These are the basics, but the full rules for PredictIt can be found [here](https://www.predictit.org/support/how-to-trade-on-predictit).

Let's give examples of each of the two types of transactions mentioned above.

(1) - this is when a buyer/seller is matched by another user who wants to perform the opposite action at the same price: I believe that Democrats have a 52% chance of winning North Carolina, and I want to make a small profit, so I put in a bid to buy a "Yes" contract in that market at 50 cents. You believe that Democrats only have a 48% chance of winning North Carolina, and you own "Yes" contracts in that market; when you see my offer to buy a "Yes" contract at 50 cents, you offer to sell at 50 cents. The transaction goes through - I get your contract, and you get 50 cents from me.

(2) - this is when users want to purchase the complimentary contract at a complimentary price: I believe that Democrats have a 52% chance of winning North Carolina, and I want to make a small profit, so I put in a bid to buy a "Yes" contract in that market at 50 cents. You believe that Democrats only have a 48% chance of winning North Carolina, and you want to make a small profit, so you put in a bid to buy a "No" contract in that market at 50 cents. The transaction goes through - we each get contracts for 50 cents, and our money goes to PredictIt (most of it will come back as payout at the end).

However, there is an important factor which has been omitted up until now - PredictIt takes 10% of the profits made on each contract. This is significant - suppose I am considering buying a "Yes" contract for some event at 50 cents, because I believe the event's true probability of occurring is 52%. I can quickly deduce that, because PredictIt takes 10% of my profits, it is not worth buying the contract. Suppose I am right that the event has a 52% chance of occurring. Then the expected value of purchasing the contract is (0.52 × 0.95) + (0.48 × 0.00) = 0.494, as the winnings would only be 95 cents, not a full dollar, because PredictIt would take 10% of my 50-cent profit. So, I would not purchase the contract, as its expected value is less than its price.

Further, there can be arbitrage in PredictIt, even within one market. For example, consider again the North Carolina Senate market:

Suppose Alice believes Republicans are at a disadvantage, and Bob believes they are heavily advantaged; so, when Bob offers to purchase a Republican "Yes" contract at 60 cents, Alice purchases a Republican "No" contract at 40 cents, and the type (2) transaction goes through. Meanwhile, Candace believes Democrats are at a disadvantage, while Dennis does not; so, when Dennis offers to purchase a Democratic "Yes" contract at 60 cents, Candace purchases a Democratic "No" contract at 40 cents, and that type (2) transaction goes through as well.

Now note: the price for a Democratic "Yes" contract is at 60 cents, while the price for a Republican "Yes" contract is also at 60 cents, and a huge opportunity for arbitrage exists - one could buy a 40-cent "No" contract for both the Democrats and the Republicans (80 cents total) and be nearly guaranteed a profit of 14 cents when one of the two parties wins: the winning "No" contract pays out 94 cents after PredictIt's 10% cut of its 60-cent profit, while the other resolves to nothing.

As we will see, such large arbitrage opportunities are incredibly rare, and most often arbitrage opportunities fall inside the range where they are not profitable because PredictIt takes 10% of the profits. But here is the bottom line - even when one of several events is guaranteed to occur, the sum of the "Yes" contract prices of those events is not necessarily a dollar - it can be less, and it can be more. The issue is discussed in greater depth in this paper.

This raises the question - does price alone tell us the implied probability that the market gives to an event's occurrence? The answer is: certainly not! If the Democratic "Yes" price is 60 cents and the Republican "Yes" price is 60 cents, this does not reflect an implied 60% probability of the Democrats winning the seat in question. Though there are other methods in other contexts for calculating implied probability, what I believe will work best here is the following definition: assuming one of the two major parties is going to win a given Senate seat (which was the case in these 35 races), to calculate the implied probability of a given party winning that Senate seat, we simply take the price of that party's "Yes" contract and divide it by the sum of the prices of both parties' "Yes" contracts. So, in the 60 cents/60 cents example above, we would say that each party has an implied probability of 50% of winning the Senate seat.
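In code, this definition amounts to a one-liner (the function name below is mine):

```python
def implied_prob(party_price, other_party_price):
    """Two-party implied probability for the party whose "Yes" contract
    is priced at party_price, given the other major party's "Yes" price."""
    return party_price / (party_price + other_party_price)

# The 60-cent/60-cent North Carolina example above: each party is implied at 50%.
print(implied_prob(0.60, 0.60))  # 0.5
```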

Note: all prices given in the upcoming analysis of PredictIt's Senate races data are for the "Yes" shares of a given contract. When I say "the contract for the Democrats winning North Carolina was priced at 50 cents on 11/1/20", I am effectively using shorthand to say "the "Yes" contract for the Democrats winning North Carolina was priced at 50 cents on 11/1/20."

Obtaining and Unpacking Data

To obtain this data, I reached out to PredictIt. After some discussion, they sent me a folder containing data on all thirty-five 2020 U.S. Senate races markets. I am incredibly appreciative of their providing me their data for this project.

There are four files containing our data in the folder which PredictIt shared with me. As they are in the "PredictIt-data" folder, these files are of the form: "PredictIt-data/Price History By Market -NoahFinei.xlsx", where the "i" in "NoahFinei" is either 1, 2, 3, or 4. We can more effectively examine what each of these files contains by looking at the first few lines of one of them after bringing in the data. We can do this, in turn, using Pandas to both read in and store the data (in a dataframe):
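A minimal sketch of that first read, assuming nothing about the sheet layout beyond the file names given above:

```python
import pandas as pd

# Read the first of the four files PredictIt provided and peek at its structure.
# (pandas needs the openpyxl engine installed to read .xlsx files.)
data = pd.read_excel("PredictIt-data/Price History By Market -NoahFine1.xlsx")
data.head()
```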

Each row represents information about a market on a given date. Let's go through what each column tells us:

With this explained, we will continue using Pandas to add the rest of the data to our "data" dataframe.
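Continuing the sketch above, the remaining three files can be appended to the same dataframe:

```python
# Append the remaining three files to the "data" dataframe.
for i in range(2, 5):
    part = pd.read_excel(f"PredictIt-data/Price History By Market -NoahFine{i}.xlsx")
    data = pd.concat([data, part], ignore_index=True)

data.shape
```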

Parsing and Improving the Data

Now that we have our data, we need to figure out if there are any major issues with it, and if any cleaning or parsing needs to be done. And indeed, even from the output above, we can see that we have a few potential issues with our data.

First, if we want to measure correlations between how far out from the election a given data point is and accuracy of pricing in a meaningful way, we should make note of that somewhere in the dataframe; this is currently not noted. Second, though we know that all of these markets are for Senate races in specific U.S. states, we currently have no easy way of classifying which state a given market is for; the market name needs to be parsed to obtain this data. Third, the rows do not note who the eventual winner was in their markets; this will make checking price accuracy (which we will define later on) impossible. Finally, the rows do not note the implied probability of victory of the party in the contract concerned; as discussed above, this often differs from the actual price (and will be necessary in hypothesis testing).

So, we will fix these issues. We will fix the first issue by adding a "Days Out" column which measures how far out the day in a given row is from the election, in days. Note that election day in 2020 occurred on November 3rd, 2020, in all states concerned (though the two Georgia races did head to a runoff, we will ignore this for now and treat their election day as November 3rd, 2020, the date of the first round of Georgia's general election; seeing as no runoff was guaranteed, this doesn't seem too unreasonable).
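A sketch of the "Days Out" computation; the "Date" column name is a stand-in for whatever PredictIt's export actually calls its date column:

```python
# Election day for every race in this dataset (treating both Georgia races
# as decided on the date of the first round).
ELECTION_DAY = pd.Timestamp(2020, 11, 3)

# "Date" is a stand-in for the actual date column name in PredictIt's export.
data["Days Out"] = (ELECTION_DAY - pd.to_datetime(data["Date"])).dt.days
```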

Now, we want to parse out the state's name from each market name, to resolve issue two. The best way to do this is with regular expressions, and by noticing a few things:

I found all of these properties by looking through the files that PredictIt sent to me, but I will check after the fact that this worked as expected as well. For now, let's use these properties to parse out the state names from the market names, and add them to our data frame:
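Here is one way this parsing could look; the market-name pattern below is only a guess at the format of PredictIt's market names (the sanity check in the next step is what confirms whether the parse worked), and "MarketName" is a stand-in column name:

```python
import re

# Hypothetical market-name format: "Which party will win the U.S. Senate
# race in North Carolina?" -- the real names may differ slightly.
state_pattern = re.compile(r"Senate race in (.+?)\?")

def extract_state(market_name):
    """Pull the state name out of a market name, or None if no match."""
    match = state_pattern.search(market_name)
    return match.group(1) if match else None

# "MarketName" is a stand-in for the actual market-name column.
data["State"] = data["MarketName"].apply(extract_state)
```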

We can also quickly check that these properties produced 35 unique state names (for the 35 unique Senate races), and that none of them are nonsensical:
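The check itself is short:

```python
# Expect 35 unique, plausible names -- one per 2020 Senate race.
print(data["State"].nunique())
print(sorted(data["State"].dropna().unique()))
```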

As we see above, there are 35 unique state names, as expected, and they are all reasonable/real state names. So, we have resolved issue two.

Now, we need to deal with the issue of marking the winner in each market. To do this, I made an Excel file which has the state name in one column and the winning party (Republican or Democratic) in another. After reading in the data, it just took a dictionary and a list to add the appropriate column to the dataframe. I also decided to add a column called "538 District", which indicates how 538 refers to the state in their data; as we shall see, this will be relevant later. To do this, I used the same method that I used above to mark the winners in each market, adding a column to my Excel sheet called "538 District".
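A sketch of that dictionary-and-list approach; the file name "winners.xlsx" and its column names are stand-ins for my actual sheet:

```python
# One row per race: State, Winner, and 538 District.
winners = pd.read_excel("winners.xlsx")

winner_map = dict(zip(winners["State"], winners["Winner"]))
district_map = dict(zip(winners["State"], winners["538 District"]))

# Build the new columns as lists keyed off each row's state.
data["Winner"] = [winner_map[state] for state in data["State"]]
data["538 District"] = [district_map[state] for state in data["State"]]
```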

This issue is resolved.

Finally, we need to add the implied probability of victory in each row, with a few caveats. First, to keep things simple, the implied probability will be computed using only the Democratic and Republican contracts (as noted in a previous section); meaning, the value in the implied probability column will answer the question "Assuming either the Democratic or Republican party wins this race, what is the implied probability of the party in this contract winning this race?" and not "What is the implied probability of the party in this contract winning this race?" Accordingly, rows for non-Democratic and non-Republican contracts will have a value of 0 in this column. Additionally, if only one of the two major parties has a contract present in a given market on a given date (which happens very infrequently), the implied probability will be set to that contract's actual price. Further, the value in this column will be computed from the closing price, as the opening price can pose problems when it reflects the market's opening to the public (the initial prices, which are set by PredictIt, are often somewhat arbitrary and can skew our data), and the average trade price defaults to 0 on a day with no trades. Let's add this column to the dataframe:
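A sketch of this logic, with the same caveat as before that "MarketName", "Date", "ContractName", and "CloseSharePrice" are stand-ins for PredictIt's actual column names:

```python
def add_implied_probability(df):
    """Two-party implied probability from closing prices, per market and date."""
    implied = []
    for _, row in df.iterrows():
        if row["ContractName"] not in ("Democratic", "Republican"):
            implied.append(0.0)                     # non-major-party contracts
            continue
        other = "Republican" if row["ContractName"] == "Democratic" else "Democratic"
        same_day = df[(df["MarketName"] == row["MarketName"]) &
                      (df["Date"] == row["Date"]) &
                      (df["ContractName"] == other)]
        if same_day.empty:                          # only one major party listed
            implied.append(row["CloseSharePrice"])
        else:
            other_price = same_day["CloseSharePrice"].iloc[0]
            implied.append(row["CloseSharePrice"] /
                           (row["CloseSharePrice"] + other_price))
    df["Implied Probability"] = implied
    return df

data = add_implied_probability(data)
```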

Exploratory Data Analysis of the PredictIt data

To start the analysis, let's examine how well price correlated to observed outcome without considering any other factors. In general for these sorts of analyses, we will use the closing share price (for the reason described above) and only consider markets at least a day out from the election (as afterwards, they tend to either hover around 99 cents for the winner until the contract is paid off, or they irrationally don't, and this behavior deserves a separate analysis in a different project). Further, unless specified otherwise, "price" on a given day will refer to the closing share price on that day.

We can create an interesting bar chart using the numpy and matplotlib libraries, which are commonly used in data analysis. In this chart, we will chart two things: the frequency at which a contract priced in a given range resolved to "Yes", and the frequency at which a contract priced in that range would have been expected to resolve to yes if the price were reflective of true probability (which we will say is the median value of that range, but could reasonably be labelled as any value in that range).
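A sketch of how such a chart can be put together with numpy and matplotlib, continuing with the hypothetical column names from the earlier snippets:

```python
import numpy as np
import matplotlib.pyplot as plt

# Major-party contracts at least one day out from the election.
subset = data[(data["Days Out"] >= 1) &
              (data["ContractName"].isin(["Democratic", "Republican"]))].copy()
subset["Resolved Yes"] = (subset["ContractName"] == subset["Winner"]).astype(int)

# Ten-cent price buckets: 0-9 cents, 10-19 cents, ..., 90-99 cents.
bins = np.arange(0.0, 1.1, 0.1)
labels = [f"{10 * i}-{10 * i + 9}" for i in range(10)]
subset["Bucket"] = pd.cut(subset["CloseSharePrice"], bins=bins,
                          labels=labels, include_lowest=True)

observed = subset.groupby("Bucket")["Resolved Yes"].mean()
expected = bins[:-1] + 0.05        # the median price of each bucket

x = np.arange(len(labels))
plt.bar(x - 0.2, observed.values, width=0.4, label="Observed frequency of Yes")
plt.bar(x + 0.2, expected, width=0.4, label="Expected if price = probability")
plt.xticks(x, labels)
plt.xlabel("Closing price bucket (cents)")
plt.ylabel("Proportion resolved to Yes")
plt.legend()
plt.show()
```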

Though the observed bars mostly follow the expected, there are some areas of significant deviation, such as from 30-39 cents and from 60-69 cents. We can visualize this in a more granular way via a scatterplot:

Indeed, even on a more granular level, we see PredictIt pricing is not so representative of the true probability, at least in this sample - very few values fall even close to where they would fall if they represented true probability. That said, we will see if any meaning can be extracted from the scatterplot above in our hypothesis testing, specifically in the context of correlation.

Meanwhile, we can visualize the difference between the proportion resolved to "Yes" and the contract price more effectively in the plot below:

The spikes on this graph seem to mirror each other, and this makes sense - for each Democratic contract priced at x, there is usually a corresponding Republican contract priced around 1-x; further, for whichever contract resolves to yes, the other resolves to no (as only one of the two parties will win a given seat).

We can also attempt to see if time out from the election plays a role in pricing accuracy. In the bar chart below, we can see the frequency at which a contract priced in a given range resolved to "Yes" a specific amount of time out from the election (1 day out, 7 days out, 30 days out, and 90 days out), and the frequency at which a contract priced in that range would have been expected to resolve to yes if the price were reflective of true probability (which we will again say is the median value of that range, but could reasonably be labeled as any value in that range).

Looking at each of these ten buckets, it seems like there may be some correlation between time and pricing accuracy (namely, that pricing accuracy gets better as an election gets closer, but we will test this later).

We will now create a similar plot to the one above, but instead of making a different column for each time, we will make columns based on trade volume. In the bar chart below, we can see the frequency at which a contract priced in a given range resolved to "Yes" given a specific volume (0 trades, 1-9 trades, 10-99 trades, 100-999 trades, and 1000+ trades), and the frequency at which a contract priced in that range would have been expected to resolve to yes if the price were reflective of true probability (which we will say is the median value of that range, but could reasonably be labeled as any value in that range).

It is much harder to see from here if trading volume correlates in any way with pricing accuracy, but we can test this more thoroughly later.

Finally, one more thing I was interested to see was the sum of all contracts in a market over time; as mentioned earlier, contracts in a market may not always sum to a dollar, potentially presenting arbitrage opportunities, and I wanted to visualize this. As there are 35 markets, it would be too crowded to view the sums of all of the markets' contracts over time. So, the chart below shows the sum of all contracts in a market for a sample of the markets in our data across time, and the average sum of all contracts in a market across time, up to 90 days out.

As we can see (which will be important for a decision shortly), the prices of all contracts in a market usually sum to a value greater than a dollar.

Hypothesis Testing

First, we'll start with a simple hypothesis - that, at least in the 2020 U.S. Senate markets, the contract price and the observed probability (meaning, the proportion of contracts that resolved to "Yes") are related. We can test this hypothesis by using the stats library to get both the p-value and the r-value of the relationship between contract price and observed probability in our dataset. The p-value tells us the likelihood of observing the sort of variance that we see in the data if the two variables (in our case, contract price and observed probability) were completely unrelated; a very low p-value (0.05 is a conventional cutoff for "very low") tells us that the observed data is unlikely to have occurred if no relationship between the variables existed, so we can conclude that a relationship does exist between the two variables and that what we are seeing is not just random noise. Meanwhile, the r-value tells us the strength of the relationship: the correlation between the two variables.

Though observed probability will be on the y-axis in the graph below, it is not really a dependent variable (as a variable on the y-axis usually is); in reality, both contract price and observed probability depend on other variables, such as the demographics of the state, the candidates' performance on the campaign trail, and the state's early voting laws, all of which impact both gamblers' evaluation of the odds and the true odds of victory (which translate into the observed odds) for a given party. However, since one variable needs to go on the x-axis and one on the y-axis, I am placing the contract price on the x-axis because it will be easier to code up that way.
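A sketch of the regression using scipy's stats module; grouping by exact closing price is one reasonable way to turn prices into observed proportions, though the exact aggregation behind the original plot may differ:

```python
from scipy import stats

# Observed probability: the share of contracts at each closing price that
# ended up resolving to "Yes".
by_price = subset.groupby("CloseSharePrice")["Resolved Yes"].mean()
prices = by_price.index.values
observed_prob = by_price.values

fit = stats.linregress(prices, observed_prob)
print(f"r = {fit.rvalue:.3f}, p = {fit.pvalue:.2e}")

plt.scatter(prices, observed_prob, s=10)
plt.plot(prices, fit.intercept + fit.slope * prices, color="black")
plt.xlabel("Closing contract price (dollars)")
plt.ylabel("Observed probability of resolving to Yes")
plt.show()
```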

As we can see from the p-value, the probability of seeing points like this if there was no relationship between contract price and observed (true) probability is well under 0.05, and thus we conclude that such a relationship does in fact exist. Further, the correlation between the two variables is very high, at around 91%! So, there is a strong relationship between a contract's price and the observed probability of that contract resolving to "Yes".

Additionally, we can use this output to predict the probability of a contract resolving to "Yes", given its price, using the regression line: simply plug in the contract price as the x-value, and the y-value returned is the predicted probability of that contract resolving to "Yes". Of course, this isn't entirely accurate, as probabilities greater than 1 can be returned, but it is worth noting that, along the interval from 0.01 to 0.99, y never deviates from x by more than 0.07; meaning, for any given price, we would predict the probability of that contract resolving to yes to be within 0.07 of that price. We can also see that the difference between x and y on the line gets greater around the edges, and disappears near 0.50. Meaning, if PredictIt's price implies something has a very high chance of happening, it is generally even likelier than implied by the price, and if PredictIt's price implies something has a very low chance of happening, it is generally even less likely than implied by the price, at least according to this linear model. But as we will see, there are other factors we need to consider.

Sidenote - establishing a metric

Before testing any of our hypotheses relating to pricing accuracy, whether over time or over trading volume, it is important to establish a metric with which we can measure pricing accuracy. Fortunately, there are plenty of strong options that we could use. Unfortunately, the number of options is somewhat overwhelming, and which one to choose is unclear (though they would likely all work for our purposes). So, to help with the decision, I turned to my favorite stats gurus at FiveThirtyEight, as I knew that they had done an autopsy of their 2020 election models; I figured that whatever metric they used to evaluate the accuracy of their model's predictions there, I could use to evaluate the accuracy of PredictIt's "predictions" here. But the level of analysis in FiveThirtyEight's publicly-available analysis of their predictions' performance is closer to the level of the EDA above than to the level of rigorous hypothesis testing (I assume that they have done or will do this internally, though even if that's the case, we still don't have access to their metric). This is not helpful for, say, comparing performances across time, so I needed to look elsewhere for suggestions.

Luckily, an excellent stats blog came to the rescue; in fact, the same one: FiveThirtyEight. More specifically, their sports section. FiveThirtyEight allows readers to compete against their NFL forecasting model by assigning their own probabilistic predictions to games and comparing accuracy. Of course, in their case as much as in ours, properly comparing accuracy requires a metric, and there FiveThirtyEight settled on a system based on Brier scores. We will use this metric as well, and refer to a score given by this method as an FTE-score.

Here is how an FTE-score is calculated for a given market on a given day:
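A minimal sketch, assuming the same scale FiveThirtyEight uses in its NFL game (a perfectly confident correct call earns 25 points, a coin-flip call earns 0, and a perfectly confident wrong call loses 75): for each market, we score the implied probability of its Democratic contract that day against whether the Democrat actually won.

```python
def fte_score(implied_prob, won):
    """Brier-style score, assumed to be on FiveThirtyEight's NFL-game scale:
    25 points for a perfectly confident correct call, 0 for a 50/50 call,
    -75 for a perfectly confident wrong call."""
    outcome = 1.0 if won else 0.0
    return 25 - 100 * (implied_prob - outcome) ** 2

# A market whose Democratic contract implies 70% on a day the Democrat
# eventually wins scores 25 - 100 * 0.09 = 16 points for that day.
print(fte_score(0.70, True))   # 16.0
print(fte_score(0.70, False))  # -24.0
```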

We use implied probability, and not price, because, as noted above, this method of scoring punishes overconfidence; since PredictIt's prices more often sum to over 1.00 than not (see the EDA above), scoring the raw prices would saddle PredictIt with lower FTE-scores due to seeming overconfidence. Also, we only use the Democratic contracts in a given market because, since the two parties' implied probabilities add up to 1.00, the scores on both sides simply mirror each other (this is easy to prove mathematically, but we won't do so here), so including both the Democratic and Republican contracts would just double-count the same information.

Back to hypothesis testing

So our hypothesis structure now changes slightly from the previous one: we hypothesize that PredictIt's implied probability (not price) becomes more accurate as days out from the election decreases. We want to see in the plot below that, as election day gets closer, PredictIt's markets' average FTE-score, which measures the accuracy of the implied probabilities of the markets, rises.
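A sketch of this test, building on the earlier snippets:

```python
# FTE-score for each Democratic contract on each day, then the daily average
# across markets, regressed against days out.
dem = subset[subset["ContractName"] == "Democratic"].copy()
dem["FTE Score"] = [fte_score(p, winner == "Democratic")
                    for p, winner in zip(dem["Implied Probability"], dem["Winner"])]

daily_avg = dem.groupby("Days Out")["FTE Score"].mean()
days, scores = daily_avg.index.values, daily_avg.values

fit = stats.linregress(days, scores)
print(f"slope = {fit.slope:.4f} points per day out, p = {fit.pvalue:.2e}")

plt.scatter(days, scores, s=10)
plt.plot(days, fit.intercept + fit.slope * days, color="black")
plt.gca().invert_xaxis()           # so election day sits on the right
plt.xlabel("Days out from the election")
plt.ylabel("Average FTE-score across markets")
plt.show()
```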

As we can see above, our hypothesis has more than ample evidence; taking no relationship between days out and PredictIt's markets' average FTE-score as the null hypothesis, we can see that our p-value is orders of magnitude below 0.05, thus enabling us to reject the null hypothesis (though I recognize the null hypothesis we would really have to reject is the one that the relationship is positive, this can also be done easily given the absurdly low p-value here; the same applies in the upcoming analyses). So, we can reasonably conclude that PredictIt's implied odds become more accurate as election day draws closer.

However, a reader may wonder what, exactly, is happening with those spikes >100 days out? Why was PredictIt so much more volatile further out from the election? We can assume that this was due to the lower number of Senate markets present on PredictIt, which increased the day-by-day variance of the score of the average market (smaller sample leads to higher variance of the average). As we can see, after all of the markets are added (around 100 days out), the average FTE-score stabilizes, but previously the smaller number of markets was enabling huge swings (with a smaller number of markets, a huge swing in one market's accuracy would have a greater influence on the average accuracy). So, perhaps it is worth analyzing if the relationship between days out and average FTE-score still exists among only points where all markets were already added. And we can do this below.

Sure enough, among only points which average out PredictIt's FTE-scores for all thirty-five 2020 U.S. Senate races, we still observe an increase in average FTE-score as the election draws nearer, meaning, at least by how we are measuring accuracy, the odds implied by PredictIt's prices become more accurate as election day draws nearer.

A friend of mine once mentioned that he had heard Nate Silver on the FiveThirtyEight politics podcast state that the polls aren't especially indicative of the election results until 20 days before the election. Whether or not this is true, his statement inspired the question: does PredictIt's accuracy actually improve before 20 days prior to the election, or is the regression line over the 100+ day period above simply being dragged upward by the points within 20 days of the election? We can see if accuracy really does improve closer to the election in the plots below, which separate out the plot above into two periods: one for points within 20 days of the election, and one for points more than 20 days out from the election.

This first plot shows an expected result: in the 20 days leading up to the election, at least in our sample, PredictIt becomes more accurate, with a rising FTE-score in that timeframe. We see this in the very low p-value (less than 1 in 10 billion).

But, somewhat surprisingly, accuracy grows over time even further than 20 days out from the election. The second plot shows that, in the timeframe starting when all thirty-five markets were up and running until 21 days out from the election, the FTE-score still grows on average, albeit more slowly than in the timeframe from the first plot (by about 0.0033 points per day here versus about 0.1151 points per day there). And with a p-value near 0.01, we can be fairly confident in this result.

So, we have shown another cool result: for the 2020 Senate races, PredictIt became more accurate, on average, as election day approached, and this growth in accuracy sped up as the election got especially close. And this intuitively makes sense, as uncertainty declines closer to the election - debates have already happened, and we know who won them; scandalous stories are less likely to break with so little time left; new national crises and issues are less likely to emerge in a smaller timeframe - leading the markets to become more confident in their assessments, thus raising the FTE-score relative to what it was weeks before when their predictions end up being correct. Occasionally, of course, the market will become more confident in the wrong outcome, but, as we see above, this is more than offset by the occasions where the market will become more confident in the correct outcome - at least, for the 2020 U.S. Senate races.

Before moving on to the next section, let's test one more hypothesis: namely, that a relationship exists between trade volume and accuracy. We can plot trade volume versus the average FTE-score of a market at that trade volume below.

And, as we can see from the high p-value, it is very likely that we could see results like this if no relationship existed between FTE-score (accuracy) and trade volume; so, we cannot conclude that such a relationship exists.

Comparison Introduction - Obtaining and Cleaning the 538 Data

I found analyzing the above data to be both a fun and an informative exercise. But, as the reader may have noticed, this analysis was only step one of a larger plan. Step two: comparing PredictIt's performance to FiveThirtyEight's. When I began working on this project, I was not planning on this comparison, but once I realized how simple it was to obtain FiveThirtyEight's data, I became very excited to compare the performance of the two. So, let's do it.

First, we need to load in FiveThirtyEight's data.

As we can see, there are a lot of unnecessary columns, but by looking through the CSV file in Excel and planning ahead, we can decide now which columns are relevant and remove the unneeded ones. We only want to keep district, forecastdate (from what I can tell, the file contains only one recording per day, so the timestamp, which also tells us the time of day, is irrelevant; since we will be comparing FiveThirtyEight's predictions to PredictIt closing prices, and we know the forecast update occurred before the end of the day, we can take whatever odds are listed on the forecast date as being for that date's end), winner_Dparty (the probability the model gives to a Democratic victory), and winner_Rparty (the probability the model gives to a Republican victory).

Then, as we did with the PredictIt data, let's label each entry with how many days out it is from the election. We will simply take the date portion of the timestamp, and subtract it from the actual election day to get this value.
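A sketch of loading and trimming the FiveThirtyEight data; the file name is a stand-in, but the column names are the ones described above:

```python
# "senate_forecast_2020.csv" is a stand-in name for FiveThirtyEight's
# published Senate toplines file.
fte = pd.read_csv("senate_forecast_2020.csv")
fte = fte[["district", "forecastdate", "winner_Dparty", "winner_Rparty"]]

# Days out from the election, using only the date portion of the timestamp.
forecast_dates = pd.to_datetime(fte["forecastdate"]).dt.normalize()
fte["Days Out"] = (ELECTION_DAY - forecast_dates).dt.days

fte.head()
```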

Unlike what we did with the PredictIt data, we will not add a column here for implied two-party win probability, as the structure of the data allows this to be calculated easily as we iterate through: implied two-party Democratic win probability = winner_Dparty/(winner_Dparty + winner_Rparty).

Exploratory Data Analysis - PredictIt versus FiveThirtyEight

Before doing anything, we must note that FiveThirtyEight and PredictIt can only be compared on common predictions. Meaning, if PredictIt has a market which opened before FiveThirtyEight publicized their model, or if FiveThirtyEight publicized their model before a given prediction market opened on PredictIt, there is nothing to be compared. Fortunately, we can easily see that all FiveThirtyEight forecasts occur after their corresponding markets opened on PredictIt: recall from the plots above that all 2020 Senate prediction markets were open by 100 days out from the election. Then, we only need to see that the highest value of days out in the FiveThirtyEight dataset is less than 100:
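That check is a one-liner:

```python
# Every PredictIt Senate market was open by 100 days out, so a maximum below
# 100 here means each FiveThirtyEight forecast has a PredictIt counterpart.
print(fte["Days Out"].max())
```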

So, any prediction made by FiveThirtyEight has a corresponding prediction on PredictIt. Additionally, the corresponding prediction of a FiveThirtyEight prediction is made easy to find by the structure of the pred dictionary, created earlier.

That said, let's do our first bit of exploration: seeing how the implied Democratic odds of FiveThirtyEight compare to the implied Democratic odds of PredictIt in common markets, via both a scatterplot and a hard number.

Points above the black line are points where PredictIt gave a higher Democratic win probability than FiveThirtyEight, while for points below the black line, the opposite is the case. If all points were above the line, it would indicate an overall PredictIt Democratic bias relative to FiveThirtyEight, whereas if all points were below the line, it would indicate an overall PredictIt Republican bias relative to FiveThirtyEight. Though this is not clear visually, it appears from the text output that PredictIt has a slight Republican bias relative to FiveThirtyEight. But we can test the significance of this properly in our hypothesis testing.

To possibly get an idea of relative accuracy, we can also visualize how the races turned out using a multicolored plot, similar to the one above.

From the above plot, it appears both FiveThirtyEight and PredictIt generally underweighted Republicans in 2020 Senate races, as there are many more red points (indicating Republican victory) in cases where both FiveThirtyEight and PredictIt favored the Democrats than there are blue points in the inverse situation. This plot does not make relative accuracy clear, though it is worth noting that there are two red clusters near the middle of the plot which are significantly above the line, implying FiveThirtyEight had been much more accurate than PredictIt in those cases. Further, it is worth noting that no candidate which had been given above a 90% win probability at any point in the election lost. We will use this fact in accuracy comparison later.

As we saw earlier, though, at least with PredictIt, predictions become more accurate closer to the election; perhaps it is worth viewing a plot similar to the above with just points from the day before the election.

On the day before the election, a PredictIt Republican bias relative to FiveThirtyEight appears more clearly from the high number of points below the y=x line. So, perhaps we should see if bias changes over time in the next section as well.

Hypothesis Testing - PredictIt versus FiveThirtyEight

There are really two hypotheses we want to test: is PredictIt biased relative to FiveThirtyEight, and which of the two is more accurate?

First, we will test bias. We will plot PredictIt's bias relative to FiveThirtyEight over days out from the election, and see whether either the overall bias (indicated by the intercept of the graph; the "const" row in the upcoming chart has the data for this) or the change in bias over time (indicated by the x-coefficient; the "x1" row in the upcoming chart has the data for this) is statistically significant.
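A sketch of this regression using statsmodels; the bias measure here (PredictIt's implied Democratic probability minus FiveThirtyEight's) and the (district, days out) key format of the pred lookup mentioned earlier are my assumptions:

```python
import statsmodels.api as sm

# One bias observation per forecast: PredictIt's implied Democratic probability
# minus FiveThirtyEight's (negative values = Republican bias relative to 538).
rows = []
for _, r in fte.iterrows():
    fte_dem = r["winner_Dparty"] / (r["winner_Dparty"] + r["winner_Rparty"])
    pi_dem = pred[(r["district"], r["Days Out"])]   # hypothetical key format
    rows.append((r["Days Out"], pi_dem - fte_dem))

bias = pd.DataFrame(rows, columns=["Days Out", "Bias"])
daily_bias = bias.groupby("Days Out")["Bias"].mean().reset_index()

# Plot the average bias over time, then fit an OLS line; the "const" and "x1"
# rows of the summary are the two significance tests described above.
plt.plot(daily_bias["Days Out"], daily_bias["Bias"])
plt.gca().invert_xaxis()
plt.xlabel("Days out from the election")
plt.ylabel("PredictIt implied Dem prob minus 538 implied Dem prob")
plt.show()

X = sm.add_constant(daily_bias["Days Out"].values)
model = sm.OLS(daily_bias["Bias"].values, X).fit()
print(model.summary())
```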

Curiously, PredictIt seems to be flipping back and forth in its bias relative to 538 until about a month out, when it begins to head sharply towards a Republican bias. Twenty days out, the bias becomes Republican for the first time since about thirty-six days out, and by the day before the election, PredictIt's Republican bias in the 2020 U.S. Senate markets averages 3%. Further, as we can see in the table (P>|t| column), both p-values are significantly lower than 0.05; we can therefore conclude that the trendline above is significant.

But what if, like with accuracy before, we broke down bias into two graphs: one twenty days out and closer to the election, and one over twenty days out from the election. We would almost certainly still see a trend near the election, but would there be any meaningful bias farther out? Let's see.

While the near-election output turned out as expected, the output farther out showed something seemingly crazy - a trend in the opposite direction! And the p-values in all cases are low enough to consider the results significant.

So, farther out from the election, PredictIt tends to show a slight Democratic bias relative to 538, while when the election gets close, PredictIt tends to show a heavy Republican bias relative to 538. We will break this down more in the conclusion.

Now, the moment I've been most excited for: let's compare PredictIt's accuracy to FiveThirtyEight's! Again, we will be using FTE-scores as our metric to measure accuracy; after all, if FiveThirtyEight is deemed to be less accurate than PredictIt by their own metric, surely they would not dispute the result! We will do this comparison over time, as we found earlier (at least in the case of PredictIt) that accuracy changes over time significantly.

This is intense! They seem to be running neck-and-neck; let's plot out the differences in scores and see if we can pull out anything statistically significant.

Neither the constant nor the x-coefficient has a low enough p-value to deem either of them statistically significant; they are 0.101 and 0.061 respectively, and both are greater than 0.05. So, we cannot conclude whether FiveThirtyEight or PredictIt is more accurate overall in a statistically significant way over this dataset, though we can view accuracy winners on individual days using the plots above.

It may also be worth examining who was more accurate in only close races, which I'll define as a race in which both FiveThirtyEight and PredictIt never gave either party a 90% chance or higher of winning; as we saw earlier, in any race which we would not call "close", the winner was of the favored party. Let's examine this in the scatterplot below.

Here, it seems we have more consistent PredictIt outperformance of FiveThirtyEight; as above, we can test this by building a linear regression model on the differences.

Here, we do have statistically significant results, as seen in the low p-values! So, in close races (as defined), PredictIt not only outperforms FiveThirtyEight at the baseline (94 days out, as we see above), but also has an increasing accuracy relative to FiveThirtyEight as election day comes closer! And this is all measured using a metric commonly used by 538!

Machine Learning - PredictIt versus FiveThirtyEight

Though we can't tell whether PredictIt or FiveThirtyEight is more accurate in predictive power over all 2020 U.S. Senate races, we could still try to see whose predictions "mean more". As in, we can answer the question - what gives us more information as to who the winner of a race will actually be: knowing FiveThirtyEight's prediction for a race, or knowing PredictIt's? Fortunately, we have the perfect tool to answer this question: decision trees.

A decision tree allows us to classify inputs by following its branches using the attributes of our input. An example of how this may work: we may have a dataset of dogs, humans, and cats, with two attributes: #legs and #whiskers. The first node on the tree may say: if #legs is less than 3, follow the left branch; otherwise, follow the right branch. The left branch leads to a human classification, while the right branch leads to another node, which could say: if #whiskers is 0, follow the left branch; otherwise, follow the right branch. The left branch here leads to a dog classification, while the right branch leads to a cat classification. An example of how this plays out: suppose we have data on a middle-aged man with 2 legs and 4 whiskers (he hasn't been good about shaving). We look at the root node, which tells us to follow the left branch, which (correctly) classifies this input as human!

A decision tree, at least when using "entropy" as the criterion in sklearn, is built using the following algorithm: look over the whole training data (which I will explain shortly) to find the attribute which, when split on, reduces uncertainty the most. Split on this attribute. Repeat for resulting nodes until either certainty is reached, and we classify as the class of all of the objects represented in that node, or we hit the max_depth, and we classify as the class of the majority of objects represented in that node.

Now, we'll try to build a decision tree. However, we want an optimal max_depth parameter so that there is not overfitting to our data (I recognize other parameters can be tweaked, but I concluded in an earlier project that this is the most significant one). To find the optimal max depth of the tree, we will use holdout validation, where a random sample is taken from our dataset to train the model (about 70% of the data), and the remainder of the data is used to test the model's classification accuracy. We then choose a max_depth which maximizes the proportion of the test data correctly classified.

Note that the attribute in the root node is what the algorithm considered to be the most important attribute to split on. So, let's create a few decision trees using this method, and see if the algorithm consistently chooses one attribute to split on at the root. We will use three attributes: FiveThirtyEight implied Democratic odds, PredictIt implied Democratic odds, and days out from the election (as this was found to significantly influence accuracy). If the trees are consistently splitting on FiveThirtyEight implied Democratic odds, then perhaps the model believes that to be better information to know with regards to predicting outcome, and if the trees are consistently splitting on PredictIt implied Democratic odds, then perhaps the model believes that to be better information to know with regards to predicting outcome.
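A sketch of this procedure with scikit-learn; "merged" and the feature column names are stand-ins for a dataframe that pairs each FiveThirtyEight forecast with the PredictIt market for the same race on the same day:

```python
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

features = ["538 Dem Prob", "PredictIt Dem Prob", "Days Out"]
X = merged[features]
y = merged["Winner"]

# Holdout validation: train on a random ~70% of the data, test on the rest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Choose the max_depth that classifies the held-out data best.
best_depth, best_acc = 1, 0.0
for depth in range(1, 11):
    candidate = DecisionTreeClassifier(criterion="entropy", max_depth=depth)
    candidate.fit(X_train, y_train)
    acc = candidate.score(X_test, y_test)
    if acc > best_acc:
        best_depth, best_acc = depth, acc

tree = DecisionTreeClassifier(criterion="entropy", max_depth=best_depth)
tree.fit(X_train, y_train)

# The feature the algorithm split on at the root of the tree.
print("Root attribute:", features[tree.tree_.feature[0]])
```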

In the trees above, we can see that the root attribute used is sometimes FiveThirtyEight's probability and sometimes PredictIt's probability; it really just depends on which portion of the data is used as training data and which is used as testing data. So, no meaningful conclusion can be drawn here, either. Though it is worth noting: in creating these decision trees, I realized that the choice of root attribute does not necessarily imply which trait is truly more important. In the above data, we see in most of the trees that the first attribute is used generally to classify the most Democratic markets, which makes sense: very rarely does a market in which PredictIt or FiveThirtyEight had given the Democrats over 75% odds go to the Republicans, so the tree uses a number around there to classify points as certainly Democratic. However, classifying toss-ups, for which one could argue that knowing whether to trust PredictIt's or FiveThirtyEight's predictions matters more, occurs in the middle of the tree and requires much more nuanced splitting than what occurs at the root, as we can see in the trees above. And in that part of the tree, both PredictIt and FiveThirtyEight predictions are heavily used (days out is sometimes used as well, so it isn't irrelevant in prediction).

Also of note: using this dataset to train a decision tree to classify PredictIt U.S. Senate election markets in general (let alone simply PredictIt U.S. election markets more broadly) may be flawed, as this entire dataset is likely skewed. But we will dive into this more in our closing section.

Conclusion

A few points from throughout our analyses and hypothesis testing stand out as noteworthy:

But now we need to discuss our skewed sample, which may have played a role in our observations. In the 2020 general election, Democratic margins on average underperformed the polls by nearly 4%, meaning polling was skewed towards Democrats, and the actual results were more Republican than the polling predicted. So, there may be a cause-and-effect relationship between PredictIt's Republican bias relative to FiveThirtyEight and PredictIt's increased accuracy relative to FiveThirtyEight: PredictIt likely beat out FiveThirtyEight in accuracy simply because it was more bullish on Republicans in the 2020 U.S. Senate elections, and Republicans outperformed expectations more frequently than Democrats. This would render our accuracy conclusions a by-product of our bias conclusion: PredictIt's markets only outperform FiveThirtyEight's forecasts in this dataset because its markets are more biased towards Republicans.

This raises the question: is this (ultimately correct) bias over the 2020 U.S. Senate sample representative of PredictIt's predictive power, or did PredictIt simply get lucky in this dataset? The answer is not clear, and we would need more data - perhaps PredictIt has a general bias towards Republicans relative to FiveThirtyEight's forecast, regardless of the year, and while in years like 2016 and 2020 this would cause PredictIt to appear more accurate, in years like 2018, where Democrats slightly overperformed the polls, this bias would hurt PredictIt's accuracy relative to FiveThirtyEight. Or, perhaps in years like 2016 and 2020, PredictIt shows (accurate) Republican bias relative to FiveThirtyEight's significantly polls-based forecasting, while in years like 2018, PredictIt shows (accurate) Democratic bias relative to it. Whether the former or the latter is the case, we cannot know, as we do not have the data to test these hypotheses.

In conclusion: over this dataset, PredictIt's prices and implied probabilities appear strongly correlated with actual, observed outcomes. And, as election day approaches, PredictIt's implied probabilities become more accurate; PredictIt even appears more accurate than FiveThirtyEight, a reputable forecaster! But when it comes to generalizing these conclusions to other markets in PredictIt, we simply don't have the data to do so.

Short Thank-Yous

I would like to again thank PredictIt for providing me with their 2020 U.S. Senate election data. This analysis was a pleasure to perform, and I hope I have the opportunity to perform analyses like this on more of their data in the future.

Additionally, I would like to thank my professor, John Dickerson, for his guidance throughout this process. I feel confident in saying that this project wouldn't have come to fruition without his advice at a few crucial steps in the process.

Finally, I want to thank my parents, as they had to put up with me talking about FiveThirtyEight, PredictIt, and polling nonstop from ages 15-18.