The goal in this analysis to:
knitr::opts_chunk$set(echo = TRUE)
plot_save <- FALSE
library(readxl)
library(tidyverse)
library(lubridate)
library(binom)
lott <- read_xlsx("lottery_sales_data.xlsx")
View(lott)
# examine cols&types
t(data.frame(lapply(lott, class)))
## [,1] [,2]
## date "POSIXct" "POSIXt"
## sales_mm "numeric" "numeric"
## sales_jj "numeric" "numeric"
## jackpot "numeric" "numeric"
The goal here is to just get the data into a nice, workable format. And to do some quick and dirty, cursory visualizations, to get a sense of what the main variables of interest (jackpot sizes, and total sales) look like.
lott_clean <-
lott %>%
mutate(jackpot = jackpot * 1000000,
sales = sales_mm + sales_jj)
lott_clean
Graph of jackpot sizes over time. Axes and everything still ugly.
ggplot(data = lott_clean,
aes(x = date,
y = jackpot)) +
geom_point() +
geom_line()
## 1.2. Graph: Just Jackpot (NICE)
color_jackpot <- "green4"
plot_just_sales <-
lott_clean %>%
mutate(jackpot = jackpot / 1e6,
sales = sales / 1e6) %>%
ggplot(aes(x = as.Date(date))) +
geom_line(aes(y = jackpot),
color = color_jackpot,
size = 2,
alpha = 0.5) +
scale_x_date(name = "Date",
date_labels = "%Y",
date_breaks = "1 year") +
scale_y_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
breaks = c(0, 1:16*100),
name = "Jackpot Value \n (m=million, b=billion)") +
labs(title = "Jackpot Size Over Time") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 15),
axis.text.y.left = element_text(color = color_jackpot,
size = 15),
axis.title.y.left = element_text(color = color_jackpot,
size = 15,
margin = margin(t = 0, r = 10, b = 0, l = 0)),
axis.text.x = element_text(angle = 0)
)
plot_just_sales
## 1.2. Save
if (plot_save){
ggsave(plot = plot_just_sales,
filename = "jackpot_sales_only.pdf",
width = 15,
height = 7)
}
Jackpot sizes at various percentiles (in millions of dollars)
quantile(round(lott_clean$jackpot/1e6, digits = 1), probs = c(0, 0.01, 0.05, seq(0.1, 0.9, 0.1), 0.95, 0.99, 1))
## 0% 1% 5% 10% 20% 30% 40% 50% 60% 70% 80%
## 20.0 20.7 40.0 45.0 55.0 75.0 97.0 125.5 160.0 205.0 265.0
## 90% 95% 99% 100%
## 354.0 427.0 652.3 1537.0
mean(lott_clean$jackpot)/1e6
## [1] 170.625
Graph of total sales figures over time. Axes and everything still ugly.
ggplot(data = lott_clean,
aes(x = date,
y = sales)) +
geom_point() +
geom_line()
Total ticket sales at various percentiles.
quantile(round(lott_clean$sales/1e6, digits = 1), probs = c(0, 0.01, 0.05, seq(0.1, 0.9, 0.1), 0.95, 0.99, 1))
## 0% 1% 5% 10% 20% 30% 40% 50% 60% 70%
## 16.000 16.435 17.800 18.400 19.400 20.650 22.300 24.200 26.000 29.100
## 80% 90% 95% 99% 100%
## 34.900 48.100 69.700 196.855 740.100
Average
mean(lott_clean$sales)/1e6
## [1] 34.73495
Just calculate the correlation between jackpot size and total sales. Very high.
cor(lott_clean$sales, lott_clean$jackpot)
## [1] 0.8135666
And the same relationship, just as a regression.
m1 <- lm(data = lott_clean,
formula = sales ~ jackpot)
summary(m1)
##
## Call:
## lm(formula = sales ~ jackpot, data = lott_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57774557 -15728933 2483981 16127021 317148914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.374e+07 2.558e+06 -5.374 1.45e-07 ***
## jackpot 2.841e-01 1.111e-02 25.570 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31470000 on 334 degrees of freedom
## Multiple R-squared: 0.6619, Adjusted R-squared: 0.6609
## F-statistic: 653.8 on 1 and 334 DF, p-value: < 2.2e-16
The goal here is to create a pretty plot that shows both jackpot sizes and total sales together (using two y-axes), in a way that captures how much they move together.
Resources:
coeff <- max(lott_clean$jackpot/1e6)/max(lott_clean$sales/1e6)
color_jackpot <- "green4"
color_sales <- "grey32"
plot_2.1 <-
lott_clean %>%
mutate(jackpot = jackpot / 1e6,
sales = sales / 1e6) %>%
ggplot(aes(x = as.Date(date))) +
geom_line(aes(y = jackpot),
color = color_jackpot,
size = 2,
alpha = 0.5) +
geom_line(aes(y = sales * coeff),
color = color_sales,
size = 1.25,
alpha = 0.5) +
scale_x_date(name = "Date",
date_labels = "%Y",
date_breaks = "1 year") +
scale_y_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
breaks = c(0, 1:16*100),
name = "Jackpot Value \n (m=million, b=billion)",
sec.axis = sec_axis(trans = ~.*(1/coeff),
name = "Tickets Sold (millions)",
breaks = c(0, seq(0, 8, 0.5)*100))) +
labs(title = "Jackpot Size & Ticket Sales, Over Time") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 17),
axis.text.y.left = element_text(color = color_jackpot,
size = 15),
axis.ticks.y.left = element_line(color = color_jackpot),
axis.title.y.left = element_text(color = color_jackpot,
size = 15,
margin = margin(t = 0, r = 10, b = 0, l = 0)),
axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10),
size = 15,
color = color_sales),
axis.text.y.right = element_text(size = 15,
color = color_sales),
axis.ticks.y.right = element_line(color = color_sales)
)
plot_2.1
if (plot_save){
ggsave(plot = plot_2.1,
filename = "jackpot_sales_full.pdf",
width = 15,
height = 7)
}
The goal in this section is to calculate the odds of splitting the lottery jackpot, as a function of how any other people have bought tickets. First, I will make a function to do these calculations. Then I will implement the function on actual sales data.
inputs:
output:
See Bill Butler’s website for formal calculations. (It’s just the binomial distribution.)
prob_k <- function(num_winners, num_tickets, prob_win){
#return(choose(n = num_tickets, k = num_winners) * prob_win^num_winners * (1-prob_win)^(num_tickets-num_winners))
return(dbinom(size = num_tickets, x = num_winners, prob = prob_win)) #(same as above)
}
Test cases for function (long endless scroll of output not displayed in Rmd).
print("---1---")
(1/302575350)
print("---2---")
prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 2, prob_win = (1/302575350))
print("---3---")
prob_k(num_winners = 1, num_tickets = 2, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 3, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 4, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
print("---4---")
prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))
prob_k(num_winners = 2, num_tickets = 3, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))
prob_k(num_winners = 2, num_tickets = 4, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))
inputs:
outputs:
ev_split <- function(jackpot, tickets, max_winners, prob_win, method, split){
#
if (method == "inverse" & split == TRUE){
ev <- jackpot * (1 - prob_k(num_winners = 0, num_tickets = tickets, prob_win = prob_win))
}
if (method == "inverse" & split == FALSE){
return("nah, this combo (inverse & no split) doesn't make sense")
}
if (method == "sum"){
ev <- 0
for (n in 1:max_winners){
if (split == TRUE){
ev <- ev + ((jackpot/n)*n * prob_k(num_winners = n, num_tickets = tickets, prob_win = prob_win))
#ev <- ev + ((jackpot/n)*n * dbinom(size = n, x = tickets, prob = prob_win)) #(same as above)
}
if (split == FALSE){
ev <- ev + ((jackpot/1)*n * prob_k(num_winners = n, num_tickets = tickets, prob_win = prob_win))
}
}
}
return(ev/tickets)
}
Test cases for function (long endless scroll of output not displayed in Rmd).
calculate odds of split pot, as function of number of tickets sold
Plan: * loop from 1 million to 1 billion ticket sales, by million * loop from 0 winners to 10 winners * calculate the odds of winning, for each combination * store results
# set values to loop through
loop_winners <- 0:10
loop_tickets <- 1:1000*1e6
total_loop <- length(loop_winners) * length(loop_tickets)
# create data frame, to store results
dat_split <- data.frame(num_winners = integer(length = total_loop),
num_tickets = integer(length = total_loop),
prob = double(length = total_loop))
# loop through all values, calculate odds, and store
index <- 0
for (num_winners_i in loop_winners){
for (num_tickets_k in loop_tickets){
index <- index + 1
dat_split$num_winners[index] <- num_winners_i
dat_split$num_tickets[index] <- num_tickets_k
dat_split$prob[index] <- prob_k(num_winners = num_winners_i,
num_tickets = num_tickets_k,
prob_win = (1/302575350))
}
}
dat_split_mod <-
dat_split %>%
arrange(num_tickets, num_winners) %>%
pivot_wider(names_from = num_winners,
values_from = prob) %>%
rename(prob_0 = "0", prob_1 = "1", prob_2 = "2", prob_3 = "3", prob_4 = "4",
prob_5 = "5", prob_6 = "6", prob_7 = "7", prob_8 = "8", prob_9 = "9", prob_10 = "10") %>%
mutate(prob_5_plus = 1 - (prob_0 + prob_1 + prob_2 + prob_3 + prob_4)) %>%
#mutate(check_1 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4 + prob_5,
# check_2 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4,
# check_3 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4 + prob_6)
select(num_tickets, prob_0, prob_1, prob_2, prob_3, prob_4, prob_5_plus, everything()) %>%
pivot_longer(cols = c("prob_0", "prob_1", "prob_2", "prob_3", "prob_4", "prob_5_plus"),
names_to = "num_winners",
values_to = "prob") %>%
select(num_tickets, num_winners, prob) %>%
mutate(num_winners = case_when(num_winners == "prob_0" ~ "None",
num_winners == "prob_1" ~ "One",
num_winners == "prob_2" ~ "Two",
num_winners == "prob_3" ~ "Three",
num_winners == "prob_4" ~ "Four",
num_winners == "prob_5_plus" ~ "Five or More"),
num_winners = factor(num_winners,
levels = c("None", "One", "Two", "Three", "Four", "Five or More")))
dat_split_mod
dat_split %>%
arrange(num_tickets, num_winners) %>%
group_by(num_tickets) %>%
summarize(prob_sum = sum(prob))
## `summarise()` ungrouping output (override with `.groups` argument)
Going to graph this with:
The latter one are for didactic/explanatory purposes.
dat_split_mod %>%
ggplot(aes(x = num_tickets,
y = prob,
color = num_winners)) +
geom_line(size = 1.25,
alpha = 0.5) +
scale_x_continuous(labels = seq(0, 1000, 100),
breaks = seq(0, 1000, 100)*1e6) +
scale_y_continuous(breaks = seq(0, 1, 0.1)) +
scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
labs(x = "Number of Tickets Sold \n (millions)",
y = "Probability of This Many Winners",
title = "Odds of a Split Pot v. Number of Tickets Sold",
color = "Numbers of Winners") +
theme_bw() +
theme(axis.text.x = element_text(angle = 0,
size = 10),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 12),
axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
size = 12),
plot.title = element_text(hjust = 0.5,
size = 15),
legend.position = "right")
dat_split_mod %>%
filter(num_winners == "None") %>%
ggplot(aes(x = num_tickets,
y = prob,
color = num_winners)) +
geom_line(size = 1.25,
alpha = 0.5) +
scale_x_continuous(labels = seq(0, 1000, 100),
breaks = seq(0, 1000, 100)*1e6) +
scale_y_continuous(breaks = seq(0, 1, 0.1)) +
scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
labs(x = "Number of Tickets Sold \n (millions)",
y = "Probability of This Many Winners",
title = "",
color = "Numbers of Winners") +
theme_bw() +
theme(axis.text.x = element_text(angle = 0,
size = 10),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 12),
axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
size = 12),
plot.title = element_text(hjust = 0.5,
size = 15),
legend.position = "right")
dat_split_mod %>%
filter(num_winners == "None" | num_winners == "One") %>%
ggplot(aes(x = num_tickets,
y = prob,
color = num_winners)) +
geom_line(size = 1.25,
alpha = 0.5) +
scale_x_continuous(labels = seq(0, 1000, 100),
breaks = seq(0, 1000, 100)*1e6) +
scale_y_continuous(breaks = seq(0, 1, 0.1),
limits = c(0, 1)) +
scale_color_manual(values = c("red", "green4")) +
labs(x = "Number of Tickets Sold \n (millions)",
y = "Probability of This Many Winners",
title = "",
color = "Numbers of Winners") +
theme_bw() +
theme(axis.text.x = element_text(angle = 0,
size = 10),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 12),
axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
size = 12),
plot.title = element_text(hjust = 0.5,
size = 15),
legend.position = "right")
plot_3.6 <-
dat_split_mod %>%
ggplot(aes(x = num_tickets,
y = prob,
color = num_winners)) +
geom_line(size = 1.25,
alpha = 0.5) +
scale_x_continuous(labels = seq(0, 1000, 100),
breaks = seq(0, 1000, 100)*1e6) +
scale_y_continuous(breaks = seq(0, 1, 0.1)) +
scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
labs(x = "Number of Tickets Sold \n (millions)",
y = "Probability of This Number of Winners",
title = "Odds of a Split Pot v. Number of Tickets Sold",
color = "Numbers of Winners") +
annotate("rect",
xmin = min(lott_clean$sales),
xmax = quantile(lott_clean$sales, prob = 0.50),
ymin = 0,
ymax = 1,
fill = "blue",
alpha = 0.2) +
annotate("rect",
xmin = min(lott_clean$sales),
xmax = quantile(lott_clean$sales, prob = 0.90),
ymin = 0,
ymax = 1,
fill = "blue",
alpha = 0.2) +
annotate("rect",
xmin = min(lott_clean$sales),
xmax = quantile(lott_clean$sales, prob = 0.99),
ymin = 0,
ymax = 1,
fill = "blue",
alpha = 0.2) +
annotate("text",
x = quantile(lott_clean$sales, prob = 0.50),
y = 0.5,
label = "50th percentile",
angle = 90,
size = 4) +
annotate("text",
x = quantile(lott_clean$sales, prob = 0.90),
y = 0.5,
label = "90th percentile",
angle = 90,
size = 4) +
annotate("text",
x = quantile(lott_clean$sales, prob = 0.99),
y = 0.5,
label = "99th percentile",
angle = 90,
size = 4) +
geom_vline(xintercept = max(lott_clean$sales)) +
annotate("text",
x = max(lott_clean$sales) + 5e6,
y = 0.5,
label = "maximum sales ever",
angle = 90,
size = 4) +
geom_vline(xintercept = min(lott_clean$sales)) +
annotate("text",
x = min(lott_clean$sales) - 10e6,
y = 0.5,
label = "minimum sales ever",
angle = 90,
size = 4) +
theme_bw() +
theme(axis.text.x = element_text(angle = 0,
size = 10),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 12),
axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
size = 12),
plot.title = element_text(hjust = 0.5,
size = 15),
legend.position = "right")
plot_3.6
if (plot_save){
ggsave(plot = plot_3.6,
filename = "split_pot_overlay.pdf",
width = 15,
height = 7)
}
The goal here is to figure out the cash value of various jackpots, from their advertised (i.e. annuity) amounts. Can’t find a super clear answer (at least to me on this). Below are the more details written state rules/laws for the lottery.
But honestly, I don’t really understand these, so will just look at historical records of jackpot size, and compare advertised (annuity) amount of the jackpot, to the stated equivalent cash prize.
Used, wayback machine for this:
(Seems like the date of the screenshots are kind of out of wack with the date of the drawing. But when you click on the link, it gives you jackpot, cash, and date of drawing, so those are all correctly linked. And that is what I am recording, date of drawing, not date of screenshot.)
Method: clicked through at least one screenshot on each month, in the years 2021 to 2017 (only up to October 31, when rules changed). Not every month, while having a screen shot had a unique drawing value. So just, noted where there was a unique drawing value. And noted that. Not perfectly rigorously. Fully rigorous method could do a full scrape from here.
cash <-
rbind(
c(25e6, 18.6e6, "2021-01-29"),
c(20e6, 14.7e6, "2021-01-26"),
c(850e6, 628.2e6, "2021-01-19"),
c(510e6, 377.4e6, "2021-01-08"),
c(401e6, 305.4e6, "2021-01-01"),
c(310e6, 238e6, "2020-12-18"),
c(57e6, 45.4e6, "2020-08-25"),
c(22e6, 18e6, "2020-08-07"),
c(101e6, 76.5e6, "2020-03-24"),
c(65e6, 48.6e6, "2020-03-03"),
c(202e6, 142e6, "2020-02-11"),
c(266e6, 182.3e6, "2019-12-03"),
c(418e6, 263.3e6, "2019-05-28"),
c(316e6, 195.5e6, "2019-05-14"),
c(305e6, 181e6, "2018-12-18"),
c(208e6, 119e6, "2018-12-04"),
c(122e6, 69e6, "2018-11-16"),
c(106e6, 59e6, "2018-11-13"),
c(90e6, 50e6, "2018-11-09"),
c(52e6, 29e6, "2018-11-02"),
c(40e6, 22e6, "2018-10-26"),
c(45e6, 25e6, "2018-10-30"),
c(1600e6, 913e6, "2018-10-23"),
c(1000e6, 565e6, "2018-10-19"),
c(470e6, 265e6, "2018-10-09"),
c(405e6, 235e6, "2018-10-05"),
c(367e6, 213e6, "2018-10-02"),
c(252e6, 148e6, "2018-09-18"),
c(187e6, 111e6, "2018-09-07"),
c(88e6, 52e6, "2018-08-17"),
c(50e6, 29e6, "2018-08-03"),
c(306e6, 185e6, "2018-07-10"),
c(144e6, 85e6, "2018-06-12"),
c(84e6, 49e6, "2018-05-29"),
c(143e6, 84e6, "2018-05-04"),
c(80e6, 47e6, "2018-04-20"),
c(40e6, 24e6, "2018-04-03"),
c(502e6, 301e6, "2018-03-30"),
c(318e6, 187e6, "2018-03-13"),
c(290e6, 172e6, "2018-03-09"),
c(243e6, 143e6, "2018-03-02"),
c(204e6, 120e6, "2018-02-23"),
c(185e6, 109e6, "2018-02-20"),
c(136e6, 81e6, "2018-02-09"),
c(63e6, 38e6, "2018-01-23"),
c(50e6, 30e6, "2018-01-16"),
c(40e6, 25e6, "2018-01-09"),
c(361e6, 225e6, "2018-01-02"),
c(306e6, 191e6, "2017-12-29"),
c(277e6, 172e6, "2017-12-26"),
c(247e6, 155e6, "2017-12-22"),
c(208e6, 130e6, "2017-12-15"),
c(160e6, 100e6, "2017-12-05"),
c(145e6, 91e6, "2017-12-01"),
c(106e6, 66e6, "2017-11-21"),
c(82e6, 51e6, "2017-11-14"),
c(71e6, 44e6, "2017-11-10"),
c(59e6, 36e6, "2017-11-07"),
c(48e6, 29e6, "2017-11-03")
)
# name columns
colnames(cash) <- c("advertised", "cash", "date")
# reshape data
cash <-
data.frame(cash) %>%
mutate(advertised = as.integer(advertised),
cash = as.integer(cash),
date = as.Date(date),
perc = cash/advertised*100)
# view data
cash
coeff <- max(cash$advertised)/max(cash$perc)
color_advertised <- "dodgerblue4"
color_perc <- "black"
plot_4.3 <-
cash %>%
ggplot(aes(x = date)) +
geom_line(aes(y = advertised),
color = color_advertised,
size = 2,
alpha = 0.5) +
geom_point(aes(y = perc * coeff),
color = color_perc) +
geom_line(aes(y = perc * coeff),
color = color_perc,
size = 1,
alpha = 0.5) +
scale_x_date(name = "Date",
date_labels = "%Y (%b)",
date_breaks = "1 month") +
scale_y_continuous(labels = scales::label_dollar(),
breaks = c(0, 1:17*100e6),
name = "Advertised Jackpot Value (Annuity)",
sec.axis = sec_axis(trans = ~.*(1/coeff),
name = "Cash Value (as % of Advertised Jackpot)",
breaks = seq(0, 100, 10))) +
labs(title = "Advertised Jackpot Size & Cash Value of Jackpot, Over Time") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y.left = element_text(color = color_advertised, size = 10),
axis.title.y.left = element_text(color = color_advertised),
axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10)),
axis.text.x = element_text(angle = 90),
panel.grid.minor = element_blank())
plot_4.3
## 4.3. Save
if (plot_save){
ggsave(plot = plot_4.3,
filename = "cash_jackpot.pdf",
width = 15,
height = 7)
}
coeff <- max(cash$advertised)/max(cash$perc)
color_advertised <- "black"
color_perc <- "steelblue3" #"dodgerblue4"
plot_4.4 <-
cash %>%
pivot_longer(cols = c("advertised", "cash"),
names_to = "claim_method",
values_to = "value") %>%
mutate(claim_method = case_when(claim_method == "advertised" ~ "Advertised/Annuity Value",
claim_method == "cash" ~ "Cash Value")) %>%
ggplot(aes(x = date)) +
geom_line(aes(y = value,
color = claim_method),
size = 2,
alpha = 0.5) +
geom_point(aes(y = perc * coeff),
color = color_perc) +
geom_line(aes(y = perc * coeff),
color = color_perc,
size = 1,
alpha = 0.5) +
scale_x_date(name = "Date",
date_labels = "%Y",
date_breaks = "1 year") +
scale_y_continuous(labels = c("$0",
paste0("$", seq(1, 9, 1)*100, "m"),
paste0("$", seq(1, 1.6, 0.1), "b")),
breaks = c(0, 1:16*100e6),
name = "Jackpot Value",
sec.axis = sec_axis(trans = ~.*(1/coeff),
name = "Cash Value (as % of Advertised/Annuity Value)",
breaks = seq(0, 100, 10))) +
scale_color_manual(values = c("green4", "darkorchid3")) +
labs(title = "Jackpot Size, Over Time",
color = "") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 15),
axis.text.y.left = element_text(color = color_advertised,
size = 10),
axis.text.y.right = element_text(color = color_perc,
size = 10),
axis.title.y.left = element_text(color = color_advertised,
margin = margin(t = 0, r = 10, b = 0, l = 0),
size = 15),
axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10),
size = 15,
color = color_perc),
axis.ticks.y.right = element_line(color = color_perc),
axis.text.x = element_text(angle = 0),
panel.grid.minor = element_blank(),
legend.position = "top")
plot_4.4
## 4.4. Save
if (plot_save){
ggsave(plot = plot_4.4,
filename = "cash_jackpot_bothlines.pdf",
width = 15,
height = 7)
}
quantile(x = cash$perc,
probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
## 0% 1% 5% 10% 25% 50% 75% 90%
## 55.00000 55.32222 55.64990 56.47660 58.73571 60.00000 62.62652 74.47385
## 95% 99% 100%
## 76.22106 80.56013 81.81818
cash %>%
mutate(year = year(date)) %>%
group_by(year) %>%
summarise(median = median(perc))
## `summarise()` ungrouping output (override with `.groups` argument)
This is where I will set coefficient (proportion) to multiply advertised (annuity) jackpot values by. Since I don’t have the same number of observations for every year, I will just take the mean of the median cash percent for each year.
cash_perc <-
cash %>%
mutate(year = year(date)) %>%
group_by(year) %>%
summarize(median = median(perc)) %>%
summarize(mean_median = mean(median))
## `summarise()` ungrouping output (override with `.groups` argument)
cash_perc
cash_coeff <- cash_perc$mean_median/100
cash_coeff
## [1] 0.668581
Note: I used kind of recursive functions to calculate taxes here. This is not the most computationally efficient thing to do here. (For any income, you should only need to do 2 operations: add fixed sum to pay from brackets below and proportion for top bracket. But that at least requires a one time cost of calculating the fixed values at every bracket. This recursive function (1) is more fun to think about, and (2) avoid having to calculate and manually type in the fixed sum for any given bracket. I just typed in the income brackets.)
# recursive function to calculate taxes
fed_taxes <- function(income){
if (income <= 0) {
return(0)
}
if (income > 518400){
return(0.37 * (income - 518400) + fed_taxes(518400))
}
if (income <= 518400 & income > 207350){
return(0.35 * (income - 207350) + fed_taxes(207350))
}
if (income <= 207350 & income > 163300){
return(0.32 * (income - 163300) + fed_taxes(163300))
}
if (income <= 163300 & income > 85525){
return(0.24 * (income - 85525) + fed_taxes(85525))
}
if (income <= 85525 & income > 40125){
return(0.22 * (income - 40125) + fed_taxes(40125))
}
if (income <= 40125 & income > 9875){
return(0.12 * (income - 9875) + fed_taxes(9875))
}
if (income <= 9875 & income > 0){
return(0.10 * income)
}
}
Test cases for function (long endless scroll of output not displayed in Rmd).
fed_taxes(-100)
fed_taxes(0)
fed_taxes(1)
fed_taxes(9874)
fed_taxes(9875)
fed_taxes(9876)
fed_taxes(40124)
fed_taxes(40125)
fed_taxes(40126)
fed_taxes(85524)
fed_taxes(85525)
fed_taxes(85526)
fed_taxes(163299)
fed_taxes(163300)
fed_taxes(163301)
fed_taxes(207349)
fed_taxes(207350)
fed_taxes(207351)
fed_taxes(518399)
fed_taxes(518400)
fed_taxes(518401)
fed_taxes(1e6)
fed_taxes(10e6)
fed_taxes(100e6)
fed_taxes(1000e6)
fed_taxes(50e3)
fed_taxes(100e3)
Initially was going to calculate taxes for California residents. BUt apparently California residents are exempt from paying state income tax on lottery winnings. Sources:
Going to implement state income tax rates from New York:
California Tax Brackets (for reference):
# recursive function to calculate taxes
state_taxes <- function(income){
state <- "NY" # create internal state variable (not passed to function bc then have to rewrite a bunch of other code below)
if (state == "NY"){
if (income <= 0) {
return(0)
}
if (income > 1077550){
return((0.0882) * (income - 1077550) + state_taxes(1077550))
}
if (income <= 1077550 & income > 215400){
return(0.0685 * (income - 215400) + state_taxes(215400))
}
if (income <= 215400 & income > 80650){
return(0.0641 * (income - 80650) + state_taxes(80650))
}
if (income <= 80650 & income > 21400){
return(0.0609 * (income - 21400) + state_taxes(21400))
}
if (income <= 21400 & income > 13900){
return(0.059 * (income - 13900) + state_taxes(13900))
}
if (income <= 13900 & income > 11700){
return(0.0525 * (income - 11700) + state_taxes(11700))
}
if (income <= 11700 & income > 8500){
return(0.045 * (income - 8500) + state_taxes(8500))
}
if (income <= 8500 & income > 0){
return(0.04 * income)
}
}
# just leaving this here, in case I ever want to use it
if (state == "CA"){
if (income <= 0) {
return(0)
}
if (income > 1e6){
return((0.123+0.01) * (income - 1e6) + state_taxes(1e6))
}
if (income <= 1e6 & income > 599012){
return(0.123 * (income - 599012) + state_taxes(599012))
}
if (income <= 599012 & income > 359407){
return(0.113 * (income - 359407) + state_taxes(359407))
}
if (income <= 359407 & income > 299508){
return(0.103 * (income - 299508) + state_taxes(299508))
}
if (income <= 299508 & income > 58634){
return(0.093 * (income - 58634) + state_taxes(58634))
}
if (income <= 58634 & income > 46394){
return(0.08 * (income - 46394) + state_taxes(46394))
}
if (income <= 46394 & income > 33421){
return(0.06 * (income - 33421) + state_taxes(33421))
}
if (income <= 33421 & income > 21175){
return(0.04 * (income - 21175) + state_taxes(21175))
}
if (income <= 21175 & income > 8932){
return(0.02 * (income - 8932) + state_taxes(8932))
}
if (income <= 8932 & income > 0){
return(0.01 * income)
}
}
}
Test cases for function (long endless scroll of output not displayed in Rmd).
state_taxes(-10)
state_taxes(0)
state_taxes(8499)
state_taxes(8500)
state_taxes(8501)
state_taxes(11699)
state_taxes(11700)
state_taxes(11701)
state_taxes(13899)
state_taxes(13900)
state_taxes(13901)
state_taxes(21399)
state_taxes(21400)
state_taxes(21401)
state_taxes(80649)
state_taxes(80650)
state_taxes(80651)
state_taxes(215399)
state_taxes(215400)
state_taxes(215401)
state_taxes(1077549)
state_taxes(1077550)
state_taxes(1077551)
state_taxes(2e6)
state_taxes(10e6)
state_taxes(100e6)
state_taxes(1000e6)
state_taxes(87701-4601)
Here I will look at some basic stats about how much the jackpot is reduced by taxes and by the cash discount.
jackpot_tax_check <-
lott_clean %>%
mutate(jackpot_taxed = jackpot - fed_taxes(jackpot) - state_taxes(jackpot),
jackpot_taxed_prop = jackpot_taxed / jackpot,
jackpot_both = (jackpot*cash_coeff) - fed_taxes(jackpot*cash_coeff) - state_taxes(jackpot*cash_coeff),
jackpot_both_prop = jackpot_both / jackpot,
jackpot_wrong_way = jackpot_taxed*cash_coeff,
jackpot_wrong_way_prop = jackpot_wrong_way/jackpot)
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income > 518400) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income > 1077550) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income > 518400) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income > 1077550) {: the condition has length > 1 and only the
## first element will be used
jackpot_tax_check
# just taxes
quantile(x = jackpot_tax_check$jackpot_taxed_prop,
probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
## 0% 1% 5% 10% 25% 50% 75% 90%
## 0.5418379 0.5418894 0.5419365 0.5419646 0.5420561 0.5422643 0.5427035 0.5430948
## 95% 99% 100%
## 0.5432567 0.5446207 0.5447134
# just taxes
mean(jackpot_tax_check$jackpot_taxed_prop)
## [1] 0.5424479
# cash and then taxes
quantile(x = jackpot_tax_check$jackpot_both_prop,
probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
## 0% 1% 5% 10% 25% 50% 75% 90%
## 0.3622751 0.3623266 0.3623736 0.3624018 0.3624933 0.3627015 0.3631407 0.3635320
## 95% 99% 100%
## 0.3636939 0.3650579 0.3651506
# cash and then taxes
mean(jackpot_tax_check$jackpot_both_prop)
## [1] 0.362885
The goal here is to calculate the expected value of the others prizes in the lottery
Odds for smaller prizes can be found here:
# total possible ticket combinations
tick_combos <- choose(n = 70, k = 5) * 25
# odds of winning each of the smaller prizes
odds_5.0 <- choose(n = 5, k = 5) * choose(n = 24, k = 1) / tick_combos # ~(1/12607306)
odds_4.1 <- choose(n = 5, k = 4) * choose(n = 65, k = 1) * choose(n = 1, k = 1) / tick_combos # ~(1/931001)
odds_4.0 <- choose(n = 5, k = 4) * choose(n = 65, k = 1) * choose(n = 24, k = 1) / tick_combos # ~(1/38792)
odds_3.1 <- choose(n = 5, k = 3) * choose(n = 65, k = 2) * choose(n = 1, k = 1) / tick_combos # ~(1/14547)
odds_3.0 <- choose(n = 5, k = 3) * choose(n = 65, k = 2) * choose(n = 24, k = 1) / tick_combos # ~(1/606)
odds_2.1 <- choose(n = 5, k = 2) * choose(n = 65, k = 3) * choose(n = 1, k = 1) / tick_combos # ~(1/693)
odds_1.1 <- choose(n = 5, k = 1) * choose(n = 65, k = 4) * choose(n = 1, k = 1) / tick_combos # ~(1/89)
odds_0.1 <- choose(n = 5, k = 0) * choose(n = 65, k = 5) * choose(n = 1, k = 1) / tick_combos # ~(1/37)
# calculate additional expected value of ticket, from small prizes
# method 1. without taxes
other_prizes <- 1e6 * odds_5.0 +
10e3 * odds_4.1 +
500 * odds_4.0 +
200 * odds_3.1 +
10 * odds_3.0 +
10 * odds_2.1 +
4 * odds_1.1 +
2 * odds_0.1
other_prizes
## [1] 0.2469817
# method 2. with taxes
other_prizes_taxes <- (1e6 - fed_taxes(1e6) - state_taxes(1e6)) * odds_5.0 +
(10e3 - fed_taxes(10e3) - state_taxes(10e3)) * odds_4.1 +
(500 - fed_taxes(500) - state_taxes(500)) * odds_4.0 +
(200 - fed_taxes(200) - state_taxes(200)) * odds_3.1 +
(10 - fed_taxes(10) - state_taxes(10)) * odds_3.0 +
(10 - fed_taxes(10) - state_taxes(10)) * odds_2.1 +
(4 - fed_taxes(4) - state_taxes(4)) * odds_1.1 +
(2 - fed_taxes(2) - state_taxes(2)) * odds_0.1
other_prizes_taxes
## [1] 0.1916548
The expected value is a function of:
Here, I try to identify how each of these affect the expected value of a ticket. And then graph those effects
The goal here is to compute and then graph expected value of a ticket (EV) as function of:
Must also select max # winners to cycle through (picked: 100).
Note: will compute
# preset value, to loop through & use
jackpots <- c(1, 1e3, 1e6, 10e6, 20e6, 40e6, 50e6, 75e6, 100e6, 125e6, 150e6, 175e6, 200e6, 300e6, 400e6, 500e6, 600e6,
700e6, 800e6, 900e6, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9, 1.6e9, 1.7e9, 1.8e9, 1.9e9, 2e9, 5e9, 10e9)
tickets <- c(1, 10, 1e3, 10e3, 100e3, 1e6, 10e6, 15e6, 20e6, 25e6, 30e6, 35e6, 40e6, 50e6, 100e6, 200e6, 500e6, 750e6, 1e9, 2e9, 5e9, 10e9)
max_winners <- 100
splits <- c(TRUE, FALSE)
total_EVs <- length(jackpots) * length(tickets) * length(splits)
total_EVs
## [1] 1452
# data frame, to save results
EV <- data.frame(jackpot = integer(total_EVs),
tickets = integer(total_EVs),
split = logical(total_EVs),
ev_jj = double(total_EVs),
ev_full = double(total_EVs),
ev_full_cash = double(total_EVs),
ev_full_taxes = double(total_EVs))
# loop and calculate all possible EVs
i <- 0
for (jackpot_i in jackpots){
for (tickets_i in tickets){
for (split_i in splits){
i <- i + 1
EV$jackpot[i] <- jackpot_i
EV$tickets[i] <- tickets_i
EV$split[i] <- split_i
EV$ev_jj[i] <- ev_split(jackpot = jackpot_i,
tickets = tickets_i,
max_winners = max_winners,
prob_win = (1/302575350),
method = "sum",
split = split_i)
EV$ev_full[i] <- EV$ev_jj[i] + other_prizes
EV$ev_full_cash[i] <- ev_split(jackpot = jackpot_i * cash_coeff,
tickets = tickets_i,
max_winners = max_winners,
prob_win = (1/302575350),
method = "sum",
split = split_i) + other_prizes
EV$ev_full_taxes[i] <- ev_split(jackpot = (jackpot_i * cash_coeff) -
fed_taxes(jackpot_i * cash_coeff) -
state_taxes(jackpot_i * cash_coeff),
tickets = tickets_i,
max_winners = max_winners,
prob_win = (1/302575350),
method = "sum",
split = split_i) + other_prizes_taxes
}
}
}
EV
EV_long <-
EV %>%
pivot_longer(cols = c("ev_full", "ev_full_cash", "ev_full_taxes"),
names_to = "type",
values_to = "ev") %>%
mutate(tickets = factor(tickets,
levels = c(1, 10, 1e3, 10e3, 100e3, 1e6, 10e6, 15e6, 20e6, 25e6, 30e6, 35e6, 40e6, 50e6, 100e6,
200e6, 500e6, 750e6, 1e9, 2e9, 5e9, 10e9),
labels = c("1", "10", "1k", "10k", "100k", "1m", "10m", "15m", "20m", "25m", "30m", "35m", "40m", "50m", "100m",
"200m", "500m", "750m", "1b", "2b", "5b", "10b")),
jackpot = factor(jackpot,
levels = c(1, 1e3, 1e6, 10e6, 20e6, 40e6, 50e6, 75e6, 100e6, 125e6, 150e6, 175e6, 200e6, 300e6,
400e6, 500e6, 600e6, 700e6, 800e6, 900e6, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9,
1.6e9, 1.7e9, 1.8e9, 1.9e9, 2e9, 5e9, 10e9),
labels = c("$1", "$1k", "$1m", "$10m", "$20m", "$40m", "$50m", "$75m", "$100m", "$125m", "$150m",
"$175m", "$200m", "$300m", "$400m", "$500m", "$600m", "$700m", "$800m", "$900m",
"$1b", "$1.1b", "$1.2b", "$1.3b", "$1.4b", "$1.5b", "$1.6b", "$1.7b", "$1.8b", "$1.9b",
"$2b", "$5b", "$10b")),
split = factor(split,
levels = c(FALSE, TRUE),
labels = c("Jackpots Never Split :)", "Jackpots Split :(")),
type = factor(type,
levels = c("ev_full", "ev_full_cash", "ev_full_taxes"),
labels = c("Annuity Value (No Taxes)", "Cash Value (No Taxes)", "Cash Value (Fed & State Taxes)")),
ev_print = round(ev, 2),
sign = case_when(ev_print <2 ~ "negative",
ev_print == 2 ~ "zero",
ev_print >2 ~ "positive"),
sign = factor(sign,
levels = c("negative", "zero", "positive"))) %>%
group_by(sign) %>%
mutate(ev_max_sign = max(ev_print)^(1/4), # note: raising to root, done to squish together values
ev_min_sign = min(ev_print)^(1/3)) %>%
ungroup() %>%
mutate(alpha = case_when(sign == "positive" ~ 0.4 + 0.6 * ((ev_print^(1/4) - ev_min_sign) / ev_max_sign), # add some baseline alpha
sign == "negative" ~ 0.4 + 0.6 * (1 - ((ev_print^(1/3) - ev_min_sign) / ev_min_sign)),
sign == "zero" ~ 1))
plot_7.2 <-
EV_long %>%
ggplot(aes(y = tickets,
x = jackpot,
fill = sign,
label = ev_print)) +
geom_raster(alpha = EV_long$alpha) +
geom_text(color = "white",
size = 2) +
geom_rect(aes(xmin = 5 - 0.5, #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
ymin = 8 - 0.5, #which(tickets %in% 15e6), #min(lott_clean$sales)
ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
fill = NA),
color = "white") +
scale_fill_manual(values = c("red2", "green4", "grey")) +
labs(x = "Ticket Sales",
y = "Advertised Jackpot Size ($)",
title = "Expected Value of A Lottery Ticket") +
guides(fill = FALSE) +
facet_grid(split ~ type) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90,
vjust = 0.25),
axis.text.y = element_text(vjust = 0.25),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
plot_7.2
if (plot_save){
ggsave(plot = plot_7.2,
filename = "ev_theoretical_wide.pdf",
width = 20,
height = 10)
}
EV_long_filtered <-
EV_long %>%
filter(split == "Jackpots Split :(" & type == "Cash Value (Fed & State Taxes)")
EV_long_filtered %>%
ggplot(aes(y = tickets,
x = jackpot,
fill = sign,
label = format(ev_print, digits = 2))) +
geom_raster(alpha = EV_long_filtered$alpha) +
geom_text(color = "white",
size = 3.5) +
geom_rect(aes(xmin = 5 - 0.5, #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
ymin = 8 - 0.5, #which(tickets %in% 15e6), #min(lott_clean$sales)
ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
fill = NA),
color = "white") +
scale_fill_manual(values = c("red2", "green4", "grey")) +
labs(y = "Ticket Sales",
x = "Advertised Jackpot Size",
title = paste0("Expected Value of Lottery Ticket (in $), Given:",
"\n (a) Jackpot Size, ",
" (b) Number of Tickets Sold",
"\n (accounting for taxes & cash discount)")) +
guides(fill = FALSE) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90,
vjust = 0.25,
size = 12),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 14),
axis.text.y = element_text(vjust = 0.25,
size = 11),
axis.title.y = element_text(size = 14),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
EV_long_filtered <-
EV_long %>%
filter(split == "Jackpots Split :(" & type == "Cash Value (Fed & State Taxes)")
EV_long_filtered %>%
ggplot(aes(y = tickets,
x = jackpot,
fill = sign,
label = format(ev_print, digits = 2))) +
geom_raster(alpha = EV_long_filtered$alpha) +
geom_text(color = "white",
size = 3.5) +
# cross-hairs
geom_segment(aes(x = 0, xend = 20.5, y = 11, yend = 11), linetype = "dotted", size = 1) +
geom_segment(aes(x = 21.5, xend = length(jackpots),
y = 11, yend = 11), linetype = "dotted", size = 1) +
geom_segment(aes(x = 21, xend = 21, y = 0, yend = 10.5), linetype = "dotted", size = 1) +
geom_segment(aes(x = 21, xend = 21, y = 11.5, yend = 22), linetype = "dotted", size = 1) +
geom_rect(aes(xmin = 20.5,
xmax = 21.5,
ymin = 10.5,
ymax = 11.5,
fill = NA),
color = "black",
linetype = "dotted",
size = 1) +
geom_rect(aes(xmin = 5 - 0.5, #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
ymin = 8 - 0.5, #which(tickets %in% 15e6), #min(lott_clean$sales)
ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
fill = NA),
color = NA) + #HACKY SOLUTION: just make it invisible (colors get messed up for some reason otherwise)
scale_fill_manual(values = c("red2", "green4", "grey")) +
labs(y = "Ticket Sales",
x = "Advertised Jackpot Size",
title = paste0("Expected Value of Lottery Ticket (in $), Given:",
"\n (a) Jackpot Size, ",
" (b) Number of Tickets Sold",
"\n (after accounting for taxes & cash discount)")) +
guides(fill = FALSE) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90,
vjust = 0.25,
size = 12),
axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
size = 14),
axis.text.y = element_text(vjust = 0.25,
size = 11),
axis.title.y = element_text(size = 14),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# shape data
EV_time <-
lott_clean %>%
rowwise() %>%
mutate(ev_advert_ns = ev_split(jackpot = jackpot,
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = FALSE) + other_prizes,
ev_cash_ns = ev_split(jackpot = jackpot * cash_coeff,
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = FALSE) + other_prizes,
ev_taxes_ns = ev_split(jackpot = ((jackpot * cash_coeff) - fed_taxes(jackpot * cash_coeff) - state_taxes(jackpot * cash_coeff)),
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = FALSE) + other_prizes_taxes,
ev_advert_s = ev_split(jackpot = jackpot,
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = TRUE) + other_prizes,
ev_cash_s = ev_split(jackpot = jackpot * cash_coeff,
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = TRUE) + other_prizes,
ev_taxes_s = ev_split(jackpot = ((jackpot * cash_coeff) - fed_taxes(jackpot * cash_coeff) - state_taxes(jackpot * cash_coeff)),
tickets = sales,
max_winners = 100,
prob_win = (1/302575350),
method = "sum",
split = TRUE) + other_prizes) %>%
pivot_longer(cols = contains("ev_"),
names_to = "ev_type",
values_to = "ev") %>%
mutate(split = case_when(ev_type %in% c("ev_advert_ns", "ev_cash_ns", "ev_taxes_ns") ~ FALSE,
ev_type %in% c("ev_advert_s", "ev_cash_s", "ev_taxes_s") ~ TRUE),
ev_calc = case_when(ev_type %in% c("ev_advert_ns", "ev_advert_s") ~ "advertised",
ev_type %in% c("ev_cash_ns", "ev_cash_s") ~ "cash",
ev_type %in% c("ev_taxes_ns", "ev_taxes_s") ~ "taxes"))
EV_time
plot_7.3 <-
EV_time %>%
ggplot(aes(x = as.Date(date),
y = ev,
color = ev_calc)) +
geom_line() +
geom_hline(yintercept = 2) +
scale_x_date(name = "Date",
date_labels = "%Y") +
scale_y_continuous(name = "Expected Value") +
labs(title = "Expected Value, Over Time") +
facet_grid(.~split) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
plot_7.3
EV_mean_draw <- mean(EV_time[EV_time$ev_type == "ev_taxes_s",]$ev)
EV_mean_draw
## [1] 0.4321589
quantile(x = EV_time[EV_time$ev_type == "ev_taxes_s",]$ev,
probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
## 0% 1% 5% 10% 25% 50% 75% 90%
## 0.2703832 0.2712707 0.2933206 0.2991337 0.3219430 0.3914926 0.5053136 0.6349851
## 95% 99% 100%
## 0.6984200 0.8124424 0.9341472
sum((EV_time[EV_time$ev_type == "ev_taxes_s",]$ev * EV_time[EV_time$ev_type == "ev_taxes_s",]$sales))/
sum(EV_time[EV_time$ev_type == "ev_taxes_s",]$sales)
## [1] 0.5493451
This one is a little more annoying. We can’t just use the quantile() function (as far as I understand, without needing to create over 11 billion rows).
The plan here is to:
cumulative_sales <- sum(EV_time[EV_time$ev_type == "ev_taxes_s",]$sales)
df_temp <-
EV_time %>%
filter(ev_type == "ev_taxes_s") %>%
arrange(ev) %>%
mutate(sales_cumult = cumsum(sales),
sales_percentile = sales_cumult/cumulative_sales)
df_temp
EV_ticket_weighted_percentiles <- data.frame(percentile = c(0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100),
ev = c(df_temp[which.min(abs(df_temp$sales_percentile - 0.00)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.01)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.05)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.10)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.25)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.50)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.75)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.90)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.95)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 0.99)), ]$ev,
df_temp[which.min(abs(df_temp$sales_percentile - 1)), ]$ev))
EV_ticket_weighted_percentiles
Note, solution to having boxes appear outside of plot limits can be found here:
plot_ev_basic <-
EV_time %>%
filter(ev_type == "ev_taxes_s") %>%
ggplot(aes(x = jackpot,
y = ev)) +
# COLORED BOXES
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = -0.5,
ymax = 2,
fill = "red",
alpha = 0.2) +
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = 2,
ymax = 3.5,
fill = "green",
alpha = 0.2) +
# DOWN ARROW
annotate(geom = "text",
x = 800e6,
y = 1.92,
label = "Losing Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 800e6,
y = 1.85,
xend = 800e6,
yend = 1.6),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed")) +
# UP ARROW
annotate(geom = "text",
x = 800e6,
y = 2.08,
label = "Winning Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 800e6,
y = 2.15,
xend = 800e6,
yend = 2.4),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed")) +
# AVERAGE EV
geom_segment(aes(x = 50e6,
y = EV_mean_draw,
xend = -50e6,
yend = EV_mean_draw),
arrow = arrow(length = unit(0.2, "cm"),
type = "open")) +
annotate(geom = "text",
x = 0,
y = EV_mean_draw + 0.3,
label = paste0("Average \n Expected \n Value \n ($", round(EV_mean_draw, 2), ")"),
angle = 0,
size = 4) +
# DATA
geom_point(size = 3,
alpha = 0.25) +
geom_line(size = 0.85,
alpha = 0.5) +
geom_hline(yintercept = 2,
color = "black",
size = 1,
linetype = "dotted") +
# AXES
scale_y_continuous(breaks = seq(0, 3, 0.25),
labels = scales::label_dollar()) +
scale_x_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
breaks = c(0, 1:16*100e6)) +
coord_cartesian(xlim = c(0, 1.6e9),
ylim = c(0, 3)) +
# LABELS
labs(title = paste0("Expected Value v. Jackpot Size",
"\n after accounting for ",
"(a) taxes, ",
"(b) cash value discount, ",
"(c) odds of a split pot"),
y = "Expected Value",
x = "Advertised Jackpot Size") +
# STYLES
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 13,
margin = margin(t = 0, b = 10, l = 0, r = 0)),
axis.title.x = element_text(size = 13,
margin = margin(t = 10, b = 0, r = 0, l = 0)),
axis.title.y = element_text(size = 13,
margin = margin(t = 0, b = 0, r = 10, l = 0)))
plot_ev_basic
if (plot_save){
ggsave(plot = plot_ev_basic,
filename = "Expected_Value_Basic.pdf",
width = 12,
height = 8)
}
plot_ev_comp3 <-
EV_time %>%
filter(ev_type == "ev_advert_s" | ev_type == "ev_cash_s" | ev_type == "ev_taxes_s") %>%
mutate(ev_type = case_when(ev_type == "ev_advert_s" ~ "As Advertised (No Cash Discount, No Taxes)",
ev_type == "ev_cash_s" ~ "With Cash Discount Only",
ev_type == "ev_taxes_s" ~ "With Taxes & Cash Discount"),
ev_type = factor(ev_type,
levels = c("As Advertised (No Cash Discount, No Taxes)",
"With Cash Discount Only (No Taxes)",
"With Taxes & Cash Discount"))) %>%
ggplot(aes(x = jackpot,
y = ev,
color = ev_type)) +
# COLORED BOXES
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = -0.5,
ymax = 2,
fill = "red",
alpha = 0.2) +
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = 2,
ymax = ceiling(max(EV_time$ev)),
fill = "green",
alpha = 0.2) +
# DOWN ARROW
annotate(geom = "text",
x = 150e6,
y = 1.92,
label = "Losing Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 150e6,
y = 1.85,
xend = 150e6,
yend = 1.6),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "black") +
# UP ARROW
annotate(geom = "text",
x = 150e6,
y = 2.1,
label = "Winning Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 150e6,
y = 2.15,
xend = 150e6,
yend = 2.4),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "black") +
# DATA
geom_point(size = 3,
alpha = 0.25) +
geom_line(size = 0.85,
alpha = 0.5) +
geom_hline(yintercept = 2,
color = "black",
size = 1,
linetype = "dotted") +
# AXES
scale_y_continuous(breaks = seq(0, ceiling(max(EV_time$ev)), 0.25),
labels = scales::label_dollar()) +
scale_x_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
breaks = c(0, 1:16*100e6)) +
scale_color_manual(values = c("blue4", "green4", "black")) +
coord_cartesian(xlim = c(0, 1.6e9),
ylim = c(0, 3)) +
# LABELS
labs(title = paste0("Expected Value v. Jackpot Size"),
y = "Expected Value",
x = "Advertised Jackpot Size",
color = "Expected Value Calculated:") +
# STYLES
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 13,
margin = margin(t = 0, b = 0, l = 0, r = 0)),
axis.title.x = element_text(size = 13,
margin = margin(t = 10, b = 0, r = 0, l = 0)),
axis.title.y = element_text(size = 13,
margin = margin(t = 0, b = 0, r = 10, l = 0)),
axis.text.x = element_text(angle = 0),
legend.position = "top")
plot_ev_comp3
## Warning: Removed 336 rows containing missing values (geom_point).
## Warning: Removed 336 row(s) containing missing values (geom_path).
if (plot_save){
ggsave(plot = plot_ev_comp3,
filename = "Expected_Value_Comp3.pdf",
width = 12,
height = 8)
}
plot_ev_panel <-
EV_time %>%
mutate(ev_calc = case_when(ev_calc == "advertised" ~ "As Advertised (No Cash Discount, No Taxes)",
ev_calc == "cash" ~ "With Cash Discount Only",
ev_calc == "taxes" ~ "With Taxes & Cash Discount"),
ev_calc = factor(ev_calc,
levels = c("As Advertised (No Cash Discount, No Taxes)",
"With Cash Discount Only (No Taxes)",
"With Taxes & Cash Discount")),
split = case_when(split == TRUE ~ "If Multiple Winners: Pot Gets Split",
split == FALSE ~ "If Multiple Winners: Each Person Gets Full Pot Value")) %>%
ggplot(aes(x = jackpot,
y = ev,
color = ev_calc)) +
# COLORED BOXES
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = -0.5,
ymax = 2,
fill = "red",
alpha = 0.2) +
annotate("rect",
xmin = -100e6,
xmax = 1.8e9,
ymin = 2,
ymax = ceiling(max(EV_time$ev)),
fill = "green",
alpha = 0.2) +
# DOWN ARROW
annotate(geom = "text",
x = 150e6,
y = 1.92,
label = "Losing Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 150e6,
y = 1.85,
xend = 150e6,
yend = 1.6),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "black") +
# UP ARROW
annotate(geom = "text",
x = 150e6,
y = 2.10,
label = "Winning Bet",
angle = 0,
size = 4) +
geom_segment(aes(x = 150e6,
y = 2.15,
xend = 150e6,
yend = 2.4),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "black") +
# DATA
geom_point(size = 3,
alpha = 0.25) +
geom_line(size = 0.85,
alpha = 0.5) +
geom_hline(yintercept = 2,
color = "black",
size = 1,
linetype = "dotted") +
# AXES
scale_y_continuous(breaks = seq(0, ceiling(max(EV_time$ev)), 0.25),
labels = scales::label_dollar()) +
scale_x_continuous(#labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
labels = c(paste0("$", c(0, 2.5, 5, 7.5)*100, "m"), paste0("$", c(1, 1.25, 1.5), "b")),
#breaks = c(0, 1:16*100e6)) +
breaks = c(0, 2.5, 5, 7.5, 10, 12.5, 15)*100e6) +
scale_color_manual(values = rep(c("blue4", "green4", "black"), 2)) +
coord_cartesian(xlim = c(0, 1.6e9),
ylim = c(0, max(EV_time$ev))) +
# PANELS
facet_grid(.~split) +
# LABELS
labs(title = paste0("Expected Value v. Jackpot Size"),
y = "Expected Value",
x = "Advertised Jackpot Size",
color = "Expected Value Calculated:") +
# STYLES
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,
size = 13,
margin = margin(t = 0, b = 0, l = 0, r = 0)),
axis.title.x = element_text(size = 13,
margin = margin(t = 10, b = 0, r = 0, l = 0)),
axis.title.y = element_text(size = 13,
margin = margin(t = 0, b = 0, r = 10, l = 0)),
axis.text.x = element_text(angle = 0),
legend.position = "top")
plot_ev_panel
## Warning: Removed 672 rows containing missing values (geom_point).
## Warning: Removed 672 row(s) containing missing values (geom_path).
## 7.10. Save
if (plot_save){
ggsave(plot = plot_ev_panel,
filename = "Expected_Value_Panel.pdf",
width = 12,
height = 8)
}
[note: need to amend this, given updated calculations]
Note there have only ever been 7 cases where a ticket has had an expected value higher than the cost of a ticket. And this is only in fantasy cases, where we change or relax the actual lottery rules and payout structures. This is 7 out of 2016 hypothetical drawings considered (336 drawins X 6 possible scenarios for each).
EV_time %>%
filter(ev > 2)
EV_time
EV_time %>%
filter(ev_type == "ev_taxes_s" & jackpot > 900e6)